⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 retrieval.cpp

📁 一个检索单个汉字、多个汉字和词组
💻 CPP
字号:
// retrieval.cpp: implementation of the retrieval class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "TestCorpus.h"
#include "retrieval.h"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

extern CString corpusName;
extern CStringArray texts;
extern CHzInfo hzInfo[6768];
extern int FindOneHZ(const char* str, const char *hz);
extern CString wordType(CString &w);

static CString RetrievedLine;
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

/* Constructor and destructor stubs generated automatically by VC++ when the retrieval class was created
retrieval::retrieval()
{

}

retrieval::~retrieval()
{

}
*/
// Records one more occurrence and remembers the text id it occurred in.
// Count is always incremented; the id is appended to TextID only when it
// differs from the most recently appended id, so consecutive repeats of the
// same id are stored once.
void CHzInfo::AddTextID(WORD id)
{
	Count++;

	const int lastIndex=TextID.GetSize()-1;
	const BOOL sameAsLast=(lastIndex>=0 && TextID[lastIndex]==id);
	if(!sameAsLast)
		TextID.Add(id);
}
/*
void CHzInfo::DeleteTextID(WORD id)
{
	Count--;
	int n=TextID.GetSize();
	int m=TextID[n-1];
//	if (n>0 && id==TextID[n-1])
//		TextID.RemoveAt(n-1); // 把待删除的语料库文件从数组中移走
	TextID.RemoveAt(id);
}
*/
IMPLEMENT_SERIAL(CHzInfo,CObject,0)

// Stores or loads Count followed by the TextID array.
// Note that storing is destructive: once the data has been written,
// Count is reset to 0 and TextID is emptied.
void CHzInfo::Serialize(CArchive &ar)
{
	if(!ar.IsStoring()) {
		// Loading: read back in the same order the data was written.
		ar >> Count;
		TextID.Serialize(ar);
		return;
	}

	// Storing: write each member, then clear it.
	ar << Count;
	Count=0;
	TextID.Serialize(ar);
	TextID.RemoveAll();
}

// Searches the corpus files selected by txtID for the string key and writes
// every matching line to "output.txt".
//
//   txtID - indices into the global `texts` array naming the files to scan;
//           nothing happens when it is empty.
//   key   - the string to look for. Keys longer than 2 bytes are located with
//           strstr(); keys of exactly 2 bytes are located with FindOneHZ();
//           shorter keys never match.
//
// Matching lines are numbered consecutively across all files and grouped
// under a "=== <file name>: ===" heading. A message box reports the total
// number of matches, or that nothing was found.
void Retrieve(CWordArray * txtID, CString key)
{
	if(txtID->GetSize()==0)
		return;

	FILE *out=fopen("output.txt","wt");
	if(out==NULL) {
		AfxMessageBox("无法创建检索输出文件!");
		return;
	}

	CStdioFile outFile(out);
	outFile.WriteString("*** 查找汉字串:“");
	outFile.WriteString(key+"”*** \n\n");

	// key is a by-value copy that is never modified below, so its length and
	// buffer pointer stay valid for the whole search.
	const int keyLen=key.GetLength();
	const char *needle=(const char *)key;

	int hitCount=0;
	for(int idx=0;idx<txtID->GetSize();idx++) {
		const int id=txtID->GetAt(idx);
		if(id>=texts.GetSize()) {
			AfxMessageBox("超出数组范围!!!");
			continue;
		}

		CString fname=texts[id];
		FILE *in=fopen((const char *)fname,"rt");
		if(in==NULL) {
			AfxMessageBox("找不到语料文件!"+fname);
			continue;
		}

		CStdioFile inFile(in);
		char line[3000];
		CString hits="",label;
		while(inFile.ReadString(line,3000)) {
			BOOL matched=FALSE;
			if(keyLen>2)
				matched=(strstr(line,needle)!=NULL);
			else if(keyLen==2)
				matched=(FindOneHZ(line,needle)>=0);
			if(!matched)
				continue;

			hitCount++;
			label.Format("例%04d:",hitCount);
			hits+=label+line;
			hits+='\n';
		}

		// Only files that produced at least one match get a section.
		if(hits.GetLength()>0) {
			outFile.WriteString("=== "+fname+": ===\n\n");
			outFile.WriteString(hits+"\n");
		}
		inFile.Close();
	}
	outFile.Close();

	if(hitCount<=0) {
		AfxMessageBox("找不到这个汉字串!");
		return;
	}

	CString msg;
	msg.Format("共有%d个实例符合检索条件,\n请打开output.txt文件看检索结果!",hitCount);
	AfxMessageBox(msg);
}

// Returns TRUE when s is a non-empty string made up entirely of two-byte
// GB2312 Chinese characters, FALSE otherwise.
//
// A character is accepted when its lead byte is in 0xB0..0xF7 and its trail
// byte is in 0xA1..0xFE. That is a 72 x 94 grid of 6768 code points, the same
// size as the hzInfo table declared in this file.
// NOTE(review): the upper bounds were previously unchecked, so bytes such as
// 0xF8..0xFF (lead) or 0xFF (trail) were accepted; presumably callers index
// hzInfo from these bytes, which would then run past the table - confirm.
BOOL GoodHzStr(CString s)
{
	const int n=s.GetLength();
	if(n==0 || n%2!=0)
		return FALSE;

	for(int i=0;i<n;i+=2) {
		const unsigned char lead=(unsigned char)s[i];
		const unsigned char trail=(unsigned char)s[i+1];
		if(lead<0xB0 || lead>0xF7)
			return FALSE;
		if(trail<0xA1 || trail>0xFE)
			return FALSE;
	}
	return TRUE;
}

// Builds a newly allocated array holding the values that occur in both wi
// and wj, using a single forward pass over each array (the lower of the two
// current values is skipped; equal values are copied and both sides advance).
// The pass only finds all common values if both inputs are in ascending order.
// The result is allocated with new; the caller is responsible for deleting it.
CWordArray *Intersection(CWordArray &wi,CWordArray &wj)
{
	CWordArray *common=new CWordArray;

	int a=0;
	int b=0;
	while(a<wi.GetSize() && b<wj.GetSize()) {
		if(wi[a]==wj[b]) {
			common->Add(wi[a]);
			a++;
			b++;
			continue;
		}

		if(wi[a]<wj[b])
			a++;
		else
			b++;
	}
	return common;
}

// Scans every corpus file in the global `texts` array for lines containing a
// reduplication of the form named by dup and writes them to "dup.txt".
//
//   dup - pattern name; the supported names are listed in the table below.
//
// Each line is passed to FindDupPattern(); a line counts as a match when the
// returned code equals the code of the requested pattern, and the annotated
// text that FindDupPattern() leaves in RetrievedLine is what gets written.
// Matches are numbered consecutively across all files and grouped under a
// "=== <file name>: ===" heading. A message box reports the total number of
// matches, or that nothing was found. Nothing happens when `texts` is empty.
void RetrievalDupPattern(CString dup)
{
	if(texts.GetSize()==0)
		return;

	FILE *out=fopen("dup.txt","wt");
	if(out==NULL) {
		AfxMessageBox("无法创建检索输出文件!");
		return;
	}

	// Supported pattern names; the expected FindDupPattern() code for the
	// entry at position k is k+1.
	static const char * const patternNames[]={
		"AA", "AABB", "ABAB", "A一A", "AAB",
		"ABB", "A了A", "A不A", "A没A", "A里AB"
	};
	const int patternCount=sizeof(patternNames)/sizeof(patternNames[0]);

	int testdup=0;	// stays 0 when dup is not a supported pattern
	for(int k=0;k<patternCount;k++) {
		if(dup==patternNames[k]) {
			testdup=k+1;
			break;
		}
	}

	CStdioFile outFile(out);
	outFile.WriteString("*** 重叠模式:“");
	outFile.WriteString(dup+"”*** \n\n");

	int Examples=0;
	// FindDupPattern() only ever returns -1 or one of the codes 1..10, so an
	// unsupported pattern (testdup==0) can never match; skip the scan then.
	if(testdup!=0) {
		for(int i=0;i<texts.GetSize();i++) {
			CString fname=texts[i];
			FILE *in=fopen((const char *)fname,"rt");
			if(in==NULL) {
				AfxMessageBox("找不到语料文件!"+fname);
				continue;
			}

			CStdioFile inFile(in);
			char s[3000];
			CString ss="",dd;
			while(inFile.ReadString(s,3000)) {
				if(FindDupPattern(s,dup)!=testdup)
					continue;
				Examples++;
				dd.Format("例%04d:",Examples);
				ss+=dd+RetrievedLine;
				ss+='\n';
			}

			// Only files that produced at least one match get a section.
			if(ss.GetLength()>0) {
				outFile.WriteString("=== "+fname+": ===\n\n");
				outFile.WriteString(ss+"\n");
			}
			inFile.Close();
		}
	}

	outFile.Close();

	if(Examples>0) {
		CString msg;
		msg.Format("共有%d个实例符合检索条件,\n请打开dup.txt文件看检索结果!",Examples);
		AfxMessageBox(msg);
	}
	else
		AfxMessageBox("找不到这种重叠形式的例句!");
}

int FindDupPattern (const char * str, CString dup_pattern)
{
	char * p = (char * ) str;
	RetrievedLine="";
	CString sent;
	int templen=0;

	if (dup_pattern == "AA")
		templen = 4;
	else {
		if ((dup_pattern == "AAB") || (dup_pattern == "ABB") || (dup_pattern=="A一A") || (dup_pattern=="A了A") || (dup_pattern=="A没A") || (dup_pattern=="A不A"))
			templen = 6;
		else 
			if ((dup_pattern == "AABB") || (dup_pattern == "ABAB") || (dup_pattern=="A里AB"))
				templen = 8;
	}

	while (*p !='\0') {
		if (*p>0) { // 如果p指向的不是汉字
			RetrievedLine=RetrievedLine + *p;
			p ++ ;
		}
		else {
			if (*p<0) { // 如果p指向一个汉字
				CString tempunit="";
				int CurrentStringLength=0;

				while (CurrentStringLength<templen && (*p < 0)) {
					sent = p;
					tempunit = tempunit+sent.Left(2);
					CurrentStringLength+=2;
					p+=2;
				}

				if (CurrentStringLength==4) {
					if (templen == 4) {
						if (dup_pattern == "AA" && wordType(tempunit)=="AA") {
							sent=p;
							RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
							return 1;
						}
						else { 
							RetrievedLine = RetrievedLine + *(p-4) + *(p-3);
							p = p-2;
						}
					}
					else {
						RetrievedLine = RetrievedLine + tempunit;
					}
				}
				else {
					if (CurrentStringLength==8) {
						if (dup_pattern == "AABB" && wordType(tempunit)=="AABB") {
							sent=p;
							RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
							return 2;
						}
						
						if (dup_pattern == "ABAB" && wordType(tempunit)=="ABAB") {
							sent=p;
							RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
							return 3;
						}
						
						if (dup_pattern == "A里AB" && wordType(tempunit)=="A里AB") {
							sent=p;
							RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
							return 10;				
						}
						else {
							RetrievedLine = RetrievedLine + *(p-8) + *(p-7);
							p = p-6;
						}
					}
					else {	
						if (CurrentStringLength==6) {
							if (dup_pattern == "A一A" && wordType(tempunit)=="A一A") {
								sent = p;
								RetrievedLine = RetrievedLine +  "【" + tempunit + "】" + sent;
								return 4;
							}						
						
							if (dup_pattern == "AAB" && wordType(tempunit)=="AAB") {
								sent = p;
								RetrievedLine = RetrievedLine +  "【" + tempunit + "】" + sent;
								return 5;
							}
						
							if (dup_pattern == "ABB" && wordType(tempunit)=="ABB") {
								sent = p;
								RetrievedLine = RetrievedLine +  "【" + tempunit + "】" + sent;
								return 6;
							}
												
							if (dup_pattern == "A了A" && wordType(tempunit)=="A了A") {
								sent = p;
								RetrievedLine = RetrievedLine +  "【" + tempunit + "】" + sent;
								return 7;
							}
						
							if (dup_pattern == "A不A" && wordType(tempunit)=="A不A") {
								sent = p;
								RetrievedLine = RetrievedLine +  "【" + tempunit + "】" + sent;
								return 8;
							}
						
							if (dup_pattern == "A没A" && wordType(tempunit)=="A没A") {
								sent = p;
								RetrievedLine = RetrievedLine +  "【" + tempunit + "】" + sent;
								return 9;
							}
							else {
								RetrievedLine = RetrievedLine + *(p-6) + *(p-5);
								p = p-4;
							}
						}
						else {
							RetrievedLine=RetrievedLine+tempunit;
						}
					}
				}
			}
			else {
				RetrievedLine=RetrievedLine + *p + *(p+1);
				p+=2;
			}
		}
	}
	return -1;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -