// [web viewer residue] File: retrieval.cpp
// [web viewer residue] Font size:
// retrieval.cpp: implementation of the retrieval class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "TestCorpus.h"
#include "retrieval.h"
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
extern CString corpusName;
extern CStringArray texts;
extern CHzInfo hzInfo[6768];
extern int FindOneHZ(const char* str, const char *hz);
extern CString wordType(CString &w);
static CString RetrievedLine;
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
/* 创建retrieval类时VC++自动生成的部分,构造函数与析构函数
retrieval::retrieval()
{
}
retrieval::~retrieval()
{
}
*/
// Records one more occurrence for this entry and remembers the text it
// occurred in.
//   id - identifier of the text being recorded.
// Count is incremented on every call. The id is appended to TextID only when
// it differs from the most recently stored id, so repeated calls for the same
// text in a row store that id once.
void CHzInfo::AddTextID(WORD id)
{
    ++Count;

    const int lastIndex = TextID.GetSize() - 1;
    if (lastIndex < 0 || TextID[lastIndex] != id)
        TextID.Add(id);
}
/*
void CHzInfo::DeleteTextID(WORD id)
{
Count--;
int n=TextID.GetSize();
int m=TextID[n-1];
// if (n>0 && id==TextID[n-1])
// TextID.RemoveAt(n-1); // 把待删除的语料库文件从数组中移走
TextID.RemoveAt(id);
}
*/
IMPLEMENT_SERIAL(CHzInfo,CObject,0)

// Reads or writes this entry through an MFC archive.
//   ar - archive to load from or store to.
// Loading restores Count and then the TextID array.
// Storing writes Count and then TextID, and resets each one in memory right
// after it has been written, so the object is empty once it has been stored.
void CHzInfo::Serialize(CArchive &ar)
{
    if (!ar.IsStoring()) {
        ar >> Count;
        TextID.Serialize(ar);
        return;
    }

    ar << Count;
    Count = 0;
    TextID.Serialize(ar);
    TextID.RemoveAll();
}
// Searches the listed corpus texts for a key string and writes every matching
// line to "output.txt".
//   txtID - indices into the global `texts` array naming the files to scan;
//           nothing happens when the array is empty.
//   key   - string to look for. Keys longer than two bytes are located with
//           strstr; two-byte keys go through FindOneHZ; shorter keys never
//           match.
// Matching lines are numbered consecutively across all files and grouped
// under a per-file heading. A message box reports the total, or that nothing
// was found.
// NOTE(review): strstr compares raw bytes, so in double-byte text it could
// match across a character boundary - confirm whether that matters here.
void Retrieve(CWordArray * txtID, CString key)
{
    if (txtID->GetSize() == 0)
        return;

    FILE *outStream = fopen("output.txt", "wt");
    if (outStream == NULL) {
        AfxMessageBox("无法创建检索输出文件!");
        return;
    }

    CStdioFile report(outStream);
    report.WriteString("*** 查找汉字串:“");
    report.WriteString(key + "”*** \n\n");

    const int keyLen = key.GetLength();
    const char *needle = (const char *)key;
    int hitCount = 0;
    char lineBuf[3000];

    for (int k = 0; k < txtID->GetSize(); k++) {
        const int textIndex = txtID->GetAt(k);
        if (textIndex >= texts.GetSize()) {
            AfxMessageBox("超出数组范围!!!");
            continue;
        }

        CString path;
        path = texts[textIndex];
        FILE *inStream = fopen((const char *)path, "rt");
        if (inStream == NULL) {
            AfxMessageBox("找不到语料文件!" + path);
            continue;
        }

        CStdioFile corpus(inStream);
        CString hits = "", label;
        while (corpus.ReadString(lineBuf, 3000)) {
            BOOL matched = FALSE;
            if (keyLen > 2) {
                if (strstr(lineBuf, needle))
                    matched = TRUE;
            }
            else if (keyLen == 2) {
                if (FindOneHZ(lineBuf, needle) >= 0)
                    matched = TRUE;
            }
            if (!matched)
                continue;

            hitCount++;
            label.Format("例%04d:", hitCount);
            hits += label + lineBuf;
            hits += '\n';
        }

        // Only files that produced at least one hit get a section in the report.
        if (hits.GetLength() > 0) {
            report.WriteString("=== " + path + ": ===\n\n");
            report.WriteString(hits + "\n");
        }
        corpus.Close();
    }
    report.Close();

    if (hitCount <= 0) {
        AfxMessageBox("找不到这个汉字串!");
        return;
    }
    CString summary;
    summary.Format("共有%d个实例符合检索条件,\n请打开output.txt文件看检索结果!", hitCount);
    AfxMessageBox(summary);
}
// Tells whether a string consists solely of double-byte hanzi codes.
//   s - candidate string, examined byte by byte in pairs.
// Returns TRUE when s is non-empty, has an even byte length, and every byte
// pair has a lead byte in 0xB0..0xF7 and a trail byte in 0xA1..0xFE;
// FALSE otherwise.
// The upper bounds matter: 0xB0..0xF7 by 0xA1..0xFE is 72 x 94 = 6768 cells,
// which is exactly the size of the hzInfo table. Checking only the lower
// bounds lets through codes that cannot correspond to a slot in that table.
// NOTE(review): assumes callers index hzInfo as (lead-0xB0)*94 + (trail-0xA1)
// - confirm against the indexing code.
BOOL GoodHzStr(CString s)
{
    const int len = s.GetLength();
    if (len == 0 || len % 2 != 0)
        return FALSE;

    for (int i = 0; i < len; i += 2) {
        const unsigned char lead = (unsigned char)s[i];
        const unsigned char trail = (unsigned char)s[i + 1];
        if (lead < 0xB0 || lead > 0xF7)
            return FALSE;
        if (trail < 0xA1 || trail > 0xFE)
            return FALSE;
    }
    return TRUE;
}
// Builds the list of values that appear in both input arrays.
//   wi, wj - arrays to compare. The walk advances whichever side currently
//            holds the smaller value, so both are expected to be in
//            ascending order.
// Returns a newly allocated CWordArray holding the common values in the
// order encountered. It is created with `new`; the caller is responsible for
// deleting it.
CWordArray *Intersection(CWordArray &wi,CWordArray &wj)
{
    CWordArray *common = new CWordArray;

    int a = 0;
    int b = 0;
    while (a < wi.GetSize() && b < wj.GetSize()) {
        const WORD left = wi[a];
        const WORD right = wj[b];

        if (left < right) {
            a++;
        }
        else if (right < left) {
            b++;
        }
        else {
            // Same value on both sides: keep it and move past it in both arrays.
            common->Add(left);
            a++;
            b++;
        }
    }
    return common;
}
// Scans every corpus file in the global `texts` array for lines containing a
// given reduplication pattern and writes the hits to "dup.txt".
//   dup - pattern name; one of "AA", "AABB", "ABAB", "A一A", "AAB", "ABB",
//         "A了A", "A不A", "A没A", "A里AB".
// Each matching line is written in the annotated form that FindDupPattern
// leaves in RetrievedLine, numbered consecutively across all files and
// grouped under a per-file heading. A message box reports the total, or that
// nothing was found.
// A pattern name outside the list above can never match, so it is reported
// as "not found" straight away, without creating dup.txt or reading any file.
void RetrievalDupPattern(CString dup)
{
    if(texts.GetSize()==0)
        return;

    // Translate the pattern name into the code FindDupPattern returns for it.
    int testdup=0;
    if (dup=="AA") testdup=1;
    if (dup=="AABB") testdup=2;
    if (dup=="ABAB") testdup=3;
    if (dup=="A一A") testdup=4;
    if (dup=="AAB") testdup=5;
    if (dup=="ABB") testdup=6;
    if (dup=="A了A") testdup=7;
    if (dup=="A不A") testdup=8;
    if (dup=="A没A") testdup=9;
    if (dup=="A里AB") testdup=10;
    if(testdup==0) {
        // Unknown pattern: FindDupPattern only ever returns -1 or one of the
        // codes above, so scanning the corpus would be pointless.
        AfxMessageBox("找不到这种重叠形式的例句!");
        return;
    }

    FILE *out=fopen("dup.txt","wt");
    if(!out) {
        AfxMessageBox("无法创建检索输出文件!");
        return;
    }
    CStdioFile outFile(out);
    outFile.WriteString("*** 重叠模式:“");
    outFile.WriteString(dup+"”*** \n\n");

    int Examples=0;
    for(int i=0;i<texts.GetSize();i++) {
        CString fname=texts[i];
        FILE *in=fopen((const char *) fname,"rt");
        if(!in) {
            AfxMessageBox("找不到语料文件!"+fname);
            continue;
        }
        CStdioFile inFile(in);
        char s[3000];
        CString ss="",dd;
        while(inFile.ReadString(s,3000)) {
            if(FindDupPattern(s,(const char *)dup)==testdup) {
                Examples++;
                dd.Format("例%04d:",Examples);
                // RetrievedLine holds the line with the match bracketed.
                ss+=dd+RetrievedLine;
                ss+='\n';
            }
        }
        // Only files that produced at least one hit get a section in the report.
        if(ss.GetLength()>0) {
            outFile.WriteString("=== "+fname+": ===\n\n");
            outFile.WriteString(ss+"\n");
        }
        inFile.Close();
    }
    outFile.Close();

    if(Examples>0) {
        CString msg;
        msg.Format("共有%d个实例符合检索条件,\n请打开dup.txt文件看检索结果!",Examples);
        AfxMessageBox(msg);
    }
    else
        AfxMessageBox("找不到这种重叠形式的例句!");
}
int FindDupPattern (const char * str, CString dup_pattern)
{
char * p = (char * ) str;
RetrievedLine="";
CString sent;
int templen=0;
if (dup_pattern == "AA")
templen = 4;
else {
if ((dup_pattern == "AAB") || (dup_pattern == "ABB") || (dup_pattern=="A一A") || (dup_pattern=="A了A") || (dup_pattern=="A没A") || (dup_pattern=="A不A"))
templen = 6;
else
if ((dup_pattern == "AABB") || (dup_pattern == "ABAB") || (dup_pattern=="A里AB"))
templen = 8;
}
while (*p !='\0') {
if (*p>0) { // 如果p指向的不是汉字
RetrievedLine=RetrievedLine + *p;
p ++ ;
}
else {
if (*p<0) { // 如果p指向一个汉字
CString tempunit="";
int CurrentStringLength=0;
while (CurrentStringLength<templen && (*p < 0)) {
sent = p;
tempunit = tempunit+sent.Left(2);
CurrentStringLength+=2;
p+=2;
}
if (CurrentStringLength==4) {
if (templen == 4) {
if (dup_pattern == "AA" && wordType(tempunit)=="AA") {
sent=p;
RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
return 1;
}
else {
RetrievedLine = RetrievedLine + *(p-4) + *(p-3);
p = p-2;
}
}
else {
RetrievedLine = RetrievedLine + tempunit;
}
}
else {
if (CurrentStringLength==8) {
if (dup_pattern == "AABB" && wordType(tempunit)=="AABB") {
sent=p;
RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
return 2;
}
if (dup_pattern == "ABAB" && wordType(tempunit)=="ABAB") {
sent=p;
RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
return 3;
}
if (dup_pattern == "A里AB" && wordType(tempunit)=="A里AB") {
sent=p;
RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
return 10;
}
else {
RetrievedLine = RetrievedLine + *(p-8) + *(p-7);
p = p-6;
}
}
else {
if (CurrentStringLength==6) {
if (dup_pattern == "A一A" && wordType(tempunit)=="A一A") {
sent = p;
RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
return 4;
}
if (dup_pattern == "AAB" && wordType(tempunit)=="AAB") {
sent = p;
RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
return 5;
}
if (dup_pattern == "ABB" && wordType(tempunit)=="ABB") {
sent = p;
RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
return 6;
}
if (dup_pattern == "A了A" && wordType(tempunit)=="A了A") {
sent = p;
RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
return 7;
}
if (dup_pattern == "A不A" && wordType(tempunit)=="A不A") {
sent = p;
RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
return 8;
}
if (dup_pattern == "A没A" && wordType(tempunit)=="A没A") {
sent = p;
RetrievedLine = RetrievedLine + "【" + tempunit + "】" + sent;
return 9;
}
else {
RetrievedLine = RetrievedLine + *(p-6) + *(p-5);
p = p-4;
}
}
else {
RetrievedLine=RetrievedLine+tempunit;
}
}
}
}
else {
RetrievedLine=RetrievedLine + *p + *(p+1);
p+=2;
}
}
}
return -1;
}
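// [web viewer residue] Keyboard shortcuts:
//   Copy code: Ctrl + C
//   Search code: Ctrl + F
//   Full-screen mode: F11
//   Toggle theme: Ctrl + Shift + D
//   Show shortcuts: ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -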