📄 cataloglist.cpp
字号:
}
long CCatalogList::GetDocNum()
{
long i=0;
POSITION pos=GetFirstPosition();
while(pos!=NULL)
{
CCatalogNode& catanode=GetNext(pos);
i+=catanode.GetDocNum();
}
return i;
}
// Returns the number of catalog nodes currently stored in the list.
int CCatalogList::GetCataNum()
{
    const int nCataCount = m_lstCatalogList.GetCount();
    return nCataCount;
}
// Appends docnode to the tail of this catalog's document list and
// returns the list position reported by AddTail.
POSITION CCatalogNode::AddDoc(CDocNode &docnode)
{
    const POSITION posAdded = m_lstDocList.AddTail(docnode);
    return posAdded;
}
bool CCatalogList::GetCataName(short idxCata,CString& strCataName)
{
POSITION pos=GetFirstPosition();
while(pos!=NULL)
{
CCatalogNode& catanode=GetNext(pos);
if(idxCata==catanode.m_idxCata)
{
strCataName=catanode.m_strCatalogName;
return true;
}
}
return false;
}
bool CCatalogList::GetDocName(short idxCata, long idxDoc, CString &strDocName)
{
POSITION pos_cata=GetFirstPosition();
while(pos_cata!=NULL)
{
CCatalogNode& catanode=GetNext(pos_cata);
if(idxCata==catanode.m_idxCata)
{
POSITION pos_doc=catanode.GetFirstPosition();
while(pos_doc!=NULL)
{
CDocNode& docnode=catanode.GetNext(pos_doc);
if(idxDoc==docnode.m_idxDoc)
{
strDocName=docnode.m_strDocName;
return true;
}
}
return false;
}
}
return false;
}
// Appends catanode to the tail of the catalog list and returns
// the list position reported by AddTail.
POSITION CCatalogList::AddCata(CCatalogNode &catanode)
{
    const POSITION posAdded = m_lstCatalogList.AddTail(catanode);
    return posAdded;
}
// Returns the head position of the catalog list, as reported by GetHeadPosition.
POSITION CCatalogList::GetFirstPosition()
{
    const POSITION posHead = m_lstCatalogList.GetHeadPosition();
    return posHead;
}
// Returns the catalog node at rPos; rPos is passed by reference to the
// underlying list's GetNext, which moves it on to the following element.
CCatalogNode& CCatalogList::GetNext(POSITION &rPos)
{
    CCatalogNode& node = m_lstCatalogList.GetNext(rPos);
    return node;
}
// Returns the head position of this catalog's document list.
POSITION CCatalogNode::GetFirstPosition()
{
    const POSITION posHead = m_lstDocList.GetHeadPosition();
    return posHead;
}
// Returns the document node at rPos; rPos is passed by reference to the
// underlying list's GetNext, which moves it on to the following element.
CDocNode& CCatalogNode::GetNext(POSITION &rPos)
{
    CDocNode& doc = m_lstDocList.GetNext(rPos);
    return doc;
}
// Returns the number of documents stored in this catalog.
UINT CCatalogNode::GetDocNum()
{
    const UINT nDocCount = m_lstDocList.GetCount();
    return nDocCount;
}
// Scans the directory strPath and builds one catalog node for every
// sub-directory found; each catalog node then scans its own directory
// for documents.
//
// strPath : directory to scan. The process working directory is changed
//           to it (via _chdir) as a side effect.
// Returns : -1 (after showing a message box) when the directory cannot be
//           entered; otherwise the value returned by the last
//           CCatalogNode::ScanDirectory call, or 0 when no sub-directory
//           was found.
long CCatalogList::ScanDirectory(CString strPath)
{
    if(_chdir(strPath)) // if can't find the dir
    {
        CString csTmp = "目录";
        csTmp+=strPath;
        csTmp+="不存在!";
        AfxMessageBox(csTmp);
        return -1;
    }

    long docNum=0;
    short idxCurCata=0;

    // The find data lives on the stack; nothing to free on any exit path.
    WIN32_FIND_DATA findData;
    HANDLE hFinder = ::FindFirstFile("*.*",&findData);
    if(hFinder==INVALID_HANDLE_VALUE)
        return docNum; // nothing to enumerate

    // do/while so that the entry returned by FindFirstFile itself is
    // processed too; it is not guaranteed to be "." (e.g. in a drive root).
    do
    {
        if( !strcmp(findData.cFileName,".") || !strcmp(findData.cFileName,"..") )
            continue;
        if(findData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
        {
            // Each sub-directory becomes one catalog.
            CCatalogNode catalognode;
            catalognode.m_strCatalogName=findData.cFileName;
            catalognode.m_strDirName=strPath+"\\"+catalognode.m_strCatalogName;
            catalognode.m_idxCata=idxCurCata++;

            // Work on the copy stored in the list, not on the local object.
            POSITION posTemp=AddCata(catalognode);
            CCatalogNode& cataTemp=GetAt(posTemp);
            cataTemp.SetStartDocID(docNum);
            docNum=cataTemp.ScanDirectory(catalognode.m_strDirName);
        }
    }
    while(::FindNextFile(hFinder,&findData));

    // Release the search handle (it was leaked before).
    ::FindClose(hFinder);
    return docNum;
}
// Read-only access: returns a copy of the catalog node stored at pos.
CCatalogNode CCatalogList::GetAt(POSITION pos) const
{
    const CCatalogNode& node = m_lstCatalogList.GetAt(pos);
    return node;
}
// Returns a modifiable reference to the catalog node stored at pos.
CCatalogNode& CCatalogList::GetAt(POSITION pos)
{
    CCatalogNode& node = m_lstCatalogList.GetAt(pos);
    return node;
}
// Returns a pointer to the first catalog node whose m_idxCata equals
// idxCata, or NULL when no catalog carries that id.
CCatalogNode* CCatalogList::GetCata(short idxCata)
{
    for (POSITION posCata = GetFirstPosition(); posCata != NULL; )
    {
        CCatalogNode& node = GetNext(posCata);
        if (node.m_idxCata == idxCata)
        {
            return &node;
        }
    }
    return NULL;
}
// Searches every catalog, in list order, for a document named strDocName.
// Returns a pointer to the first match, or NULL when none is found.
CDocNode* CCatalogList::GetDocByName(CString strDocName)
{
    for (POSITION posCata = GetFirstPosition(); posCata != NULL; )
    {
        CCatalogNode& cata = GetNext(posCata);
        for (POSITION posDoc = cata.GetFirstPosition(); posDoc != NULL; )
        {
            CDocNode& doc = cata.GetNext(posDoc);
            if (doc.m_strDocName == strDocName)
            {
                return &doc;
            }
        }
    }
    return NULL;
}
// Returns a pointer to the first catalog node named strCataName,
// or NULL when no catalog has that name.
CCatalogNode* CCatalogList::GetCataByName(CString strCataName)
{
    for (POSITION posCata = GetFirstPosition(); posCata != NULL; )
    {
        CCatalogNode& node = GetNext(posCata);
        if (node.m_strCatalogName == strCataName)
        {
            return &node;
        }
    }
    return NULL;
}
// Returns the m_idxCata of the first catalog named strCataName,
// or -1 when no catalog has that name.
short CCatalogList::GetCataIDByName(CString strCataName)
{
    for (POSITION posCata = GetFirstPosition(); posCata != NULL; )
    {
        CCatalogNode& node = GetNext(posCata);
        if (node.m_strCatalogName == strCataName)
        {
            return node.m_idxCata;
        }
    }
    return -1;
}
// Returns the sum of the s_tfi fields over the first m_nAllocLen
// entries of m_sWeightSet.
int CDocNode::GetWordNum()
{
    int nTotal = 0;
    int idx = 0;
    while (idx < m_nAllocLen)
    {
        nTotal += m_sWeightSet[idx].s_tfi;
        ++idx;
    }
    return nTotal;
}
// Copies every entry of the scratch vector m_pTemp whose weight exceeds
// dZero into m_sWeightSet, in index order.
// AllocBuffer is called with the number of such entries before copying.
// Returns the number of entries copied.
int CDocNode::GenDocVector()
{
    int idx;

    // Pass 1: count the entries that will be kept.
    int nKept = 0;
    for (idx = 0; idx < m_nAllocTempLen; ++idx)
    {
        if (m_pTemp[idx].s_dWeight > dZero)
            ++nKept;
    }
    AllocBuffer(nKept);

    // Pass 2: copy them over.
    int nOut = 0;
    for (idx = 0; idx < m_nAllocTempLen; ++idx)
    {
        if (!(m_pTemp[idx].s_dWeight > dZero))
            continue;
        m_sWeightSet[nOut].s_idxWord = m_pTemp[idx].s_idxWord;
        m_sWeightSet[nOut].s_tfi = m_pTemp[idx].s_tfi;
        m_sWeightSet[nOut].s_dWeight = m_pTemp[idx].s_dWeight;
        ++nOut;
    }
    return nOut;
}
// Builds doc.words from the scratch vector m_pTemp.
//
// Every m_pTemp entry with a weight > 0 becomes one SVM_WORD whose wnum is
// the entry's index + 1 (feature ids in DOC start at 1). The array is
// terminated by an element with wnum == 0, and doc.docnum is set to -1.
// doc.words is allocated with malloc.
// NOTE(review): presumably the caller releases doc.words with free();
// no release is visible in this function - confirm against callers.
//
// Returns the number of words written, or -1 when the scratch vector is
// not available or the allocation fails.
int CDocNode::GenDocVector(DOC &doc)
{
    if(m_pTemp==NULL||m_nAllocTempLen<=0) return -1;

    int i,nSum=0;
    // Pass 1: count the non-zero features to size the allocation.
    for(i=0;i<CDocNode::m_nAllocTempLen;i++)
    {
        if(CDocNode::m_pTemp[i].s_dWeight>0) nSum++;
    }

    doc.words=static_cast<SVM_WORD *>(malloc(sizeof(SVM_WORD)*(nSum+12)));
    // malloc may fail; do not write through a NULL pointer.
    if(doc.words==NULL) return -1;

    // Pass 2: fill in the features.
    nSum=0;
    for(i=0;i<CDocNode::m_nAllocTempLen;i++)
    {
        if(CDocNode::m_pTemp[i].s_dWeight>0)
        {
            // Feature ids in DOC start at 1.
            (doc.words[nSum]).wnum=i+1;
            (doc.words[nSum]).weight=CDocNode::m_pTemp[i].s_dWeight;
            nSum++;
        }
    }
    // wnum == 0 marks the end of the word list.
    (doc.words[nSum]).wnum=0;
    doc.docnum=-1;
    return nSum;
}
// Segments the NUL-terminated text in pPath (despite its name, this is the
// text itself, not a path) against the dictionary wordList and fills the
// scratch vector m_pTemp with one entry per dictionary word found.
//
// The text is split into sentences by ParseFile. A sentence whose first
// byte is positive is looked up as a single (English) word; otherwise it
// is treated as a double-byte (Chinese) string and matched backwards,
// longest candidate first.
// After scanning, each weight is multiplied by its term frequency and the
// vector is scaled to unit Euclidean length.
//
// Returns the number of dictionary hits (realcnt).
int CDocNode::ScanChineseStringWithDict(char *pPath,CWordList& wordList)
{
    char *buffer=pPath;
    int i,j,k;
    short l,n;
    char gram[12];
    // realcnt: number of dictionary words found in the text
    // nStart : start of the current sentence inside buffer
    int realcnt=0,nStart=0,nNewStart=0;
    // length of the current sentence in bytes
    int nSentenceLen=0;
    memset(m_pTemp,0,sizeof(sWeightNode)*m_nAllocTempLen);
    CWordNode wordNode;
    while(buffer[nStart]!='\0')
    {
        nSentenceLen=ParseFile(buffer,nStart,nNewStart);
        nStart=nNewStart;
        if(nSentenceLen==0) continue;
        if(m_pSentence[0]>0) // the sentence is an English word
        {
            // only words of at least 2 bytes that do not start with a digit
            if((nSentenceLen>=2)&&((m_pSentence[0]<'0')||(m_pSentence[0]>'9')))
            {
                if(wordList.Lookup(m_pSentence,wordNode))
                {
                    m_pTemp[wordNode.m_nWordID].s_idxWord=wordNode.m_nWordID;
                    m_pTemp[wordNode.m_nWordID].s_tfi+=1;
                    m_pTemp[wordNode.m_nWordID].s_dWeight=wordNode.m_dWeight;
                    realcnt++;
                }
            }
        }
        else // the sentence is a Chinese (double-byte) string
        {
            // an odd byte count cannot be a sequence of double-byte characters
            if(nSentenceLen%2!=0) continue;
            // scan the sentence backwards (the original author notes this
            // gives somewhat better segmentation accuracy)
            i=nSentenceLen;
            while(i>0)
            {
                // candidate words are at most 5 characters (10 bytes) long
                if(i>10) k=10;
                else k=i;
                for(j=k;j>0;j=j-2)
                {
                    // copy the j bytes that end at position i into gram
                    n=0;
                    for(l=j;l>0;l--) gram[n++]=m_pSentence[i-l];
                    gram[n]='\0';
                    if(wordList.Lookup(gram,wordNode))
                    {
                        m_pTemp[wordNode.m_nWordID].s_idxWord=wordNode.m_nWordID;
                        m_pTemp[wordNode.m_nWordID].s_tfi+=1;
                        m_pTemp[wordNode.m_nWordID].s_dWeight=wordNode.m_dWeight;
                        realcnt++;
                        // together with the i-=2 below this moves i back by j bytes
                        i=i-j+2;
                        break;
                    }
                }
                // no match: skip one double-byte character
                i=i-2;
            }
        }
    }
    // weight every dimension by its term frequency, then normalize
    if(realcnt>0)
    {
        double sum=0;
        for(i=0;i<m_nAllocTempLen;i++)
        {
            m_pTemp[i].s_dWeight*=m_pTemp[i].s_tfi;
            sum+=(m_pTemp[i].s_dWeight*m_pTemp[i].s_dWeight);
        }
        sum=sqrt(sum);
        // If every matched word has weight 0 the norm is 0; dividing would
        // turn the whole vector into NaN, so leave it at zero instead.
        if(sum>0)
        {
            for(i=0;i<m_nAllocTempLen;i++)
                m_pTemp[i].s_dWeight/=sum;
        }
    }
    return realcnt;
}
int CDocNode::ScanChineseWithDict(char *pPath,CWordList& wordList)
{
CFile fin;
char *buffer;
strcpy(pTempStr,pPath);
strcat(pTempStr,"\\");
strcat(pTempStr,m_strDocName.GetBuffer(0));
if(!fin.Open(pTempStr,CFile::modeRead))
return -1;
int flen=fin.GetLength();
buffer=new char[flen+1];
flen=fin.ReadHuge(buffer,flen);
buffer[flen]='\0';
fin.Close();
int num=ScanChineseStringWithDict(buffer,wordList);
delete[] buffer;
return num;
}
// Tokenizes the NUL-terminated text in pPath (despite its name, this is the
// text itself, not a path) and fills the scratch vector m_pTemp with one
// entry per dictionary word found.
//
// The buffer is modified in place: it is lower-cased and every delimiter
// is overwritten with '\0' so each token can be used as a C string.
// Delimiters are space, CR, LF and the ASCII punctuation ranges
// 33-47, 58-64, 91-96 and 123-127. Tokens of 2 bytes or fewer are ignored.
// When bStem is true, theStemmer.stem is applied to each token before lookup.
// After scanning, each weight is multiplied by its term frequency and the
// vector is scaled to unit Euclidean length.
//
// Returns the number of dictionary hits (realcnt).
int CDocNode::ScanEnglishStringWithDict(char *pPath, CWordList &wordList, bool bStem)
{
    char *buffer=pPath;
    _strlwr(buffer);
    int nFilePos=0;
    int realcnt=0,wordLen=0;
    // p marks the start of the token currently being collected
    char *p=buffer;
    memset(m_pTemp,0,sizeof(sWeightNode)*m_nAllocTempLen);
    CWordNode wordNode;
    for(;;)
    {
        const char c=buffer[nFilePos];
        // The terminating NUL also ends a token; without this the last
        // word of a text that does not end in a delimiter would be lost.
        const bool bEnd=(c=='\0');
        if(bEnd||c==' '||c=='\r'||c=='\n'||
            (c>32&&c<=47)||(c>=58&&c<=64)||(c>=91&&c<=96)||(c>=123&&c<=127))
        {
            buffer[nFilePos]='\0';
            wordLen=static_cast<int>(buffer+nFilePos-p);
            if(wordLen>2)
            {
                if(bStem) theStemmer.stem(p,0,wordLen-1);
                if(wordList.Lookup(p,wordNode))
                {
                    m_pTemp[wordNode.m_nWordID].s_idxWord=wordNode.m_nWordID;
                    m_pTemp[wordNode.m_nWordID].s_tfi+=1;
                    m_pTemp[wordNode.m_nWordID].s_dWeight=wordNode.m_dWeight;
                    realcnt++;
                }
            }
            p=buffer+nFilePos+1;
        }
        if(bEnd) break;
        nFilePos++;
    }
    // weight every dimension by its term frequency, then normalize
    int i;
    if(realcnt>0)
    {
        double sum=0;
        for(i=0;i<m_nAllocTempLen;i++)
        {
            m_pTemp[i].s_dWeight*=m_pTemp[i].s_tfi;
            sum+=(m_pTemp[i].s_dWeight*m_pTemp[i].s_dWeight);
        }
        sum=sqrt(sum);
        // If every matched word has weight 0 the norm is 0; dividing would
        // turn the whole vector into NaN, so leave it at zero instead.
        if(sum>0)
        {
            for(i=0;i<m_nAllocTempLen;i++)
                m_pTemp[i].s_dWeight/=sum;
        }
    }
    return realcnt;
}
int CDocNode::ScanEnglishWithDict(char *pPath, CWordList &wordList, bool bStem)
{
CFile fin;
char *buffer;
strcpy(pTempStr,pPath);
strcat(pTempStr,"\\");
strcat(pTempStr,m_strDocName.GetBuffer(0));
if(!fin.Open(pTempStr,CFile::modeRead))
return -1;
int flen=fin.GetLength();
buffer=new char[flen+1];
flen=fin.ReadHuge(buffer,flen);
buffer[flen]='\0';
fin.Close();
int num=ScanEnglishStringWithDict(buffer,wordList,bStem);
delete[] buffer;
return num;
}
// Returns the dot product of this document's stored vector (m_sWeightSet)
// with the scratch vector m_pTemp: for each stored entry, its weight is
// multiplied by the m_pTemp weight found at the entry's word index.
double CDocNode::ComputeSimilarityRatio()
{
    double dDot = 0.0;
    int idx = 0;
    while (idx < m_nAllocLen)
    {
        dDot += m_sWeightSet[idx].s_dWeight * m_pTemp[m_sWeightSet[idx].s_idxWord].s_dWeight;
        ++idx;
    }
    return dDot;
}
// Extracts one "sentence" from pBuffer, starting at offset nStart.
//
// The sentence, stripped of spaces and line breaks, is written to
// m_pSentence as a NUL-terminated string. Leading half-width and
// full-width (0xA1A1) spaces are skipped first. The first remaining byte
// decides the kind of sentence: a positive byte starts an English
// sentence, anything else a Chinese (double-byte) one.
//   - A Chinese sentence ends at the character 0xB5C4 ("的"), at a
//     double-byte punctuation/symbol character, at ASCII punctuation, or
//     in front of any other ASCII text character.
//   - An English sentence ends at a space, CR, LF, ASCII punctuation, or
//     in front of a byte > 127.
//
// pBuffer : NUL-terminated text.
// nStart  : offset at which parsing starts.
// nEnd    : receives the offset at which the next call should start.
// Returns : number of bytes stored in m_pSentence (0 is possible).
int CDocNode::ParseFile(char *pBuffer, int nStart, int &nEnd)
{
    // nSum is the number of bytes collected for the sentence so far
    int nCurrent,nSum=0;
    byte bChar[2];
    // whether the sentence is a Chinese one
    bool bChinese=true;
    nCurrent=nStart;
    // Skip leading full-width and half-width spaces, then decide whether
    // the sentence starts with a Chinese or an English character.
    while(pBuffer[nCurrent]!='\0')
    {
        bChar[0]=pBuffer[nCurrent];
        if(bChar[0]==' '||bChar[0]=='\r'||bChar[0]=='\n')
            nCurrent++;
        else if(bChar[0]==0xA1)
        {
            bChar[1]=pBuffer[nCurrent+1];
            if(bChar[1]==0xA1)
                nCurrent+=2;
            else
            {
                bChinese=true;
                break;
            }
        }
        else if(pBuffer[nCurrent]>0)
        {
            bChinese=false;
            break;
        }
        else
        {
            bChinese=true;
            break;
        }
    }
    while(pBuffer[nCurrent]!='\0')
    {
        bChar[0]=pBuffer[nCurrent];
        if(bChar[0]>127)
        {
            // a double-byte character ends an English sentence
            if(!bChinese) break;
            nCurrent++;
            bChar[1]=pBuffer[nCurrent];
            // The text ends with a lone lead byte: stop on the terminator.
            // Otherwise the increment at the bottom of the loop would step
            // past it and the loop would read beyond the buffer.
            if(bChar[1]=='\0') break;
            // 0xA1A1 is the full-width space; it is skipped
            if((bChar[0]!=0xA1)||(bChar[1]!=0xA1))
            {
                // the character "的" (0xB5C4), a punctuation mark or another
                // full-width symbol/letter ends the sentence
                if(((bChar[0]==0xB5)&&(bChar[1]==0xC4))||
                    ((bChar[0]==0xA1)&&(bChar[1]>0xA1)&&(bChar[1]<=0xFE))||
                    ((bChar[0]==0xA2)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xFC))||
                    ((bChar[0]==0xA3)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xFE))||
                    ((bChar[0]==0xA4)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF3))||
                    ((bChar[0]==0xA5)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF6))||
                    ((bChar[0]==0xA6)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF5))||
                    ((bChar[0]==0xA7)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF1))||
                    ((bChar[0]==0xA8)&&(bChar[1]>=0x40)&&(bChar[1]<=0xE9))||
                    ((bChar[0]==0xA9)&&(bChar[1]>=0x40)&&(bChar[1]<=0xEF))||
                    ((bChar[0]==0xAA)&&(bChar[1]==0xA5)))
                {
                    nCurrent++;
                    break;
                }
                // a second byte below 0x80 means a malformed character,
                // which is dropped; valid characters are copied
                else if(bChar[1]>=0x80)
                {
                    m_pSentence[nSum]=pBuffer[nCurrent-1];
                    nSum++;
                    m_pSentence[nSum]=pBuffer[nCurrent];
                    nSum++;
                }
            }
        }
        else
        {
            if(bChar[0]==' '||bChar[0]=='\r'||bChar[0]=='\n')
            {
                // white space ends an English sentence; inside a Chinese
                // sentence it is simply skipped
                if(!bChinese)
                {
                    nCurrent++;
                    break;
                }
            }
            else
            {
                //if(bChar[0]=='!'||bChar[0]=='?'||bChar[0]==':'||
                //	bChar[0]==';'||bChar[0]=='.')
                // ASCII punctuation ends either kind of sentence
                if((bChar[0]>32&&bChar[0]<=47)||(bChar[0]>=58&&bChar[0]<=64)||
                    (bChar[0]>=91&&bChar[0]<=96)||(bChar[0]>=123&&bChar[0]<=127))
                {
                    nCurrent++;
                    break;
                }
                // any other ASCII character ends a Chinese sentence without
                // being consumed
                else if(bChinese) break;
                else
                {
                    m_pSentence[nSum]=pBuffer[nCurrent];
                    nSum++;
                }
            }
        }
        nCurrent++;
    }
    m_pSentence[nSum]='\0';
    nEnd=nCurrent;
    return nSum;
}
// Re-assigns the m_idxCata of every catalog in this list to the id that
// the catalog of the same name has in catalogList.
// Returns false as soon as a name is not found in catalogList (catalogs
// processed before that point keep their new ids); true otherwise.
bool CCatalogList::BuildCatalogID(CCatalogList & catalogList)
{
    for (POSITION posCata = GetFirstPosition(); posCata != NULL; )
    {
        CCatalogNode& node = GetNext(posCata);
        const short nMappedID = catalogList.GetCataIDByName(node.m_strCatalogName);
        if (nMappedID < 0)
        {
            return false;
        }
        node.m_idxCata = nMappedID;
    }
    return true;
}
// Returns true when the NUL-terminated string p consists solely of the
// characters '0'..'9'. An empty string also yields true.
bool CDocNode::IsNumber(char *p)
{
    const char *pCur = p;
    // '\0' is not a digit, so the scan also stops at the terminator.
    while (*pCur >= '0' && *pCur <= '9')
        ++pCur;
    return *pCur == '\0';
}
// Converts a SMART-format category string into an array of catalog ids.
//
// line      : entries of the form "<name> <number>", separated by ';'.
//             Parsing stops at the first '\r' or at the end of the string.
// aryCataID : cleared first, then receives the id of every category whose
//             name is known to this list, in order of appearance.
// Returns   : the number of entries whose category name is not known.
short CCatalogList::GetCataIDArrayFromString(char * line, CArray<short, short> &aryCataID)
{
    int pos=0,d;
    short result=0;
    char type[MAX_PATH];
    aryCataID.RemoveAll();
    if(line==NULL) return result;

    // The field width keeps %s inside type[] (MAX_PATH is 260, so at most
    // 259 characters plus the terminator). The number after the name is
    // parsed but not used.
    while(sscanf(line+pos,"%259s %d",type,&d) >= 1)
    {
        const short id=GetCataIDByName(type);
        if(id>=0) aryCataID.Add(id);
        else result++;

        // Always move on to the next entry. Advancing only for known names
        // made an unknown name be re-read forever.
        while(line[pos]!=';'&&line[pos]!='\r'&&line[pos]!='\0')
            pos++;
        if(line[pos]!=';') break; // end of line / string: nothing left
        // Step over the ';' only; %s skips the blank that usually follows,
        // and a trailing ';' can no longer push pos past the terminator.
        pos++;
    }
    return result;
}
// Returns a modifiable reference to the document node stored at position.
CDocNode& CCatalogNode::GetAt(POSITION position)
{
    CDocNode& doc = m_lstDocList.GetAt(position);
    return doc;
}
// Returns the current value of CCatalogList::m_nSaveMode.
int CCatalogList::GetSaveMode()
{
    const int nMode = CCatalogList::m_nSaveMode;
    return nMode;
}
// Currently a stub: both parameters are ignored and m_nAllocLen is
// returned (converted to double). The per-word computation below is
// disabled and kept for reference only.
double CDocNode::ComputeProbability(CWordList &wordlist,int n)
{
    /*
    double sum=0.0;
    for(int i=0;i<m_nAllocTempLen;i++)
    {
        int j=m_pTemp[i].s_idxWord;
        CString str = wordlist.GetWordByID(i);
        CWordNode &wordnode = wordlist.m_lstWordList[str];
        sum+= wordnode.m_pCataWeightPro[n];
    }
    return sum;
    */
    const double dResult = m_nAllocLen;
    return dResult;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -