📄 classifier.cpp
字号:
CTime startTime;
CTimeSpan totalTime;
startTime=CTime::GetCurrentTime();
CMessage::PrintInfo(_T("正在对测试文档进行分类,请稍候..."));
// fname enlarged from char[10]: document names longer than 9 characters used
// to smash the stack via strcpy; all copies into fixed buffers are bounded now.
char fname[1024],type[1024],line[4096],content[100*1024];
// flag==1: the next data line holds the document's category
// flag==2: the next data line holds the document's title
// flag==3: the next data line holds the document's body text
int flag=0,nCount,len,i;
long lUnknown=0,lDocNum=0;
CStringArray typeArray;            // expected categories, kept for evaluation
bool bTitle=false;                 // true once the title line has been read
int nWordNum=m_lstTrainWordList.GetCount();
while(!feof(stream1))
{
    if(fgets(line,4096,stream1)==NULL) continue;
    if(line[0]=='.')               // ".X" control line introduces the next field
    {
        if(flag==3)                // body of previous document complete: classify it
        {
            CDocNode doc;
            posTemp=cataTemp.AddDoc(doc);
            CDocNode& docnode=cataTemp.GetAt(posTemp);
            docnode.m_strDocName=fname;
            if(m_paramClassifier.m_nClassifierType==CClassifierParam::nCT_KNN)
            {
                nCount=KNNCategory(content,docnode,false);
            }
            else
            {
                // SVM path: build the document's sparse vector and append it
                // to the SVM-format input file (stream2, opened by the caller).
                if(m_paramClassifier.m_nLanguageType==CClassifierParam::nLT_Chinese)
                    nCount=docnode.ScanChineseStringWithDict(content,m_lstTrainWordList)-1;
                else
                    nCount=docnode.ScanEnglishStringWithDict(content,m_lstTrainWordList,
                        m_paramClassifier.m_bStem)-1;
                fprintf(stream2,"%d",1);   // placeholder label; true class unknown here
                for(i=0;i<nWordNum;i++)
                {
                    if(docnode.m_pTemp[i].s_tfi!=0)
                        fprintf(stream2," %d:%f",i+1,docnode.m_pTemp[i].s_dWeight);
                }
                fprintf(stream2,"\n");
            }
            if(m_paramClassifier.m_bEvaluation) typeArray.Add(type);
            if(nCount<0)
            {
                CMessage::PrintError("无法识别文档"+docnode.m_strDocName+"的类别!");
                lUnknown++;
            }
            CMessage::PrintStatusInfo(_T("扫描文档")+docnode.m_strDocName);
            // reset per-document state for the next record
            flag=0;
            bTitle=false;
            fname[0]=0;
            type[0]=0;
            content[0]=0;
            lDocNum++;
        }
        switch(line[1])
        {
        case 'I':   // ".I <name>" -- document identifier
            // bounded copy (original strcpy'd into a 10-byte buffer)
            strncpy(fname,line+3,sizeof(fname)-1);
            fname[sizeof(fname)-1]='\0';
            len=strlen(fname);
            // Strip the trailing line terminator. The original used '=' instead
            // of '==' here, which always truncated the last character -- that
            // accidentally removed the '\n' fgets keeps; strip it explicitly.
            if(len>0&&fname[len-1]=='\n') fname[--len]='\0';
            if(len>0&&fname[len-1]=='\r') fname[--len]='\0';
            break;
        case 'C':   // ".C" -- category follows
            flag=1;
            break;
        case 'T':   // ".T" -- title follows
            flag=2;
            break;
        case 'W':   // ".W" -- body text follows
            flag=3;
            break;
        }
    }
    else    // data line: route it according to the current field flag
    {
        switch(flag)
        {
        case 1:
            // bounded copy: line can be up to 4095 bytes, type is 1024
            strncpy(type,line,sizeof(type)-1);
            type[sizeof(type)-1]='\0';
            break;
        case 2: // title: first title line starts content, later ones append
            if(!bTitle)
            {
                strcpy(content,line);
                bTitle=true;
            }
            else
                strcat(content,line);
            break;
        case 3: // body text: append
            // NOTE(review): content is fixed at 100 KiB; a longer document
            // still overflows here -- needs a bounded append or CString.
            strcat(content,line);
            break;
        }
    }
}
// NOTE(review): if the input does not end with a final '.' control line, the
// last document's body is never flushed/classified -- confirm the file format
// guarantees a trailing control line.
fclose(stream1);
CMessage::PrintStatusInfo("");
if(m_paramClassifier.m_nClassifierType==CClassifierParam::nCT_SVM)
{
    fclose(stream2);
    startTime=CTime::GetCurrentTime();
    CMessage::PrintInfo(_T("正在使用SVM分类器对文档进行分类,请稍候..."));
    m_theSVM.com_param.classifyfile=m_paramClassifier.m_strResultDir+"\\test.dat";
    SVMClassifyVectorFile(m_theSVM.com_param.classifyfile);
    totalTime=CTime::GetCurrentTime()-startTime;
    CMessage::PrintInfo(_T("SVM分类过程结束,耗时")+totalTime.Format("%H:%M:%S"));
}
long lCorrect=SaveResults(m_lstTestCatalogList,m_paramClassifier.m_strResultDir+"\\results.txt",&typeArray);
long lTotalNum=m_lstTestCatalogList.GetDocNum()-lUnknown;
CString str;
totalTime=CTime::GetCurrentTime()-startTime;
CMessage::PrintInfo(_T("测试文档分类结束,耗时")+totalTime.Format("%H:%M:%S"));
if (lUnknown>0)
{
    str.Format("无法分类的文档数%d:",lUnknown);
    CMessage::PrintInfo(str);
}
if(m_paramClassifier.m_bEvaluation&&lTotalNum>0&&lCorrect>0)
    str.Format("测试文档总数:%d,准确率:%f",m_lstTestCatalogList.GetDocNum(),(float)(lCorrect)/(float)(lTotalNum));
else
    str.Format("测试文档总数:%d",m_lstTestCatalogList.GetDocNum());
CMessage::PrintInfo(str);
return true;
}
// Classify the test documents with the SVM: write each document's sparse
// feature vector to an SVM-format file, then run the SVM classifier on it.
// Returns the number of documents whose category could not be recognized.
long CClassifier::SVMClassify(CCatalogList &cataList)
{
    long lUnknown=0;
    FILE *stream;
    m_theSVM.com_param.classifyfile=m_paramClassifier.m_strResultDir+"\\test.dat";
    if((stream=fopen(m_theSVM.com_param.classifyfile,"w"))==NULL)
    {
        CMessage::PrintError("无法创建测试文档向量文件"+m_theSVM.com_param.classifyfile+"!");
        return 0;   // NOTE: caller cannot distinguish this failure from "0 unknown"
    }
    CTime startTime;
    CTimeSpan totalTime;
    CString str;
    int nCount=0;
    long lWordNum=m_lstTrainWordList.GetCount();
    startTime=CTime::GetCurrentTime();
    CMessage::PrintInfo(_T("正在生成测试文档的向量形式,请稍候..."));
    // Pass 1: scan every document and emit one line per document:
    // "<label> <featIdx>:<weight> ..." (1-based feature indices, zero weights skipped).
    POSITION pos_cata=cataList.GetFirstPosition();
    while(pos_cata!=NULL)
    {
        CCatalogNode& catalognode=cataList.GetNext(pos_cata);
        char *path=catalognode.m_strDirName.GetBuffer(0);
        POSITION pos_doc=catalognode.GetFirstPosition();
        while(pos_doc!=NULL)
        {
            CDocNode& docnode=catalognode.GetNext(pos_doc);
            if(m_paramClassifier.m_nLanguageType==CClassifierParam::nLT_Chinese)
                nCount=docnode.ScanChineseWithDict(path,m_lstTrainWordList);
            else
                nCount=docnode.ScanEnglishWithDict(path,m_lstTrainWordList,
                    m_paramClassifier.m_bStem);
            fprintf(stream,"%d",catalognode.m_idxCata+1);
            for(int i=0;i<lWordNum;i++)
            {
                if(docnode.m_pTemp[i].s_tfi!=0)
                    fprintf(stream," %d:%f",i+1,docnode.m_pTemp[i].s_dWeight);
            }
            fprintf(stream,"\n");
            CMessage::PrintStatusInfo(_T("扫描文档")+docnode.m_strDocName);
            if(nCount<0)    // document produced no recognizable words
            {
                str="无法识别文档";
                str+=catalognode.m_strDirName;
                str+="\\"+docnode.m_strDocName+"的类别!";
                lUnknown++;
                CMessage::PrintError(str);
            }
        }
        // Pair GetBuffer() with ReleaseBuffer() once the raw pointer is done
        // (the original never released the buffer).
        catalognode.m_strDirName.ReleaseBuffer();
    }
    CMessage::PrintStatusInfo(_T(""));
    fclose(stream);
    totalTime=CTime::GetCurrentTime()-startTime;
    CMessage::PrintInfo(_T("测试文档的向量生成结束,耗时")+totalTime.Format("%H:%M:%S"));
    startTime=CTime::GetCurrentTime();
    CMessage::PrintInfo(_T("正在使用SVM分类器对文档进行分类,请稍候..."));
    // Pass 2: allocate each document's per-category similarity array, which
    // SVMClassifyVectorFile fills in.
    pos_cata=cataList.GetFirstPosition();
    while(pos_cata!=NULL)
    {
        CCatalogNode& catalognode=cataList.GetNext(pos_cata);
        POSITION pos_doc=catalognode.GetFirstPosition();
        while(pos_doc!=NULL)
        {
            CDocNode& docnode=catalognode.GetNext(pos_doc);
            docnode.AllocResultsBuffer(m_nClassNum);
        }
    }
    SVMClassifyVectorFile(m_theSVM.com_param.classifyfile);
    CMessage::PrintStatusInfo(_T(""));
    totalTime=CTime::GetCurrentTime()-startTime;
    CMessage::PrintInfo(_T("SVM分类过程结束,耗时")+totalTime.Format("%H:%M:%S"));
    return lUnknown;
}
// Classify every document in cataList with the KNN method, computing each
// document's similarity to every category (via KNNCategory).
// Returns the number of documents whose category could not be recognized.
long CClassifier::KNNClassify(CCatalogList& cataList,int nCmpType)
{
    long lUnknown=0;   // (removed unused local docID from the original)
    CString str;
    POSITION pos_cata=cataList.GetFirstPosition();
    while(pos_cata!=NULL)
    {
        CCatalogNode& cataNode=cataList.GetNext(pos_cata);
        POSITION pos_doc=cataNode.GetFirstPosition();
        char *path=cataNode.m_strDirName.GetBuffer(0);
        while(pos_doc!=NULL)
        {
            CDocNode& docNode=cataNode.GetNext(pos_doc);
            short id=KNNCategory(path, docNode, true, nCmpType);
            if(id==-1)   // no category could be assigned
            {
                str="无法识别文档";
                str+=cataNode.m_strDirName;
                str+="\\"+docNode.m_strDocName+"的类别!";
                CMessage::PrintError(str);
                lUnknown++;
            }
            CMessage::PrintStatusInfo(_T("扫描文档")+docNode.m_strDocName);
        }
        // Pair GetBuffer() with ReleaseBuffer() (missing in the original).
        cataNode.m_strDirName.ReleaseBuffer();
    }
    return lUnknown;
}
// Compute the document's similarity to every category and return the ID of
// the most similar category, or -1 when classification is impossible.
// nCmpType selects the similarity formula.
short CClassifier::KNNCategory(char *pPath, CDocNode &docNode, bool bFile, int nCmpType)
{
    if(!KNNClassify(pPath,docNode,bFile,nCmpType))
        return -1;
    return SingleCategory(docNode);
}
// Classify a single document with KNN.
// When bFile is true, 'file' is the document's full file name (including its
// directory); otherwise 'file' is the document's raw content.
// Returns the best category ID, or -1 on failure.
short CClassifier::KNNCategory(char *file, bool bFile, int nCmpType)
{
    CDocNode docNode;
    short id=-1;
    if(bFile)
    {
        // split "dir\\name" into the directory part and the bare file name
        char *fname=strrchr(file,'\\');
        if(fname==NULL) return -1;
        docNode.m_strDocName=(fname+1);
        size_t dirLen=(size_t)(fname-file);
        // the original strncpy'd an unchecked length into path[MAX_PATH]
        if(dirLen>=MAX_PATH) return -1;
        char path[MAX_PATH];
        strncpy(path,file,dirLen);
        path[dirLen]=0;
        id=KNNCategory(path,docNode,bFile,nCmpType);
    }
    else
        id=KNNCategory(file,docNode,bFile,nCmpType);
    return id;
}
// Build the vector form of docNode and, via ComputeSimRatio, compute its
// similarity to every category.
// bFile==true  : pPath is the document's path on disk.
// bFile==false : pPath is the raw text of the document to classify.
// nCmpType selects the similarity formula.
// Returns false when the document yields no known words or the training set
// is empty (no similarities are computed in that case).
bool CClassifier::KNNClassify(char *pPath, CDocNode &docNode, bool bFile, int nCmpType)
{
    const bool bChinese=
        (m_paramClassifier.m_nLanguageType==CClassifierParam::nLT_Chinese);
    int nWords;
    if(bFile)
        nWords=bChinese
            ? docNode.ScanChineseWithDict(pPath,m_lstTrainWordList)
            : docNode.ScanEnglishWithDict(pPath,m_lstTrainWordList,
                  m_paramClassifier.m_bStem);
    else
        nWords=bChinese
            ? docNode.ScanChineseStringWithDict(pPath,m_lstTrainWordList)
            : docNode.ScanEnglishStringWithDict(pPath,m_lstTrainWordList,
                  m_paramClassifier.m_bStem);
    // need both a non-empty training set and at least one recognized word
    if(m_lDocNum<=0||nWords<=0)
        return false;
    ComputeSimRatio(docNode,nCmpType);
    return true;
}
// Collect every category whose similarity to docNode exceeds dThreshold into
// aryResult (category indices). If no category exceeds the threshold, return
// only the single most similar category.
// Returns false when the document has no similarity buffer.
bool CClassifier::MultiCategory(CDocNode &docNode, CArray<short,short>& aryResult, double dThreshold)
{
    double *pSimRatio=docNode.m_pResults;
    if(pSimRatio==NULL) return false;
    double dMax=pSimRatio[0];
    short nMax=0;
    bool bFound=false;
    aryResult.RemoveAll();
    // BUGFIX: the original loop started at i=1, so category 0 could never be
    // selected by the threshold test even when pSimRatio[0]>dThreshold,
    // contradicting the documented contract. Start at 0; the max tracking is
    // unaffected (pSimRatio[0]>dMax is false for i==0).
    for(short i=0;i<m_nClassNum;i++)
    {
        if(pSimRatio[i]>dMax)
        {
            dMax=pSimRatio[i];
            nMax=i;
        }
        if(pSimRatio[i]>dThreshold)
        {
            aryResult.Add(i);
            bFound=true;
        }
    }
    if(!bFound) aryResult.Add(nMax);   // fall back to the best single category
    return true;
}
//Compute the similarity between docNode and every category via a
//k-nearest-neighbour vote over the training documents.
//nCmpType selects how per-category similarity is accumulated:
//  <=0 : share of the k nearest neighbours belonging to the category
//  ==1 : sum of the k nearest neighbours' similarity weights per category
void CClassifier::ComputeSimRatio(CDocNode &docNode,int nCmpType)
{
//compute the similarity between the document and every training document
//NOTE(review): ComputeSimilarityRatio() takes no argument, so it presumably
//works off state left by an earlier scan of docNode -- confirm in CDocNode
int i;
long k;
for(i=0;i<m_lDocNum;i++)
{
m_pSimilarityRatio[i].dWeight=m_pDocs[i].pDocNode->ComputeSimilarityRatio();
m_pSimilarityRatio[i].lDocID=i;
}
//sort the similarities to the training documents in descending order
//NOTE(review): Sort() is passed m_lDocNum-1, which looks like an inclusive
//upper index -- verify Sort's contract (off-by-one risk if it expects a count)
Sort(m_pSimilarityRatio,m_lDocNum-1);
docNode.AllocResultsBuffer(m_nClassNum);
double *pSimRatio=docNode.m_pResults;
for(i=0;i<m_nClassNum;i++) pSimRatio[i]=0;
if(nCmpType<=0)
{
//count how many of the document's k nearest neighbours fall in each category
//NOTE(review): assumes m_nKNN <= m_lDocNum, otherwise this reads past the end
//of m_pSimilarityRatio -- confirm callers guarantee it
for(i=0;i<m_paramClassifier.m_nKNN;i++)
{
k=m_pSimilarityRatio[i].lDocID;
k=m_pDocs[k].nCataID;
pSimRatio[k]+=1;
}
//similarity to a category = (neighbours in that category) / k
for(i=0;i<m_nClassNum;i++)
pSimRatio[i]/=m_paramClassifier.m_nKNN;
}
else if(nCmpType==1)
{
//weighted vote: add each neighbour's similarity weight to its category total
for(i=0;i<m_paramClassifier.m_nKNN;i++)
{
k=m_pSimilarityRatio[i].lDocID;
k=m_pDocs[k].nCataID;
pSimRatio[k]+=m_pSimilarityRatio[i].dWeight;
}
}
}
bool CClassifier::SVMClassify(char *pPath, CDocNode &docNode, bool bFile)
{
int nCount=0;
if(bFile)
{
if(m_paramClassifier.m_nLanguageType==CClassifierParam::nLT_Chinese)
nCount=docNode.ScanChineseWithDict(pPath,m_lstTrainWordList);
else
nCount=docNode.ScanEnglishWithDict(pPath,m_lstTrainWordList,
m_paramClassifier.m_bStem);
}
else
{
if(m_paramClassifier.m_nLanguageType==CClassifierParam::nLT_Chinese)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -