📄 classifier.cpp
字号:
CTime startTime;
CTimeSpan totalTime;
startTime=CTime::GetCurrentTime();
CMessage::PrintInfo(_T("正在对测试文档进行分类,请稍候..."));
// fname enlarged from char[10]: document names longer than 9 characters used
// to smash the stack via strcpy; all copies into fixed buffers are bounded now.
char fname[1024],type[1024],line[4096],content[100*1024];
// flag==1: the next data line holds the document's category
// flag==2: the next data line holds the document's title
// flag==3: the next data line holds the document's body text
int flag=0,nCount,len,i;
long lUnknown=0,lDocNum=0;
CStringArray typeArray;            // expected categories, kept for evaluation
bool bTitle=false;                 // true once the title line has been read
int nWordNum=m_lstTrainWordList.GetCount();
while(!feof(stream1))
{
    if(fgets(line,4096,stream1)==NULL) continue;
    if(line[0]=='.')               // ".X" control line introduces the next field
    {
        if(flag==3)                // body of previous document complete: classify it
        {
            CDocNode doc;
            posTemp=cataTemp.AddDoc(doc);
            CDocNode& docnode=cataTemp.GetAt(posTemp);
            docnode.m_strDocName=fname;
            if(m_paramClassifier.m_nClassifierType==CClassifierParam::nCT_KNN)
            {
                nCount=KNNCategory(content,docnode,false);
            }
            else
            {
                // SVM path: build the document's sparse vector and append it
                // to the SVM-format input file (stream2, opened by the caller).
                if(m_paramClassifier.m_nLanguageType==CClassifierParam::nLT_Chinese)
                    nCount=docnode.ScanChineseStringWithDict(content,m_lstTrainWordList)-1;
                else
                    nCount=docnode.ScanEnglishStringWithDict(content,m_lstTrainWordList,
                        m_paramClassifier.m_bStem)-1;
                fprintf(stream2,"%d",1);   // placeholder label; true class unknown here
                for(i=0;i<nWordNum;i++)
                {
                    if(docnode.m_pTemp[i].s_tfi!=0)
                        fprintf(stream2," %d:%f",i+1,docnode.m_pTemp[i].s_dWeight);
                }
                fprintf(stream2,"\n");
            }
            if(m_paramClassifier.m_bEvaluation) typeArray.Add(type);
            if(nCount<0)
            {
                CMessage::PrintError("无法识别文档"+docnode.m_strDocName+"的类别!");
                lUnknown++;
            }
            CMessage::PrintStatusInfo(_T("扫描文档")+docnode.m_strDocName);
            // reset per-document state for the next record
            flag=0;
            bTitle=false;
            fname[0]=0;
            type[0]=0;
            content[0]=0;
            lDocNum++;
        }
        switch(line[1])
        {
        case 'I':   // ".I <name>" -- document identifier
            // bounded copy (original strcpy'd into a 10-byte buffer)
            strncpy(fname,line+3,sizeof(fname)-1);
            fname[sizeof(fname)-1]='\0';
            len=strlen(fname);
            // Strip the trailing line terminator. The original used '=' instead
            // of '==' here, which always truncated the last character -- that
            // accidentally removed the '\n' fgets keeps; strip it explicitly.
            if(len>0&&fname[len-1]=='\n') fname[--len]='\0';
            if(len>0&&fname[len-1]=='\r') fname[--len]='\0';
            break;
        case 'C':   // ".C" -- category follows
            flag=1;
            break;
        case 'T':   // ".T" -- title follows
            flag=2;
            break;
        case 'W':   // ".W" -- body text follows
            flag=3;
            break;
        }
    }
    else    // data line: route it according to the current field flag
    {
        switch(flag)
        {
        case 1:
            // bounded copy: line can be up to 4095 bytes, type is 1024
            strncpy(type,line,sizeof(type)-1);
            type[sizeof(type)-1]='\0';
            break;
        case 2: // title: first title line starts content, later ones append
            if(!bTitle)
            {
                strcpy(content,line);
                bTitle=true;
            }
            else
                strcat(content,line);
            break;
        case 3: // body text: append
            // NOTE(review): content is fixed at 100 KiB; a longer document
            // still overflows here -- needs a bounded append or CString.
            strcat(content,line);
            break;
        }
    }
}
// NOTE(review): if the input does not end with a final '.' control line, the
// last document's body is never flushed/classified -- confirm the file format
// guarantees a trailing control line.
fclose(stream1);
CMessage::PrintStatusInfo("");
if(m_paramClassifier.m_nClassifierType==CClassifierParam::nCT_SVM)
{
    fclose(stream2);
    startTime=CTime::GetCurrentTime();
    CMessage::PrintInfo(_T("正在使用SVM分类器对文档进行分类,请稍候..."));
    m_theSVM.com_param.classifyfile=m_paramClassifier.m_strResultDir+"\\test.dat";
    SVMClassifyVectorFile(m_theSVM.com_param.classifyfile);
    totalTime=CTime::GetCurrentTime()-startTime;
    CMessage::PrintInfo(_T("SVM分类过程结束,耗时")+totalTime.Format("%H:%M:%S"));
}
long lCorrect=SaveResults(m_lstTestCatalogList,m_paramClassifier.m_strResultDir+"\\results.txt",&typeArray);
long lTotalNum=m_lstTestCatalogList.GetDocNum()-lUnknown;
CString str;
totalTime=CTime::GetCurrentTime()-startTime;
CMessage::PrintInfo(_T("测试文档分类结束,耗时")+totalTime.Format("%H:%M:%S"));
if (lUnknown>0)
{
    str.Format("无法分类的文档数%d:",lUnknown);
    CMessage::PrintInfo(str);
}
if(m_paramClassifier.m_bEvaluation&&lTotalNum>0&&lCorrect>0)
    str.Format("测试文档总数:%d,准确率:%f",m_lstTestCatalogList.GetDocNum(),(float)(lCorrect)/(float)(lTotalNum));
else
    str.Format("测试文档总数:%d",m_lstTestCatalogList.GetDocNum());
CMessage::PrintInfo(str);
return true;
}
// Classify the test documents with the SVM: write each document's sparse
// feature vector to an SVM-format file, then run the SVM classifier on it.
// Returns the number of documents whose category could not be recognized.
long CClassifier::SVMClassify(CCatalogList &cataList)
{
    long lUnknown=0;
    FILE *stream;
    m_theSVM.com_param.classifyfile=m_paramClassifier.m_strResultDir+"\\test.dat";
    if((stream=fopen(m_theSVM.com_param.classifyfile,"w"))==NULL)
    {
        CMessage::PrintError("无法创建测试文档向量文件"+m_theSVM.com_param.classifyfile+"!");
        return 0;   // NOTE: caller cannot distinguish this failure from "0 unknown"
    }
    CTime startTime;
    CTimeSpan totalTime;
    CString str;
    int nCount=0;
    long lWordNum=m_lstTrainWordList.GetCount();
    startTime=CTime::GetCurrentTime();
    CMessage::PrintInfo(_T("正在生成测试文档的向量形式,请稍候..."));
    // Pass 1: scan every document and emit one line per document:
    // "<label> <featIdx>:<weight> ..." (1-based feature indices, zero weights skipped).
    POSITION pos_cata=cataList.GetFirstPosition();
    while(pos_cata!=NULL)
    {
        CCatalogNode& catalognode=cataList.GetNext(pos_cata);
        char *path=catalognode.m_strDirName.GetBuffer(0);
        POSITION pos_doc=catalognode.GetFirstPosition();
        while(pos_doc!=NULL)
        {
            CDocNode& docnode=catalognode.GetNext(pos_doc);
            if(m_paramClassifier.m_nLanguageType==CClassifierParam::nLT_Chinese)
                nCount=docnode.ScanChineseWithDict(path,m_lstTrainWordList);
            else
                nCount=docnode.ScanEnglishWithDict(path,m_lstTrainWordList,
                    m_paramClassifier.m_bStem);
            fprintf(stream,"%d",catalognode.m_idxCata+1);
            for(int i=0;i<lWordNum;i++)
            {
                if(docnode.m_pTemp[i].s_tfi!=0)
                    fprintf(stream," %d:%f",i+1,docnode.m_pTemp[i].s_dWeight);
            }
            fprintf(stream,"\n");
            CMessage::PrintStatusInfo(_T("扫描文档")+docnode.m_strDocName);
            if(nCount<0)    // document produced no recognizable words
            {
                str="无法识别文档";
                str+=catalognode.m_strDirName;
                str+="\\"+docnode.m_strDocName+"的类别!";
                lUnknown++;
                CMessage::PrintError(str);
            }
        }
        // Pair GetBuffer() with ReleaseBuffer() once the raw pointer is done
        // (the original never released the buffer).
        catalognode.m_strDirName.ReleaseBuffer();
    }
    CMessage::PrintStatusInfo(_T(""));
    fclose(stream);
    totalTime=CTime::GetCurrentTime()-startTime;
    CMessage::PrintInfo(_T("测试文档的向量生成结束,耗时")+totalTime.Format("%H:%M:%S"));
    startTime=CTime::GetCurrentTime();
    CMessage::PrintInfo(_T("正在使用SVM分类器对文档进行分类,请稍候..."));
    // Pass 2: allocate each document's per-category similarity array, which
    // SVMClassifyVectorFile fills in.
    pos_cata=cataList.GetFirstPosition();
    while(pos_cata!=NULL)
    {
        CCatalogNode& catalognode=cataList.GetNext(pos_cata);
        POSITION pos_doc=catalognode.GetFirstPosition();
        while(pos_doc!=NULL)
        {
            CDocNode& docnode=catalognode.GetNext(pos_doc);
            docnode.AllocResultsBuffer(m_nClassNum);
        }
    }
    SVMClassifyVectorFile(m_theSVM.com_param.classifyfile);
    CMessage::PrintStatusInfo(_T(""));
    totalTime=CTime::GetCurrentTime()-startTime;
    CMessage::PrintInfo(_T("SVM分类过程结束,耗时")+totalTime.Format("%H:%M:%S"));
    return lUnknown;
}
// Classify every document in cataList with the KNN method, computing each
// document's similarity to every category (via KNNCategory).
// Returns the number of documents whose category could not be recognized.
long CClassifier::KNNClassify(CCatalogList& cataList,int nCmpType)
{
    long lUnknown=0;   // (removed unused local docID from the original)
    CString str;
    POSITION pos_cata=cataList.GetFirstPosition();
    while(pos_cata!=NULL)
    {
        CCatalogNode& cataNode=cataList.GetNext(pos_cata);
        POSITION pos_doc=cataNode.GetFirstPosition();
        char *path=cataNode.m_strDirName.GetBuffer(0);
        while(pos_doc!=NULL)
        {
            CDocNode& docNode=cataNode.GetNext(pos_doc);
            short id=KNNCategory(path, docNode, true, nCmpType);
            if(id==-1)   // no category could be assigned
            {
                str="无法识别文档";
                str+=cataNode.m_strDirName;
                str+="\\"+docNode.m_strDocName+"的类别!";
                CMessage::PrintError(str);
                lUnknown++;
            }
            CMessage::PrintStatusInfo(_T("扫描文档")+docNode.m_strDocName);
        }
        // Pair GetBuffer() with ReleaseBuffer() (missing in the original).
        cataNode.m_strDirName.ReleaseBuffer();
    }
    return lUnknown;
}
// Compute the document's similarity to every category and return the ID of
// the most similar category, or -1 when classification is impossible.
// nCmpType selects the similarity formula.
short CClassifier::KNNCategory(char *pPath, CDocNode &docNode, bool bFile, int nCmpType)
{
    if(!KNNClassify(pPath,docNode,bFile,nCmpType))
        return -1;
    return SingleCategory(docNode);
}
// Classify a single document with KNN.
// When bFile is true, 'file' is the document's full file name (including its
// directory); otherwise 'file' is the document's raw content.
// Returns the best category ID, or -1 on failure.
short CClassifier::KNNCategory(char *file, bool bFile, int nCmpType)
{
    CDocNode docNode;
    short id=-1;
    if(bFile)
    {
        // split "dir\\name" into the directory part and the bare file name
        char *fname=strrchr(file,'\\');
        if(fname==NULL) return -1;
        docNode.m_strDocName=(fname+1);
        size_t dirLen=(size_t)(fname-file);
        // the original strncpy'd an unchecked length into path[MAX_PATH]
        if(dirLen>=MAX_PATH) return -1;
        char path[MAX_PATH];
        strncpy(path,file,dirLen);
        path[dirLen]=0;
        id=KNNCategory(path,docNode,bFile,nCmpType);
    }
    else
        id=KNNCategory(file,docNode,bFile,nCmpType);
    return id;
}
// Build the vector form of docNode and, via ComputeSimRatio, compute its
// similarity to every category.
// bFile==true  : pPath is the document's path on disk.
// bFile==false : pPath is the raw text of the document to classify.
// nCmpType selects the similarity formula.
// Returns false when the document yields no known words or the training set
// is empty (no similarities are computed in that case).
bool CClassifier::KNNClassify(char *pPath, CDocNode &docNode, bool bFile, int nCmpType)
{
    const bool bChinese=
        (m_paramClassifier.m_nLanguageType==CClassifierParam::nLT_Chinese);
    int nWords;
    if(bFile)
        nWords=bChinese
            ? docNode.ScanChineseWithDict(pPath,m_lstTrainWordList)
            : docNode.ScanEnglishWithDict(pPath,m_lstTrainWordList,
                  m_paramClassifier.m_bStem);
    else
        nWords=bChinese
            ? docNode.ScanChineseStringWithDict(pPath,m_lstTrainWordList)
            : docNode.ScanEnglishStringWithDict(pPath,m_lstTrainWordList,
                  m_paramClassifier.m_bStem);
    // need both a non-empty training set and at least one recognized word
    if(m_lDocNum<=0||nWords<=0)
        return false;
    ComputeSimRatio(docNode,nCmpType);
    return true;
}
// Collect every category whose similarity to docNode exceeds dThreshold into
// aryResult (category indices). If no category exceeds the threshold, return
// only the single most similar category.
// Returns false when the document has no similarity buffer.
bool CClassifier::MultiCategory(CDocNode &docNode, CArray<short,short>& aryResult, double dThreshold)
{
    double *pSimRatio=docNode.m_pResults;
    if(pSimRatio==NULL) return false;
    double dMax=pSimRatio[0];
    short nMax=0;
    bool bFound=false;
    aryResult.RemoveAll();
    // BUGFIX: the original loop started at i=1, so category 0 could never be
    // selected by the threshold test even when pSimRatio[0]>dThreshold,
    // contradicting the documented contract. Start at 0; the max tracking is
    // unaffected (pSimRatio[0]>dMax is false for i==0).
    for(short i=0;i<m_nClassNum;i++)
    {
        if(pSimRatio[i]>dMax)
        {
            dMax=pSimRatio[i];
            nMax=i;
        }
        if(pSimRatio[i]>dThreshold)
        {
            aryResult.Add(i);
            bFound=true;
        }
    }
    if(!bFound) aryResult.Add(nMax);   // fall back to the best single category
    return true;
}
//Compute the similarity between docNode and every category via a
//k-nearest-neighbour vote over the training documents.
//nCmpType selects how per-category similarity is accumulated:
//  <=0 : share of the k nearest neighbours belonging to the category
//  ==1 : sum of the k nearest neighbours' similarity weights per category
void CClassifier::ComputeSimRatio(CDocNode &docNode,int nCmpType)
{
//compute the similarity between the document and every training document
//NOTE(review): ComputeSimilarityRatio() takes no argument, so it presumably
//works off state left by an earlier scan of docNode -- confirm in CDocNode
int i;
long k;
for(i=0;i<m_lDocNum;i++)
{
m_pSimilarityRatio[i].dWeight=m_pDocs[i].pDocNode->ComputeSimilarityRatio();
m_pSimilarityRatio[i].lDocID=i;
}
//sort the similarities to the training documents in descending order
//NOTE(review): Sort() is passed m_lDocNum-1, which looks like an inclusive
//upper index -- verify Sort's contract (off-by-one risk if it expects a count)
Sort(m_pSimilarityRatio,m_lDocNum-1);
docNode.AllocResultsBuffer(m_nClassNum);
double *pSimRatio=docNode.m_pResults;
for(i=0;i<m_nClassNum;i++) pSimRatio[i]=0;
if(nCmpType<=0)
{
//count how many of the document's k nearest neighbours fall in each category
//NOTE(review): assumes m_nKNN <= m_lDocNum, otherwise this reads past the end
//of m_pSimilarityRatio -- confirm callers guarantee it
for(i=0;i<m_paramClassifier.m_nKNN;i++)
{
k=m_pSimilarityRatio[i].lDocID;
k=m_pDocs[k].nCataID;
pSimRatio[k]+=1;
}
//similarity to a category = (neighbours in that category) / k
for(i=0;i<m_nClassNum;i++)
pSimRatio[i]/=m_paramClassifier.m_nKNN;
}
else if(nCmpType==1)
{
//weighted vote: add each neighbour's similarity weight to its category total
for(i=0;i<m_paramClassifier.m_nKNN;i++)
{
k=m_pSimilarityRatio[i].lDocID;
k=m_pDocs[k].nCataID;
pSimRatio[k]+=m_pSimilarityRatio[i].dWeight;
}
}
}
bool CClassifier::SVMClassify(char *pPath, CDocNode &docNode, bool bFile)
{
int nCount=0;
if(bFile)
{
if(m_paramClassifier.m_nLanguageType==CClassifierParam::nLT_Chinese)
nCount=docNode.ScanChineseWithDict(pPath,m_lstTrainWordList);
else
nCount=docNode.ScanEnglishWithDict(pPath,m_lstTrainWordList,
m_paramClassifier.m_bStem);
}
else
{
if(m_paramClassifier.m_nLanguageType==CClassifierParam::nLT_Chinese)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -