⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 textclassify.cpp

📁 用来文本分类的
💻 CPP
字号:
// TextClassify.cpp: implementation of the TextClassify class.
//CCM part--created by leiyun
//fenci part--created by xuran
//////////////////////////////////////////////////////////////////////

#include "TextClassify.h"
#include "stdio.h"
#include "string.h"
#include "math.h"
#define DICLENGTH 30083	//词典的长度
#include "fenci_main.h"

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Read one line from stream (at most n-1 chars) and reduce it to its first
// whitespace-delimited token, stored back into string.
// Returns string on success, NULL on read failure / EOF (the original
// returned stale buffer contents in that case, which made feof()-style
// caller loops reprocess the last token).
// A line containing only whitespace yields an empty string.
char * myfgets(char *string, int n, FILE *stream)
{
	if (fgets(string, n, stream) == NULL)
		return NULL;
	// Trim in place: skip leading whitespace, keep the first token.
	// (The original copied through a fixed char[20], overflowing on
	// tokens longer than 19 characters.)
	size_t start = strspn(string, " \t\r\n");
	size_t len = strcspn(string + start, " \t\r\n");
	memmove(string, string + start, len);
	string[len] = '\0';
	return string;
}

// Construct the classifier: build the dictionary index from `file` and load
// the stop-word list from `strStopWordFile` (one word per line, stored in
// fixed 10-byte slots).
TextClassify::TextClassify(char *file,char *strStopWordFile)
{
	wlisti.WListNameConstruct(file);
	wlisti.MakeLMNameIndex();

	// Keep the destructor safe even when the stop-word file is missing
	// (the original left listStopWord uninitialized on this path, making
	// the later delete[] undefined behavior).
	listStopWord=NULL;
	nNumOfStopWord=0;

	//stop using word
	FILE *fp;
	if((fp=fopen(strStopWordFile,"r"))==NULL)
	{
		printf("the stop using word file can not be opened");
		return;
	}

	// First pass: count the entries.  Testing the fgets return value avoids
	// the classic while(!feof) overcount — the original counted one extra
	// (empty) stop word.  Both passes use the same 10-byte chunking, so long
	// lines are split identically in each pass.
	char temp[10];
	int i=0;
	while(fgets(temp,10,fp)!=NULL)
		i++;
	nNumOfStopWord=i;

	// Second pass: repositions the file pointer to the beginning and read
	// the words for real.
	rewind(fp);
	listStopWord=new char[nNumOfStopWord][10];
	for(i=0;i<nNumOfStopWord;i++)
		myfgets(listStopWord[i],10,fp);
	fclose(fp);
}

// Release the stop-word table allocated in the constructor.
// NOTE(review): if the stop-word file failed to open, the constructor
// returns early without assigning listStopWord — confirm it is initialized
// (e.g. to NULL) before this delete[] runs.
TextClassify::~TextClassify()
{
	delete[] listStopWord;

}

// Build the class tree from resource//InitialTree.txt.
// File format: one token per line; "<sub>" descends (the next token becomes
// the first child of the current node), "</sub>" ascends, any other token is
// inserted as the next sibling.
// Returns 1 on success, 0 when the file cannot be opened or is empty.
bool TextClassify::InitialTree()
{
	//open file 
	FILE * fp;
	if((fp=fopen("resource//InitialTree.txt","r+b"))==NULL)
	{
		printf("The Tree.txt File can not be opened.\n");
		return 0;
	}

//Start to Initial the ClassTree;
	char strLine[64];
	char strTemp[20];

	//build root of the tree — the first token in the file
	if(fgets(strLine,64,fp)==NULL || sscanf(strLine,"%19s",strTemp)!=1)
	{
		fclose(fp);
		return 0;
	}
	tree.BuildRoot(strTemp);

	//insert children.  Testing the fgets return value (instead of feof())
	//prevents the stale last token from being processed twice at EOF.
	while(fgets(strLine,64,fp)!=NULL)
	{
		if(sscanf(strLine,"%19s",strTemp)!=1)
			continue;	// skip blank lines
		if(!strcmp(strTemp,"<sub>"))
		{
			if(fgets(strLine,64,fp)==NULL || sscanf(strLine,"%19s",strTemp)!=1)
				break;
			tree.InsertChild(NULL,strTemp);
			tree.FirstChild();
		}
		else if(strcmp(strTemp,"</sub>")==0)
			tree.Parent();
		else
		{
			tree.InsertSibling(NULL,strTemp);
			tree.NextSibling();
		}
	}
	fclose(fp);
//	tree.Root();
//	tree.PostOrder();
	return 1;
}
// Reload a previously saved class tree from strUpdatePath.
// Returns 0 when the file cannot be opened, 1 on success.
bool TextClassify::UpdateTree(char *strUpdatePath)
{
	FILE *fp = fopen(strUpdatePath, "r+b");
	if (fp == NULL)
	{
		printf("Update:The SaveTree.txt File can not be opened.\n");
		return 0;
	}
	// Delegate the actual node-by-node restore to the tree itself.
	tree.UpdateTree(fp);
	fclose(fp);
	return 1;
}
// Save the class tree nodes in pre-order to SaveTreePath.
// Returns 1 on success, 0 when the file cannot be opened.
// (The original called exit(0) here — terminating the whole process with a
// *success* status on an I/O error; returning 0 matches the error handling
// of InitialTree()/UpdateTree() and lets the caller recover.)
bool TextClassify::SaveTree(char * SaveTreePath)
{//save the tree node with preOrder
		//open file 
	FILE * fp;
	if((fp=fopen(SaveTreePath,"w+b"))==NULL)
	{
		printf("Save:The SaveTree.txt File can not be opened.\n");
		return 0;
	}
	tree.SaveTree(fp);
	fclose(fp);
	return 1;
	
}
// Train the whole class tree from the documents under strTrainDirectory.
// WordFreq / Weight / Central are the pruning thresholds used by CCM when
// selecting feature words for each node.  Always returns true.
bool TextClassify::TrainTree(CString strTrainDirectory,int WordFreq,float Weight,float Central)
{
	// Record the thresholds and the corpus location for the visitors.
	WordFreqThreshold = WordFreq;
	WeightThreshold   = Weight;
	CentralThreshold  = Central;
	strTrainPath      = strTrainDirectory;

	// Train bottom-up: children are processed before their parents.
	tree.Root();
	PostOrderTrain();
	return true;
}
// Classify the document strFileName: walk the trained tree top-down and at
// each level descend into the child whose word vector best matches the
// document's term frequencies.  strClassifyResult receives the path of kind
// names joined by "--->".  Returns false only for a NULL file name.
bool TextClassify::Classify(char * strFileName,CString& strClassifyResult)
{ 
	if(strFileName==NULL)
		return false;

	tree.Root();
	int pTFArray[DICLENGTH];
	tfStat(strFileName,pTFArray);	// term frequencies of the document
	strClassifyResult=tree.root->KindName;

	//classify along the tree with up-down order
	while(tree.current->pFirstChild!=NULL)
	{
		// Collect the word vectors of every child of the current node.
		int nNumOfKind=tree.GetChildNum();
		WORDVECTOR **pWordVectorArray=new WORDVECTOR *[nNumOfKind];
		tree.FirstChild();
		for(int i=0;i<nNumOfKind;i++)
		{
			pWordVectorArray[i]=&tree.current->WordVector;
			tree.NextSibling();
		}
		tree.Parent();

		// Pick the best-matching child.
		int tempResult=ClassifyKind(pTFArray,nNumOfKind,pWordVectorArray);
		delete[] pWordVectorArray;

		// Descend into that child.  Each loop index is declared in its own
		// for-statement; the original reused `i` across loops via the
		// legacy (non-standard) MSVC for-scope extension.
		tree.FirstChild();
		for(int i=0;i<tempResult;i++)
			tree.NextSibling();

		//record the classify result
		strClassifyResult=strClassifyResult+"--->"+tree.current->KindName;
	}
	return true;
}

//private function in train part
// Recursively train the subtree rooted at tree.current in post-order:
// every child subtree is trained first (the recursive call), then
// VisitNode() processes the node itself, so a parent can aggregate its
// children's statistics.  Note that the tree cursor (tree.current) is the
// implicit traversal state — it must be saved and restored around the
// child loop.
void TextClassify::PostOrderTrain()
{
	if(tree.current!=NULL)
	{
		ClassTreeNode *p=tree.current;	// remember this node; recursion moves the cursor
		bool IsSuccess=tree.FirstChild();
		while(IsSuccess)
		{
			PostOrderTrain();
			IsSuccess=tree.NextSibling();
		}
		tree.current=p;	// restore the cursor before visiting this node
		VisitNode();
	}
}
// Train the single node at tree.current (invoked by PostOrderTrain, so all
// children have already been visited).
//  - Leaf node: sum the term-frequency statistics of every *.txt file in the
//    matching training sub-directory into a new pTFArray on the node.
//  - Inner node: run CCM over the children's term-frequency arrays to build
//    their word vectors, aggregate the children's counts into this node's
//    pTFArray (unless it is the root), then free the children's arrays.
// NOTE(review): loops written as `for(i=...)` without a declaration reuse
// the index of an earlier loop — this relies on the legacy MSVC for-scope
// extension and will not compile under standard C++.
void TextClassify::VisitNode()
{
	printf("%s\n",tree.current->KindName);
	//if the current node is leaf node
	if(tree.current->pFirstChild==NULL)
	{
		//***stat. the word frequency of the document under this folder
		//***and sum
		//allocate memory 
		int *pTFArray=new int[DICLENGTH];
		for(int i=0;i<DICLENGTH;i++)
			pTFArray[i]=0;
		int tempArray[DICLENGTH];

		//Get the  current directory
		// Build "<train root>\<KindName path from root to this node>" by
		// walking up with tree.Parent(), then restore the cursor.
		CString strDirectory=strTrainPath;
		strDirectory+="\\";
		CString strPath=tree.current->KindName;
		CString strTemp;
		ClassTreeNode *current=tree.current;
		while(tree.Parent())
		{
			strTemp=tree.current->KindName;
			strPath=strTemp+"\\"+strPath;
		}
		strDirectory+=strPath;
		tree.current=current;

		//find the file under this directory and sta.
		CString strFileName;
		strDirectory+="\\*.txt";
		CFileFind finder;
		BOOL bWorking = finder.FindFile(strDirectory.GetBuffer(0));
		while (bWorking)
		{
			bWorking = finder.FindNextFile();
			strFileName=finder.GetFilePath();
			//stat. this file and output the result ----xuran
			tfStat(strFileName.GetBuffer(0),tempArray);
			// NOTE(review): printf("%s") on a CString depends on CString's
			// in-memory layout — confirm this prints the intended path.
			printf("%s\n",strFileName);
			for(i=0;i<DICLENGTH;i++)
				pTFArray[i]=pTFArray[i]+tempArray[i];
		}
		//link the result to current node in the tree
		// Ownership: this array is released later by the parent's visit
		// (see the "free the tfstat. memory" loop in the else-branch).
		tree.current->pTFArray=pTFArray;
	}
	else//the current node is not a leaf node ,first calculate CCM of the children ,then tfStat 
	{		
		//get the children number of the current node 
		int nNumOfKind=tree.GetChildNum();

		//allocate 2 dimension array pTF2Array
		//allocate  list array pWordVectorArray    and link to the tree
		int **pTF2Array=new int *[nNumOfKind];
		WORDVECTOR **pWordVectorArray=new WORDVECTOR *[nNumOfKind];
		tree.FirstChild();
		for(int i=0;i<nNumOfKind;i++)
		{
			pTF2Array[i]=tree.current->pTFArray;
			pWordVectorArray[i]=&tree.current->WordVector;
			tree.NextSibling();
		}
		tree.Parent();
		
		//train the tree
		CCM(pTF2Array,nNumOfKind,pWordVectorArray);     //leiyun
		//update the children's member variable :nDimOfVector
		tree.FirstChild();
		for(i=0;i<nNumOfKind;i++)
		{
			tree.current->nDimOfVector=tree.current->WordVector.GetCount();
			tree.NextSibling();
		}
		tree.Parent();

		//get tfStat. of current node with sum its children if the current node is not a root
		if(tree.current!=tree.root)
		{
			tree.current->pTFArray=new int[DICLENGTH];
			for(i=0;i<DICLENGTH;i++)
				tree.current->pTFArray[i]=0;
			for(i=0;i<nNumOfKind;i++)
				for(int j=0;j<DICLENGTH;j++)
					tree.current->pTFArray[j]+=pTF2Array[i][j];
		}
		
		//free the tfstat. memory of the children
		// (these are the arrays the children attached in their own visits)
		for(i=0;i<nNumOfKind;i++)
			delete[] pTF2Array[i];
		delete[] pTF2Array;

		//free the pWordVectorArray
		delete[] pWordVectorArray;

	}

}
// Term-frequency statistics for one document: fills pTFArray (DICLENGTH
// entries, one per dictionary word) with the word counts of fileName.
// Word segmentation and stop-word filtering are delegated to tfcout from
// the fenci (segmentation) module.
void TextClassify::tfStat(char *fileName,int *pTFArray)
{//create by xuren
    tfcout(fileName,pTFArray,wlisti,listStopWord,nNumOfStopWord);
}
void TextClassify::CCM(int ** pkinddoct,int nNumOfKind, WORDVECTOR **pwordlist)
{//createb by leiyun
	 double Central[DICLENGTH];
	int Ndoctotal=nNumOfKind;		//Ndoctotal是文章类的数量(最大值)
	int sumwordfreq=0;
//	long int wordfreq[nNumOfKind];
	int *wordfreq=new int[nNumOfKind];
	int *pwordfreq;
	int j=0;
	double *Pd=new double[nNumOfKind];
	double *ppd;
	double (*Pdw)[DICLENGTH]=new double[nNumOfKind][DICLENGTH];
//	double **ppdw;
	int sumword[DICLENGTH];	//一个词出现的总数
	int (*num)[DICLENGTH]=new int [nNumOfKind][DICLENGTH];	//一个词在一个文档里出现的数目
	int *pnum;
	int m=0,i=0;
	
//	memset(Central,0,DICLENGTH);
	for(i=0;i<nNumOfKind;i++)
	{
		pwordfreq=wordfreq+i;
		*pwordfreq=0;
		ppd=Pd+i;
		*ppd=0;
	}
//	memset(wordfreq,0,nNumOfKind);
	for(i=0;i<DICLENGTH;i++)
	{
		*(sumword+i)=0;
		*(Central+i)=0;
	}
//	memset(sumword,0,DICLENGTH);
//	memset(Pd,0,nNumOfKind);
	for(i=0;i<nNumOfKind;i++)
	{
		for(j=0;j<DICLENGTH;j++)
		{
			*(*(Pdw+i)+j)=0;
			*(*(num+i)+j)=0;
		}
		//memset(*(Pdw+i),0,DICLENGTH);
		//memset(*(num+i),0,DICLENGTH);
	}
	j=0;
	while (m<nNumOfKind)
	{
		for(int i=0;i<DICLENGTH;i++)
		{
			if(*((*pkinddoct)+i)!=0)
			{
				Central[i]++;
			}
			sumwordfreq+=*((*pkinddoct)+i);
			pwordfreq=wordfreq+j;
			*pwordfreq+=*((*pkinddoct)+i);	
			sumword[i]+=*((*pkinddoct)+i);
			pnum=(*(num+j)+i);
			*pnum=*((*pkinddoct)+i);		
		}
		j++;m++;
		pkinddoct++;
	
	}
	for(i=0;i<DICLENGTH;i++)
	{
		if(Central[i]!=0)
			Central[i]=log(Ndoctotal/Central[i]+0.01);
	}
	for(i=0;i<Ndoctotal;i++)
	{
		if((double)(*(wordfreq+i))!=0)
		*(Pd+i)=((double)(*(wordfreq+i)))/((double)sumwordfreq);
		for(j=0;j<DICLENGTH;j++)
		{
			if((double)(*(*(num+i)+j))!=0)
			*(*(Pdw+i)+j)=((double)(*(*(num+i)+j)))/sumword[j];
		}
	}
	
	long double Hd=0,Hdw[DICLENGTH];
	for(i=0;i<DICLENGTH;i++)
	{
		Hdw[i]=0;
	}
	
	for(i=0;i<Ndoctotal;i++)
	{
		if((*Pd+i)!=0)
			Hd+=-((*(Pd+i))*log(*(Pd+i)));
	}
	for(i=0;i<DICLENGTH;i++)
	{
		for(j=0;j<Ndoctotal;j++)
		{
			if((*(*(Pdw+j)+i))!=0)
				Hdw[i]+=-(*(*(Pdw+j)+i)*log(*(*(Pdw+j)+i)));
		}
	}
	
	double IG[DICLENGTH];
	
	for(i=0;i<DICLENGTH;i++)
	{
		IG[i]=fabs(Hd-Hdw[i]);
	}
	
	double (*Weight)[DICLENGTH]=new double [nNumOfKind][DICLENGTH];
	
	for(i=0;i<nNumOfKind;i++)
	{
		for(j=0;j<DICLENGTH;j++)
		{
			*(*(Weight+i)+j)=0;
		}
	//	memset(*(Weight+i),0,DICLENGTH);
	}

	for(i=0;i<Ndoctotal;i++)
	{
		double Weightstd=0;
		for(j=0;j<DICLENGTH;j++)
		{
			*(*(Weight+i)+j)=*(*(num+i)+j)*Central[j]*IG[j];
			Weightstd+=(*(*(Weight+i)+j))*(*(*(Weight+i)+j));
		}
		for(j=0;j<DICLENGTH;j++)
		{
			if(Weightstd!=0)
			*(*(Weight+i)+j)=(*(*(Weight+i)+j))/sqrt(Weightstd);
		}
	}

	CListNode node;

	for(i=0;i<Ndoctotal;i++)
	{
		for(j=0;j<DICLENGTH;j++)
		{
			if(Weight[i][j]>WeightThreshold && num[i][j]>WordFreqThreshold && Central[j]>CentralThreshold)
			{
				node.WordIndex = j;
				node.Central=Central[j];
				node.Weight=*(*(Weight+i)+j);
				node.WordFreq=*(*(num+i)+j);

				(**pwordlist).AddTail(node);

			}
		}
		
		*pwordlist++;
		
	}
	delete wordfreq;
	delete[] Pdw;
	delete[] Weight;
	delete[] num;
	delete Pd;
}
int TextClassify::ClassifyKind(int *pTFArray,int nNumOfKind,WORDVECTOR **pWordVectorArray)//return the kind
{
	double *Sim=new double[nNumOfKind];
	for(int i=0;i<nNumOfKind;i++)
	{
		*(Sim+i)=0;
	}
	for(i=0;i<nNumOfKind;i++)
	{
		POSITION pos=pWordVectorArray[i]->GetHeadPosition();
		for(int j=0;j<pWordVectorArray[i]->GetCount();j++)
		{
			int k=pWordVectorArray[i]->GetAt(pos).WordIndex;
			Sim[i]+=pTFArray[k]*pWordVectorArray[i]->GetAt(pos).Weight;
			pWordVectorArray[i]->GetNext(pos);
		}
	}

	double k=Sim[0];
	int index=0;
//	Sim[2]=50;
	for(i=0;i<nNumOfKind;i++)
	{
		if(k<Sim[i])
		{
			k=Sim[i];
			index=i;
		}
	}

	delete[] Sim;

	return index;
}

//test

// Override the three feature-selection thresholds directly — used when
// experimenting with different training parameters.
void TextClassify::test(double weight,double central,int frequence)
{
	WordFreqThreshold = frequence;	// minimum raw word frequency
	WeightThreshold   = weight;	// minimum normalized weight
	CentralThreshold  = central;	// minimum centrality
}
// Compute classification accuracy from the result file and store it in
// *veracity.  Each line of the file is a classification trace; the portion
// after the last '>' is the predicted class, and the prediction counts as
// correct when the text before that '>' contains it (case-insensitive).
// NOTE(review): lines without a '>' make ReverseFind return -1, giving
// left = "" and right = the whole line — confirm such lines cannot occur.
void TextClassify::veracity(double *veracity,CString m_strResultPath)
{
	// accuracy = correct lines / total lines

	CString stream,left,right;
	char string[256];
	double numerator=0,denominator=0;
	// NOTE(review): ios::nocreate is a pre-standard iostream extension.
	ifstream fin(m_strResultPath, ios::nocreate);
	if(fin.is_open()==NULL)
	{
		cout<<"Error Opening "<<m_strResultPath<<" for read. "<<endl;
		return;
	}
	while(! fin.eof()) 
	{
		memset(string, 0, 256);	// clear the line buffer
		fin.getline(string, 256);
		stream=string;
		stream.MakeLower();
		int length=stream.GetLength();
		left=stream.Left(stream.ReverseFind('>'));	// text before the last '>'
		right=stream.Right(length-stream.ReverseFind('>')-1);	// predicted class
		if(left.Find(right)!=-1)
			numerator++;
		denominator++;
	}
	// The eof-style loop runs once more after the last line; subtract that
	// extra (empty) iteration from the line count.
	denominator--;
	*veracity=numerator/denominator;
}
// Truncate (or create) the result file so the next run starts from an
// empty file.  CFile::modeCreate opens the file with zero length.
void TextClassify::clear(CString m_strResultPath)
{
	CFile resultFile(m_strResultPath, CFile::modeCreate);
	resultFile.Close();
}

	

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -