📄 textclassify.cpp
字号:
// TextClassify.cpp: implementation of the TextClassify class.
//CCM part--created by leiyun
//fenci part--created by xuran
//////////////////////////////////////////////////////////////////////
#include "TextClassify.h"
#include "stdio.h"
#include "string.h"
#include "math.h"
#define DICLENGTH 30083 //length of the word dictionary
#include "fenci_main.h"
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
char * myfgets(char *string, int n, FILE *stream)
{
	// Read one line (at most n-1 chars) from stream and reduce it, in
	// place, to its first whitespace-delimited token.
	// Returns string on success, NULL on EOF/read error (string is then
	// set to "" instead of silently keeping stale contents, as the
	// original did).
	if(fgets(string,n,stream)==NULL)
	{
		string[0]='\0';
		return NULL;
	}
	// Trim in place instead of sscanf'ing into a fixed 20-byte temporary,
	// which overflowed whenever the token was 20+ characters long, and
	// which left the temporary uninitialized on blank lines.
	char *start=string;
	while(*start==' '||*start=='\t'||*start=='\r'||*start=='\n')
		start++;
	char *end=start;
	while(*end!='\0'&&*end!=' '&&*end!='\t'&&*end!='\r'&&*end!='\n')
		end++;
	*end='\0';
	if(start!=string)
		memmove(string,start,strlen(start)+1);
	return string;
}
TextClassify::TextClassify(char *file,char *strStopWordFile)
{
	// Build the word-list dictionary and its name index from `file`.
	wlisti.WListNameConstruct(file);
	wlisti.MakeLMNameIndex();
	// Load the stop-word list: one word (at most 9 chars) per line.
	// Initialize the members first so the destructor and tfStat stay
	// safe if the file cannot be opened (the original left them
	// uninitialized on the early-return path -> UB in ~TextClassify).
	listStopWord=NULL;
	nNumOfStopWord=0;
	FILE *fp;
	if((fp=fopen(strStopWordFile,"r"))==NULL)
	{
		printf("the stop using word file can not be opened");
		return;
	}
	// First pass: count the lines. Testing fgets' return value directly
	// (instead of the old while(!feof) pattern) avoids counting one
	// extra, bogus entry after the last line.
	char temp[10];
	int i=0;
	while(fgets(temp,10,fp)!=NULL)
		i++;
	nNumOfStopWord=i;
	// Second pass: rewind and store each trimmed word.
	rewind(fp);
	listStopWord=new char[nNumOfStopWord][10];
	for(i=0;i<nNumOfStopWord;i++)
		myfgets(listStopWord[i],10,fp);
	fclose(fp);
}
TextClassify::~TextClassify()
{
	// Release the stop-word table allocated in the constructor.
	// NOTE(review): if the constructor takes its early-return path
	// without initializing listStopWord, this delete[] operates on an
	// uninitialized pointer — confirm the constructor always assigns it
	// (NULL is safe to delete[]; garbage is not).
	delete[] listStopWord;
}
bool TextClassify::InitialTree()
{
	// Build the class tree from resource//InitialTree.txt.
	// File format: the first line is the root name; "<sub>" descends one
	// level (the following line becomes the first child), "</sub>" pops
	// back to the parent, and any other line becomes the next sibling.
	FILE * fp;
	if((fp=fopen("resource//InitialTree.txt","r+b"))==NULL)
	{
		printf("The Tree.txt File can not be opened.\n");
		return 0;
	}
	char strLine[20];
	char strTemp[20];
	// Root node: fail cleanly on an empty file.
	if(fgets(strLine,20,fp)==NULL||sscanf(strLine,"%19s",strTemp)!=1)
	{
		printf("The Tree.txt File can not be opened.\n");
		fclose(fp);
		return 0;
	}
	tree.BuildRoot(strTemp);
	// Insert the remaining nodes. Checking fgets/sscanf results (rather
	// than feof alone) fixes the old bug where the final line was
	// processed twice after EOF.
	while(fgets(strLine,20,fp)!=NULL)
	{
		if(sscanf(strLine,"%19s",strTemp)!=1)
			continue;	// skip blank lines
		if(!strcmp(strTemp,"<sub>"))
		{
			// The line after "<sub>" names the first child.
			if(fgets(strLine,20,fp)==NULL||sscanf(strLine,"%19s",strTemp)!=1)
				break;
			tree.InsertChild(NULL,strTemp);
			tree.FirstChild();
		}
		else if(strcmp(strTemp,"</sub>")==0)
			tree.Parent();
		else
		{
			tree.InsertSibling(NULL,strTemp);
			tree.NextSibling();
		}
	}
	fclose(fp);
	return 1;
}
bool TextClassify::UpdateTree(char *strUpdatePath)
{
	// Rebuild the class tree from a previously saved tree file.
	// Returns false when the file cannot be opened.
	FILE *pFile=fopen(strUpdatePath,"r+b");
	if(pFile==NULL)
	{
		printf("Update:The SaveTree.txt File can not be opened.\n");
		return 0;
	}
	tree.UpdateTree(pFile);	// the tree parses its own on-disk format
	fclose(pFile);
	return 1;
}
bool TextClassify::SaveTree(char * SaveTreePath)
{
	// Save the tree nodes in pre-order to SaveTreePath.
	// Returns false on failure instead of the original exit(0), which
	// terminated the whole process on a simple file-open error and was
	// inconsistent with InitialTree/UpdateTree.
	FILE * fp;
	if((fp=fopen(SaveTreePath,"w+b"))==NULL)
	{
		printf("Save:The SaveTree.txt File can not be opened.\n");
		return 0;
	}
	tree.SaveTree(fp);	// the tree serializes its own format
	fclose(fp);
	return 1;
}
bool TextClassify::TrainTree(CString strTrainDirectory,int WordFreq,float Weight,float Central)
{
	// Store the training parameters, then train every tree node.
	strTrainPath=strTrainDirectory;	// root folder of the training corpus
	WordFreqThreshold=WordFreq;
	WeightThreshold=Weight;
	CentralThreshold=Central;
	// Training must be bottom-up: children's frequency tables feed the
	// parent, so start at the root and traverse post-order.
	tree.Root();
	PostOrderTrain();
	return true;
}
bool TextClassify::Classify(char * strFileName,CString& strClassifyResult)
{
	// Classify one document: walk the tree top-down and, at each level,
	// descend into the child whose word vector is most similar to the
	// document. The path of chosen class names ("root--->a--->b") is
	// written to strClassifyResult. Returns false for a NULL file name.
	if(strFileName==NULL)
		return false;
	tree.Root();
	// Term-frequency vector of the document over the whole dictionary.
	int pTFArray[DICLENGTH];
	tfStat(strFileName,pTFArray);
	strClassifyResult=tree.root->KindName;
	int nNumOfKind;
	int i;	// declared once: the original redeclared/reused `i` across
		// for-loops, relying on the pre-standard MSVC scope extension,
		// which does not compile as standard C++
	while(tree.current->pFirstChild!=NULL)
	{
		int tempResult;
		nNumOfKind=tree.GetChildNum();
		// Collect the word vectors of every child of the current node.
		WORDVECTOR **pWordVectorArray=new WORDVECTOR *[nNumOfKind];
		tree.FirstChild();
		for(i=0;i<nNumOfKind;i++)
		{
			pWordVectorArray[i]=&tree.current->WordVector;
			tree.NextSibling();
		}
		tree.Parent();
		tempResult=ClassifyKind(pTFArray,nNumOfKind,pWordVectorArray);
		delete[] pWordVectorArray;
		// Move the cursor to the winning child and go one level deeper.
		tree.FirstChild();
		for(i=0;i<tempResult;i++)
			tree.NextSibling();
		strClassifyResult=strClassifyResult+"--->"+tree.current->KindName;
	}
	return true;
}
//private function in train part
//private function in train part
void TextClassify::PostOrderTrain()
{
	// Post-order traversal driven by the tree's internal cursor
	// (tree.current): train all children first, then the node itself,
	// because VisitNode() on an inner node consumes the children's
	// already-computed frequency tables.
	if(tree.current!=NULL)
	{
		// Remember this node — the recursive calls move tree.current.
		ClassTreeNode *p=tree.current;
		bool IsSuccess=tree.FirstChild();
		while(IsSuccess)
		{
			PostOrderTrain();
			IsSuccess=tree.NextSibling();
		}
		// Restore the cursor so VisitNode() trains this node.
		tree.current=p;
		VisitNode();
	}
}
void TextClassify::VisitNode()
{
	// Train the node under the tree cursor (tree.current).
	// Leaf node: sum the term frequencies of every *.txt training
	// document in the folder mirroring the node's path in the tree.
	// Inner node: run CCM over the children's frequency tables to fill
	// their feature word vectors, then aggregate the tables upward.
	printf("%s\n",tree.current->KindName);
	int i;	// declared once: the original redeclared `i` in the first
		// for-loop and reused it afterwards, relying on the
		// pre-standard MSVC for-scope extension
	if(tree.current->pFirstChild==NULL)
	{
		// ---- leaf: accumulate word frequencies over its documents ----
		int *pTFArray=new int[DICLENGTH];
		for(i=0;i<DICLENGTH;i++)
			pTFArray[i]=0;
		int tempArray[DICLENGTH];
		// Rebuild the directory path from the root down to this node,
		// e.g. <strTrainPath>\<ancestor>\...\<KindName>.
		CString strDirectory=strTrainPath;
		strDirectory+="\\";
		CString strPath=tree.current->KindName;
		CString strTemp;
		ClassTreeNode *current=tree.current;
		while(tree.Parent())
		{
			strTemp=tree.current->KindName;
			strPath=strTemp+"\\"+strPath;
		}
		strDirectory+=strPath;
		tree.current=current;	// restore the cursor moved by tree.Parent()
		// Visit every *.txt file under the directory, sum the counts.
		CString strFileName;
		strDirectory+="\\*.txt";
		CFileFind finder;
		BOOL bWorking = finder.FindFile(strDirectory.GetBuffer(0));
		while (bWorking)
		{
			// MFC contract: FindNextFile() must run before GetFilePath().
			bWorking = finder.FindNextFile();
			strFileName=finder.GetFilePath();
			tfStat(strFileName.GetBuffer(0),tempArray);
			printf("%s\n",strFileName);
			for(i=0;i<DICLENGTH;i++)
				pTFArray[i]=pTFArray[i]+tempArray[i];
		}
		// Hand the frequency table to the tree node; the parent's visit
		// frees it after aggregation.
		tree.current->pTFArray=pTFArray;
	}
	else
	{
		// ---- inner node: children are already trained (post-order) ----
		int nNumOfKind=tree.GetChildNum();
		// Gather the children's frequency tables and word-vector slots.
		int **pTF2Array=new int *[nNumOfKind];
		WORDVECTOR **pWordVectorArray=new WORDVECTOR *[nNumOfKind];
		tree.FirstChild();
		for(i=0;i<nNumOfKind;i++)
		{
			pTF2Array[i]=tree.current->pTFArray;
			pWordVectorArray[i]=&tree.current->WordVector;
			tree.NextSibling();
		}
		tree.Parent();
		// Select each child's feature words (fills the word vectors).
		CCM(pTF2Array,nNumOfKind,pWordVectorArray);
		// Record each child's final vector length.
		tree.FirstChild();
		for(i=0;i<nNumOfKind;i++)
		{
			tree.current->nDimOfVector=tree.current->WordVector.GetCount();
			tree.NextSibling();
		}
		tree.Parent();
		// A non-root inner node needs its own frequency table: the
		// element-wise sum of its children's tables.
		if(tree.current!=tree.root)
		{
			tree.current->pTFArray=new int[DICLENGTH];
			for(i=0;i<DICLENGTH;i++)
				tree.current->pTFArray[i]=0;
			for(i=0;i<nNumOfKind;i++)
				for(int j=0;j<DICLENGTH;j++)
					tree.current->pTFArray[j]+=pTF2Array[i][j];
		}
		// The children's tables are no longer needed once aggregated.
		// NOTE(review): the children's pTFArray members now dangle —
		// confirm nothing reads them after training completes.
		for(i=0;i<nNumOfKind;i++)
			delete[] pTF2Array[i];
		delete[] pTF2Array;
		delete[] pWordVectorArray;
	}
}
void TextClassify::tfStat(char *fileName,int *pTFArray)
{//word segmentation + term-frequency statistics (created by xuran)
	// Delegates to the fenci module: pTFArray must hold DICLENGTH ints
	// and receives the per-dictionary-word counts for the file, with the
	// stop-word list applied.
	tfcout(fileName,pTFArray,wlisti,listStopWord,nNumOfStopWord);
}
void TextClassify::CCM(int ** pkinddoct,int nNumOfKind, WORDVECTOR **pwordlist)
{//createb by leiyun
double Central[DICLENGTH];
int Ndoctotal=nNumOfKind; //Ndoctotal是文章类的数量(最大值)
int sumwordfreq=0;
// long int wordfreq[nNumOfKind];
int *wordfreq=new int[nNumOfKind];
int *pwordfreq;
int j=0;
double *Pd=new double[nNumOfKind];
double *ppd;
double (*Pdw)[DICLENGTH]=new double[nNumOfKind][DICLENGTH];
// double **ppdw;
int sumword[DICLENGTH]; //一个词出现的总数
int (*num)[DICLENGTH]=new int [nNumOfKind][DICLENGTH]; //一个词在一个文档里出现的数目
int *pnum;
int m=0,i=0;
// memset(Central,0,DICLENGTH);
for(i=0;i<nNumOfKind;i++)
{
pwordfreq=wordfreq+i;
*pwordfreq=0;
ppd=Pd+i;
*ppd=0;
}
// memset(wordfreq,0,nNumOfKind);
for(i=0;i<DICLENGTH;i++)
{
*(sumword+i)=0;
*(Central+i)=0;
}
// memset(sumword,0,DICLENGTH);
// memset(Pd,0,nNumOfKind);
for(i=0;i<nNumOfKind;i++)
{
for(j=0;j<DICLENGTH;j++)
{
*(*(Pdw+i)+j)=0;
*(*(num+i)+j)=0;
}
//memset(*(Pdw+i),0,DICLENGTH);
//memset(*(num+i),0,DICLENGTH);
}
j=0;
while (m<nNumOfKind)
{
for(int i=0;i<DICLENGTH;i++)
{
if(*((*pkinddoct)+i)!=0)
{
Central[i]++;
}
sumwordfreq+=*((*pkinddoct)+i);
pwordfreq=wordfreq+j;
*pwordfreq+=*((*pkinddoct)+i);
sumword[i]+=*((*pkinddoct)+i);
pnum=(*(num+j)+i);
*pnum=*((*pkinddoct)+i);
}
j++;m++;
pkinddoct++;
}
for(i=0;i<DICLENGTH;i++)
{
if(Central[i]!=0)
Central[i]=log(Ndoctotal/Central[i]+0.01);
}
for(i=0;i<Ndoctotal;i++)
{
if((double)(*(wordfreq+i))!=0)
*(Pd+i)=((double)(*(wordfreq+i)))/((double)sumwordfreq);
for(j=0;j<DICLENGTH;j++)
{
if((double)(*(*(num+i)+j))!=0)
*(*(Pdw+i)+j)=((double)(*(*(num+i)+j)))/sumword[j];
}
}
long double Hd=0,Hdw[DICLENGTH];
for(i=0;i<DICLENGTH;i++)
{
Hdw[i]=0;
}
for(i=0;i<Ndoctotal;i++)
{
if((*Pd+i)!=0)
Hd+=-((*(Pd+i))*log(*(Pd+i)));
}
for(i=0;i<DICLENGTH;i++)
{
for(j=0;j<Ndoctotal;j++)
{
if((*(*(Pdw+j)+i))!=0)
Hdw[i]+=-(*(*(Pdw+j)+i)*log(*(*(Pdw+j)+i)));
}
}
double IG[DICLENGTH];
for(i=0;i<DICLENGTH;i++)
{
IG[i]=fabs(Hd-Hdw[i]);
}
double (*Weight)[DICLENGTH]=new double [nNumOfKind][DICLENGTH];
for(i=0;i<nNumOfKind;i++)
{
for(j=0;j<DICLENGTH;j++)
{
*(*(Weight+i)+j)=0;
}
// memset(*(Weight+i),0,DICLENGTH);
}
for(i=0;i<Ndoctotal;i++)
{
double Weightstd=0;
for(j=0;j<DICLENGTH;j++)
{
*(*(Weight+i)+j)=*(*(num+i)+j)*Central[j]*IG[j];
Weightstd+=(*(*(Weight+i)+j))*(*(*(Weight+i)+j));
}
for(j=0;j<DICLENGTH;j++)
{
if(Weightstd!=0)
*(*(Weight+i)+j)=(*(*(Weight+i)+j))/sqrt(Weightstd);
}
}
CListNode node;
for(i=0;i<Ndoctotal;i++)
{
for(j=0;j<DICLENGTH;j++)
{
if(Weight[i][j]>WeightThreshold && num[i][j]>WordFreqThreshold && Central[j]>CentralThreshold)
{
node.WordIndex = j;
node.Central=Central[j];
node.Weight=*(*(Weight+i)+j);
node.WordFreq=*(*(num+i)+j);
(**pwordlist).AddTail(node);
}
}
*pwordlist++;
}
delete wordfreq;
delete[] Pdw;
delete[] Weight;
delete[] num;
delete Pd;
}
int TextClassify::ClassifyKind(int *pTFArray,int nNumOfKind,WORDVECTOR **pWordVectorArray)//return the kind
{
double *Sim=new double[nNumOfKind];
for(int i=0;i<nNumOfKind;i++)
{
*(Sim+i)=0;
}
for(i=0;i<nNumOfKind;i++)
{
POSITION pos=pWordVectorArray[i]->GetHeadPosition();
for(int j=0;j<pWordVectorArray[i]->GetCount();j++)
{
int k=pWordVectorArray[i]->GetAt(pos).WordIndex;
Sim[i]+=pTFArray[k]*pWordVectorArray[i]->GetAt(pos).Weight;
pWordVectorArray[i]->GetNext(pos);
}
}
double k=Sim[0];
int index=0;
// Sim[2]=50;
for(i=0;i<nNumOfKind;i++)
{
if(k<Sim[i])
{
k=Sim[i];
index=i;
}
}
delete[] Sim;
return index;
}
//test
//test
void TextClassify::test(double weight,double central,int frequence)
{
	// Test hook: override the three selection thresholds directly,
	// without running TrainTree.
	WeightThreshold=weight;
	CentralThreshold=central;
	WordFreqThreshold=frequence;
}
void TextClassify::veracity(double *veracity,CString m_strResultPath)
{
	// Compute classification accuracy from the result file: a line
	// counts as correct when the (lower-cased) text before its last '>'
	// contains the text after it.
	CString stream,left,right;
	char string[256];
	double numerator=0,denominator=0;
	ifstream fin(m_strResultPath, ios::nocreate);
	if(fin.is_open()==NULL)
	{
		cout<<"Error Opening "<<m_strResultPath<<" for read. "<<endl;
		return;
	}
	while(! fin.eof())
	{
		memset(string, 0, 256);	// clear the line buffer
		fin.getline(string, 256);
		stream=string;
		stream.MakeLower();
		int length=stream.GetLength();
		// BUGFIX: skip blank lines, including the final empty read at
		// EOF. The original counted that trailing read as a *correct*
		// match (Find("") is 0), inflating the numerator by one even
		// though the denominator was decremented afterwards.
		if(length==0)
			continue;
		left=stream.Left(stream.ReverseFind('>'));
		right=stream.Right(length-stream.ReverseFind('>')-1);
		if(left.Find(right)!=-1)
			numerator++;
		denominator++;
	}
	// BUGFIX: an empty result file previously divided by zero.
	if(denominator==0)
	{
		*veracity=0;
		return;
	}
	*veracity=numerator/denominator;
}
void TextClassify::clear(CString m_strResultPath)
{
	// Truncate the result file: CFile::modeCreate creates/opens the file
	// with length 0, so constructing and immediately closing it leaves
	// an empty file behind.
	CFile myFile(m_strResultPath,CFile::modeCreate);
	myFile.Close();
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -