⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 precnhierdata.cpp

📁 中科院计算所的分词软件
💻 CPP
📖 第 1 页 / 共 2 页
字号:

        bRet = FindNextFile(hFile,&fd);
		//若是横向遍历,则当前根目录不变
		CurDir=lpszDir;
	}//while

    FindClose(hFile);

    return TRUE;
}


/******************************************************************************
 FUNCTION fn_bScanDirectory()
 Recursively walk a directory tree and process every file found in it.

 lpszDir  Directory to scan. A backslash is always appended to it here, so
          it is expected to be given without a trailing one.
 outfile  Forwarded to ReadOneFile() for every file located directly in
          lpszDir. Recursive calls for sub-directories receive " " instead.

 Returns FALSE when lpszDir is null, empty or not accessible; TRUE otherwise
 (including when the directory cannot be enumerated).

 Side effects on file-level state declared outside this function:
   - for each sub-directory: its path plus a trailing backslash is appended
     to Class and ClassNum is incremented, then the sub-directory is scanned;
   - for each file: its full path is appended to vEachDirFiles, CurDocId is
     printed, the file is handed to ReadOneFile(), the containing directory
     is appended to PatClass, and PatNum and CurDocId are incremented.
******************************************************************************/
BOOL fn_bScanDirectory(char* lpszDir,char *outfile){
	// Validate first: building a std::string from a null pointer is
	// undefined behaviour, so no string may be constructed before this check.
	if(!lpszDir||(strlen(lpszDir)==0)){
		return FALSE;
	}

	if(_access(lpszDir,0)==-1)
		return FALSE;

	// Directory prefix shared by every entry, e.g. "root\".
	const string sNoSuffixDir = string(lpszDir) + "\\";
	const string sDirFiles = sNoSuffixDir + "*.*";

	WIN32_FIND_DATA fd;
	HANDLE hFile = FindFirstFile(sDirFiles.c_str(),&fd);
	if(hFile==INVALID_HANDLE_VALUE){
		// Nothing to enumerate. An invalid handle must not be handed to
		// FindClose(), so return right away.
		return TRUE;
	}

	// Scan the entries (files and sub-directories) directly under lpszDir.
	for(BOOL bRet = TRUE; bRet; bRet = FindNextFile(hFile,&fd)){
		// Skip the self and parent pseudo-entries.
		if((strcmp(fd.cFileName,".")==0)||(strcmp(fd.cFileName,"..")==0)){
			continue;
		}

		// dwFileAttributes is a bit mask: a directory may carry further
		// flags (read-only, archive, ...), so test the bit rather than
		// comparing the whole value.
		if((fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)!=0){
			// Sub-directory: treated as a category.
			const string sSubDir = sNoSuffixDir + fd.cFileName;

			// Record every directory level, stored with a trailing backslash.
			Class.push_back(sSubDir + "\\");
			ClassNum++;

			// NOTE(review): the recursion passes " " rather than outfile,
			// exactly as before — confirm whether outfile should propagate.
			fn_bScanDirectory(const_cast<char*>(sSubDir.c_str()),const_cast<char*>(" "));
		}
		else{
			// Regular file.
			const string sFullFilePath = sNoSuffixDir + fd.cFileName;
			vEachDirFiles.push_back(sFullFilePath);

			cout<<CurDocId<<"	"<<endl;

			// Process the file, then record the directory it belongs to.
			ReadOneFile(sFullFilePath,outfile);
			PatClass.push_back(sNoSuffixDir);
			PatNum++;

			CurDocId++;
		}
	}

	FindClose(hFile);

	return TRUE;
}


int main(int argc, char* argv[])
{   
	//rename("t1.txt","t1-rename.txt");
	fstream stopfile,infile,infile1,infile2,outfile,
		    clablefile,rlablefile,hlablefile,prefile1,prefile2,
		    topicfile1,topicfile2;
    vector<string> vec;
	string DirFile="手机-adj",MatFile,CLableFile,RLableFile,HLableFile,Root,PreFile1,PreFile2,TopicFile1,
		TopicFile2;
	MatFile=DirFile+".mat";
	CLableFile=DirFile+".mat.clabel";
	RLableFile=DirFile+".mat.rlabel";
	HLableFile=DirFile+".mat.hlabel";
	PreFile1=DirFile+".Train.txt";
	TopicFile1=DirFile+".Train.topic.txt";
	PreFile2=DirFile+".Test.txt";
	TopicFile2=DirFile+".Test.topic.txt";

	WordMat.clear();
	CVweight.clear();
	CVterm.clear();
    PatClass.clear();

	//初始化分词模块
	char sSentence[2000],sSentenceResult[5000];
	ICTCLAS_Init();	
	//ICTCLAS_FileProcess("E:\\Sample\\Corpus_NewPOS\\199802_Org.txt","E:\\Sample\\Corpus_NewPOS\\199802_Org_cla.txt");
	ICTCLAS_SetOperType(1);
	//ICTCLAS_FileProcess("分词测试文本.txt","分词测试输出.txt");
	
	//首先装入停用词
	stopfile.open("stoplist.txt",ios::in);
	stopfile>>str;
	while(!stopfile.eof()){
		if(stopfile.eof())   
			break;
		//如果计数为0,即不在其中,就进行插入
		strcpy(str,strlwr(str));
		if(!StopWord.count(str)){
			StopWord.insert(str);
		}
		stopfile>>str;
	}

	//存储所有层次目录
	Root=DirFile;
	Root += "\\";
	Class.push_back(Root);
	ClassNum++;

	//以Begin与End作为一篇文档开始与结束的标志
	temfile.open("MatFile.txt",ios::out);

	//按目录读取
	fn_bScanDirectory((char *)DirFile.c_str()," ");
	//DirectoryResearve("电脑-测试",1); //训练集选1,测试集选7

	//exit(0);

	temfile.close();

	//写入类别文件<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 
	rlablefile.open(RLableFile.c_str(),ios::out);
	for(i=0;i<PatNum;i++){
		rlablefile<<PatClass[i]<<endl;
	}

    //return;

	//对词排序
	it=Word.begin();
	for(;it!=Word.end();++it){
		vec.push_back(*it);
	}
	less<string> ls;
	sort(vec.begin(),vec.end(),ls);

	//写入单词文件<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
	clablefile.open(CLableFile.c_str(),ios::out);
	for(i=0;i<WordNum;i++){
		clablefile<<vec[i].c_str()<<endl;
	}
  
	//转变成词频矩阵
	long int *WordFreq=(long int *)malloc((WordNum)*sizeof(long int));
	long int AllItem=0;
	//词频向量清空
	for(j=0;j<WordNum;j++)
		WordFreq[j]=0;

	//vector<string> onefile;
    fstream temfile1;
	temfile1.open("MatFile.txt",ios::in);
    temfile1>>word;
	for(i=0;i<PatNum;i++){
		CVterm.push_back(intvec);
		CVweight.push_back(intvec);
		temfile1>>word;
		while(strcmp(word,"[End]")!=0){
			temfile1>>word;
			//查找当前词的编号
			long int no=fn_iBinarySearch(vec,word);
			if(no>=0){
				WordFreq[no]++;
			}
		}
		for(j=0;j<WordNum;j++){
			if(WordFreq[j]>0){
				CVterm[i].push_back(j);
				CVweight[i].push_back(WordFreq[j]);
				WordFreq[j]=0;
				AllItem++;
			}
		}
	}
	
	//写入词频文件
	outfile.open(MatFile.c_str(),ios::out);
	outfile<<PatNum<<" "<<WordNum<<" "<<AllItem<<endl;
	//outfile.open(MatFile.c_str(),ios::out);
    for(i=0;i<PatNum;i++){
		//cout<<CVterm[i].size()<<endl;
		//若本文档非空时,才进行输出
		if(CVterm[i].size()>0){
		for(j=0;j<CVterm[i].size();j++){
			int term=CVterm[i][j];
			float weight=CVweight[i][j];
			//han格式中词从1开始编号
			outfile<<CVterm[i][j]+1<<" "<<CVweight[i][j]<<" ";
		}
		outfile<<endl;
		}
	}

		

	//写入层次类别文件<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 
	hlablefile.open(HLableFile.c_str(),ios::out);
	for(i=0;i<Class.size();i++){
		hlablefile<<Class[i]<<endl;
	}
	
	ICTCLAS_Exit();

	//exit(0);

	
	//写入处理后文件
	prefile1.open(PreFile1.c_str(),ios::out);
	prefile2.open(PreFile2.c_str(),ios::out);

    for(i=0;i<PatNum;i++){
		if(i % 2 ==0){
			//若本文档非空时,才进行输出
		    if(RAWTEXT[i].size()>0){
				if(i/2>0)
					prefile2<<endl<<endl;
		        prefile2<<RAWTEXT[i];
				prefile2<<endl<<"[End]";
			}  
		}
		else{
			//若本文档非空时,才进行输出
		    if(RAWTEXT[i].size()>0){
				if(i/2>0)
					prefile1<<endl<<endl;
		        prefile1<<RAWTEXT[i];
				prefile1<<endl<<"[End]";
			}  
		}
	}
	prefile1.close();
	prefile2.close();

	//写入主题文件<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 
	topicfile1.open(TopicFile1.c_str(),ios::out);
	topicfile2.open(TopicFile2.c_str(),ios::out);
	for(i=0;i<PatNum;i++){
		if(i % 2 ==0)
			if(i/2==0)
				topicfile2<<PatClass[i];
			else
				topicfile2<<endl<<PatClass[i];
		else
			if(i/2==0)
				topicfile1<<PatClass[i];
			else
				topicfile1<<endl<<PatClass[i];
	}
	topicfile1.close();
	topicfile2.close();//*/

	//return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -