⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 precnhierdata.cpp

📁 中科院计算所的分词软件
💻 CPP
📖 第 1 页 / 共 2 页
字号:
// ICTCLAS_DOS.cpp : Defines the entry point for the console application.
//

#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <string.h>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <io.h>
#include <fstream>
#include <iostream.h>
#include <algorithm>
#include <functional>

//#include "Tagger/POSTagger.h"
//#include "Tagger/POSTagging.h"
//#include "Stemmer/stem.h"
#include "ICTCLAS.h"

#pragma warning(disable:4786)

using namespace std;

// Short aliases for the scalar, vector and matrix (vector-of-vector) types
// used by the file-scope state below.
typedef double REAL;
typedef int    INT;
typedef string STR;
typedef vector<INT> INTVT;
typedef vector<REAL> REALVT;
typedef vector<STR>  STRVT;
typedef vector<INTVT> INTMATR;
typedef vector<REALVT> REALMATR;
typedef vector<STRVT> STRMATR;

// ---- File-scope state shared by the functions in this file ----
STRMATR     WordMat;            // only referenced from commented-out code in ReadOneFile
STRVT       RAWTEXT;            // raw text of each file read by ReadOneFile (one entry per call)
STRVT       OneFile;            // holds the word sequence of one file
INTMATR  	CVweight;			// weights corresponding to multiple centroid vectors
INTMATR		CVterm;				// terms corresponding to multiple centroid vectors
REALVT		realvec;
INTVT		intvec;
STRVT		PatClass;           // class label of each sample
set<string> StopWord;           // words that ReadOneFile skips
vector<string> Class;           // directory paths appended by DirectoryResearve (each ends with a backslash)
set<string> Word;               // vocabulary: distinct words collected by ReadOneFile
set<string>::iterator it;
// Delimiter characters: whitespace, punctuation, digits and brackets.
// NOTE(review): not referenced in the visible part of this file.
char *const sWordSeparators=" \t\n.,:;'><()\"?/~!@#$%^&*+-_=|0123456789[]{}";

long int CurDocId=0;            // incremented by DirectoryResearve for every file it visits
long int CurTopicId=0;
vector<vector<string> > vDirlist;
// Shared scratch counters/indices at file scope.
// NOTE(review): any function may overwrite them, so they are not safe to rely
// on across a call to another function in this file.
long int i,j,k,pat,nonemptyitem,item,val;
long int RowLen;
char     str[100000],*str1;
char temstr[10000],word[10000]; // fixed-size scratch buffers
//char     buf[1000000];
// WordNum: incremented when a new word enters Word; ClassNum: incremented per
// directory found by DirectoryResearve; PatNum: number of files kept by
// DirectoryResearve. Only PatNum has an explicit initializer; the others are
// zero-initialized as objects with static storage duration.
long int WordNum,ClassNum,PatNum=0; 
vector<string> vDirs;
vector<string> vEachDirFiles;
// Output stream that ReadOneFile writes each document's markers and word list
// to. It is not opened in the visible part of this file.
fstream temfile;

// Forward declarations. fn_bScanDirectory is not defined in the visible part
// of this file.
void ReadOneFile(string infilename,char *outfilename);
BOOL fn_bScanDirectory(char* lpszDir);

/******************************************************************************
 FUNCTION fn_iBinarySearch(const vector<string>& vStr,string sKey)
 Binary search.
 Returns the index of an element of vStr that compares equal to sKey, or -1
 when no probed element matches (including when vStr is empty).
 The halving logic assumes vStr is sorted ascending in string::compare order;
 the function does not check this.
******************************************************************************/
int fn_iBinarySearch(const vector<string>& vStr,string sKey)
{
    const int kNotFound = -1;

    // Closed search window [first, last]. For an empty vector last is -1, so
    // the loop body never runs and kNotFound is returned.
    int first = 0;
    int last = (int)vStr.size() - 1;

    while(first <= last){
        const int probe = (first + last) / 2;
        const int order = vStr[probe].compare(sKey);

        if(order == 0)
            return probe;       // exact match at the probe position

        if(order < 0)
            first = probe + 1;  // probe sorts before the key: search the upper half
        else
            last = probe - 1;   // probe sorts after the key: search the lower half
    }

    return kNotFound;
}//fn_iBinarySearch

/******************************************************************************
 FUNCTION ReadFile()
 Reads an entire file into a freshly allocated, NUL-terminated char buffer.

 FileStr       out: receives the buffer (allocated with calloc). The caller
               owns it and must release it with free(). It is always assigned
               on return; an empty file yields an empty string "".
 lpszFilename  path of the file to read; opened in binary mode.

 Returns 1 on success. On a NULL/empty filename, an open failure or an
 allocation failure, a message is printed to cout and the process is
 terminated with exit(-1).
******************************************************************************/
int ReadFile(char **FileStr,const char* lpszFilename)
{
    if(!lpszFilename||(strlen(lpszFilename)==0)){
        // Streaming a NULL char* is undefined behaviour, so substitute "".
        cout<<"文件"<<(lpszFilename?lpszFilename:"")<<"不存在!"<<endl;
        exit(-1);
    }

    FILE* fp = fopen(lpszFilename,"rb");
    if(!fp){
        cout<<"文件"<<lpszFilename<<"打开错误!"<<endl;
        exit(-1);
    }

    // Determine the file size by seeking to the end and back.
    fseek(fp,0L,SEEK_END);
    long lSize = ftell(fp);
    fseek(fp,0L,SEEK_SET);
    if(lSize<0)
        lSize = 0;  // ftell failed: treat the file as empty

    // One extra zeroed byte guarantees NUL termination. Allocating even for
    // an empty file means *FileStr is never left unassigned.
    char* lpszFileText = (char*)calloc((size_t)lSize+1,sizeof(char));
    if(lpszFileText==NULL){
        fclose(fp);
        cout<<"映射文件失败!"<<endl;
        exit(-1);
    }

    if(lSize>0){
        // A short read is tolerated: terminate right after the bytes that
        // were actually read.
        size_t nRead = fread(lpszFileText,sizeof(char),(size_t)lSize,fp);
        lpszFileText[nRead] = '\0';
    }

    // No try/catch is needed: the C stdio calls above do not throw, and the
    // file is closed on every path that returns.
    fclose(fp);

    *FileStr = lpszFileText;
    return 1;
}//ReadFile

/******************************************************************************
 FUNCTION ReadOneFile(string infilename,char *outfilename)                                                                                                                               
 先读取一行,再读出该行中的词,
 再对该行中的每一个词进行词性标注,还原,
 再读取该文件                                                                      
******************************************************************************/
void ReadOneFile(string infilename,char *outfilename)
{
	vector<string> OneWord;

	//文本开始标志
	temfile<<"Begin"<<endl;
	OneFile.clear();

	char *source;
	ReadFile(&source,infilename.c_str());
	RAWTEXT.push_back(source);

	ICTCLAS_FileProcess((char *)infilename.c_str(),"分词测试输出.txt");

	fstream infile,outfile;
	infile.open("分词测试输出.txt",ios::in);
	if(!infile.is_open ()){
		cout<<"分词测试输出.txt"<<" open error"<<endl;
		exit(1);
	}

	string outfilename1="N_"+infilename+".out";
	outfile.open(outfilename1.c_str(),ios::out);

	//将该文本中的词放入到一个set中
	infile>>word;
	while(!infile.eof())
	{	

		//若全是英文字母,不是停用词,且长度小于26,大于1
		//就加入到词汇表中
		if((unsigned char)word[0]>=0xb0 &&
			(unsigned char)word[0]<=0xf7){
		    if( !StopWord.count(word) ){
				//if( strstr(word,"/a")>0  ){
					//如果计数为0,即不在其中,就进行插入
                //    int pos=strstr(word,"/a")-word;
				//	word[pos]='\0'; //去掉/a标识
			        if(!Word.count(word)){
		                Word.insert(word);
				        WordNum++;
					}  
					OneFile.push_back(word);
					//cout<<OneFile[0].c_str()<<endl;
				//}
			} 
		} 
		infile>>word;
	
	}
	infile.close();
	//每一行纪录每篇文档的全部词序列
	//WordMat.push_back(OneFile);
	//输出处理后文档,以.out表示
	for(j=0;j<OneFile.size();j++){
		//outfile<<OneFile[j]<<" ";
		temfile<<OneFile[j]<<" ";
		//if((j%15)==14)
			//outfile<<endl;
		//	temfile<<endl;
	}
	//outfile<<endl;
	temfile<<endl;

	//temfile<<infilename.c_str()<<endl;

	//文本结束标志
	temfile<<"[End]"<<endl<<endl;

}

/******************************************************************************
 FUNCTION DirectoryResearve()                                                                                                                               
 保留训练或测试集                                                                      
******************************************************************************/
BOOL DirectoryResearve(char* lpszDir,int N){
    char *infile;
	string CurDir;

	CurDir=lpszDir;

	if(!lpszDir||(strlen(lpszDir)==0)){
		return FALSE;
	}  

	if(_access(lpszDir,0)==-1)
		return FALSE;

    WIN32_FIND_DATA fd;
    HANDLE hFile;

	string sDirFiles = lpszDir;
	//if(lpszDir[strlen(lpszDir)-2]!='\\')
		sDirFiles += "\\";

    string sNoSuffixDir = sDirFiles;
	sDirFiles += "*.*";

    hFile = FindFirstFile(sDirFiles.c_str(),&fd);
    BOOL bRet = TRUE;

    file://Scan sub-directory under the directory specified by lpszDir
    while(bRet&&(hFile!=INVALID_HANDLE_VALUE)){
		if((strcmp(fd.cFileName,".")==0)||(strcmp(fd.cFileName,"..")==0)){
			bRet = FindNextFile(hFile,&fd);
            continue;
		}//if  

	    //如果是类别
		if(fd.dwFileAttributes == FILE_ATTRIBUTE_DIRECTORY){
	        string sDir = fd.cFileName;
			CurDir+="\\";
			CurDir+=sDir;
			//存储所有层次目录
			string CurDir1=CurDir;
			CurDir1+="\\";
			Class.push_back(CurDir1);
			ClassNum++;
			DirectoryResearve((char* )CurDir.c_str(),N);	
			
		}
		//如果是文件
		else{
			string sFullFilePath = sNoSuffixDir;
			sFullFilePath += fd.cFileName; 

			CurDocId++;

			//处理该文件
			if(PatNum<2000 && ((CurDocId%2)==N) ){
				PatNum++;
			}
			else{
				remove(sFullFilePath.c_str());
			}			
			cout<<CurDocId<<endl;			
		}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -