📄 precnhierdata.cpp
字号:
// ICTCLAS_DOS.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <windows.h>
#include <string.h>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <io.h>
#include <fstream>
#include <iostream.h>
#include <algorithm>
#include <functional>
//#include "Tagger/POSTagger.h"
//#include "Tagger/POSTagging.h"
//#include "Stemmer/stem.h"
#include "ICTCLAS.h"
// MSVC C4786: identifier truncated in debug information (common with nested STL templates).
#pragma warning(disable:4786)
using namespace std;
// Short aliases for the scalar and container types used throughout this file.
typedef double REAL;
typedef int INT;
typedef string STR;
typedef vector<INT> INTVT;
typedef vector<REAL> REALVT;
typedef vector<STR> STRVT;
typedef vector<INTVT> INTMATR;
typedef vector<REALVT> REALMATR;
typedef vector<STRVT> STRMATR;
STRMATR WordMat;
STRVT RAWTEXT; //raw text of the processed files; ReadOneFile appends one entry per call
STRVT OneFile; //holds the word sequence of a single file
INTMATR CVweight; //weights corresponding to the multiple center vectors
INTMATR CVterm; //terms corresponding to the multiple center vectors
REALVT realvec;
INTVT intvec;
STRVT PatClass; //class of each sample
set<string> StopWord; //tokens found in this set are skipped by ReadOneFile
vector<string> Class; //DirectoryResearve appends each sub-directory path (with trailing backslash)
set<string> Word; //distinct words accepted so far by ReadOneFile
set<string>::iterator it;
// NOTE(review): binds a string literal to a non-const char*, which standard C++11 and later rejects.
char *const sWordSeparators=" \t\n.,:;'><()\"?/~!@#$%^&*+-_=|0123456789[]{}";
long int CurDocId=0; //incremented by DirectoryResearve for every file it visits
long int CurTopicId=0;
vector<vector<string> > vDirlist;
// Shared scratch counters; ReadOneFile uses j as its output loop index.
long int i,j,k,pat,nonemptyitem,item,val;
long int RowLen;
char str[100000],*str1;
// word is the buffer ReadOneFile extracts each segmented token into.
char temstr[10000],word[10000];
//char buf[1000000];
// WordNum grows with each new entry in Word; ClassNum with each directory pushed to Class;
// PatNum counts the files DirectoryResearve keeps.
long int WordNum,ClassNum,PatNum=0;
vector<string> vDirs;
vector<string> vEachDirFiles;
fstream temfile; //output stream ReadOneFile writes the per-document word lists to
void ReadOneFile(string infilename,char *outfilename);
BOOL fn_bScanDirectory(char* lpszDir); //defined outside the visible part of this file
/******************************************************************************
FUNCTION fn_iBinarySearch(const vector<string>& vStr,string sKey)
Binary search for a string in a sorted string vector.

vStr : strings to search. Must already be sorted ascending in the order
       defined by std::string::compare; otherwise the result is meaningless.
sKey : string to look for.

Returns the index of an element equal to sKey, or -1 when no element matches
(an empty vStr always yields -1).
******************************************************************************/
int fn_iBinarySearch(const std::vector<std::string>& vStr,std::string sKey)
{
    if(vStr.empty())
        return -1;

    int iLow = 0;
    int iHigh = static_cast<int>(vStr.size())-1;
    while(iLow<=iHigh){
        // low+(high-low)/2 gives the same midpoint as (low+high)/2 but
        // cannot overflow int for very large vectors.
        const int iMid = iLow+(iHigh-iLow)/2;
        // Compare once per probe instead of once per branch.
        const int iCmp = vStr[iMid].compare(sKey);
        if(iCmp == 0)
            return iMid;
        if(iCmp<0)
            iLow = iMid+1;  // key lies in the upper half
        else
            iHigh = iMid-1; // key lies in the lower half
    }//while
    return -1;
}//fn_iBinarySearch
/******************************************************************************
FUNCTION ReadFile(char **FileStr,const char* lpszFilename)
Read a whole file (binary mode) into a newly allocated, NUL-terminated buffer.

FileStr      : out-parameter, must not be NULL. On return *FileStr points to a
               calloc'ed buffer holding the file bytes followed by '\0'. An
               empty file yields an empty string, so *FileStr is always valid.
               The caller owns the buffer and must release it with free().
lpszFilename : path of the file to read.

Returns 1 on success. If the file name is NULL/empty, the file cannot be
opened, or the buffer cannot be allocated, a message is printed to cout and
the process terminates with exit(-1).
******************************************************************************/
int ReadFile(char **FileStr,const char* lpszFilename)
{
    if(!lpszFilename||(strlen(lpszFilename)==0)){
        // Streaming a NULL char* is undefined behaviour, so print "" instead.
        cout<<"文件"<<(lpszFilename?lpszFilename:"")<<"不存在!"<<endl;
        exit(-1);
    }

    FILE* fp = fopen(lpszFilename,"rb");
    if(!fp){
        cout<<"文件"<<lpszFilename<<"打开错误!"<<endl;
        exit(-1);
    }//if

    // Determine the file size by seeking to the end and back.
    fseek(fp,0L,SEEK_END);
    long lSize = ftell(fp);
    fseek(fp,0L,SEEK_SET);
    if(lSize<0)
        lSize = 0; // ftell failed: hand back an empty string

    // Always allocate (even for an empty file) so the caller never sees an
    // unset pointer. calloc zero-fills, which also keeps the text
    // NUL-terminated if fread returns fewer bytes than requested.
    char* lpszFileText = (char*)calloc(lSize+1,sizeof(char));
    if(lpszFileText==NULL){
        fclose(fp);
        cout<<"映射文件失败!"<<endl;
        exit(-1);
    }
    if(lSize>0)
        fread(lpszFileText,sizeof(char),lSize,fp);
    fclose(fp);

    *FileStr=lpszFileText;
    return 1;
}//ReadFile
/******************************************************************************
FUNCTION ReadOneFile(string infilename,char *outfilename)
先读取一行,再读出该行中的词,
再对该行中的每一个词进行词性标注,还原,
再读取该文件
******************************************************************************/
void ReadOneFile(string infilename,char *outfilename)
{
vector<string> OneWord;
//文本开始标志
temfile<<"Begin"<<endl;
OneFile.clear();
char *source;
ReadFile(&source,infilename.c_str());
RAWTEXT.push_back(source);
ICTCLAS_FileProcess((char *)infilename.c_str(),"分词测试输出.txt");
fstream infile,outfile;
infile.open("分词测试输出.txt",ios::in);
if(!infile.is_open ()){
cout<<"分词测试输出.txt"<<" open error"<<endl;
exit(1);
}
string outfilename1="N_"+infilename+".out";
outfile.open(outfilename1.c_str(),ios::out);
//将该文本中的词放入到一个set中
infile>>word;
while(!infile.eof())
{
//若全是英文字母,不是停用词,且长度小于26,大于1
//就加入到词汇表中
if((unsigned char)word[0]>=0xb0 &&
(unsigned char)word[0]<=0xf7){
if( !StopWord.count(word) ){
//if( strstr(word,"/a")>0 ){
//如果计数为0,即不在其中,就进行插入
// int pos=strstr(word,"/a")-word;
// word[pos]='\0'; //去掉/a标识
if(!Word.count(word)){
Word.insert(word);
WordNum++;
}
OneFile.push_back(word);
//cout<<OneFile[0].c_str()<<endl;
//}
}
}
infile>>word;
}
infile.close();
//每一行纪录每篇文档的全部词序列
//WordMat.push_back(OneFile);
//输出处理后文档,以.out表示
for(j=0;j<OneFile.size();j++){
//outfile<<OneFile[j]<<" ";
temfile<<OneFile[j]<<" ";
//if((j%15)==14)
//outfile<<endl;
// temfile<<endl;
}
//outfile<<endl;
temfile<<endl;
//temfile<<infilename.c_str()<<endl;
//文本结束标志
temfile<<"[End]"<<endl<<endl;
}
/******************************************************************************
FUNCTION DirectoryResearve()
保留训练或测试集
******************************************************************************/
BOOL DirectoryResearve(char* lpszDir,int N){
char *infile;
string CurDir;
CurDir=lpszDir;
if(!lpszDir||(strlen(lpszDir)==0)){
return FALSE;
}
if(_access(lpszDir,0)==-1)
return FALSE;
WIN32_FIND_DATA fd;
HANDLE hFile;
string sDirFiles = lpszDir;
//if(lpszDir[strlen(lpszDir)-2]!='\\')
sDirFiles += "\\";
string sNoSuffixDir = sDirFiles;
sDirFiles += "*.*";
hFile = FindFirstFile(sDirFiles.c_str(),&fd);
BOOL bRet = TRUE;
file://Scan sub-directory under the directory specified by lpszDir
while(bRet&&(hFile!=INVALID_HANDLE_VALUE)){
if((strcmp(fd.cFileName,".")==0)||(strcmp(fd.cFileName,"..")==0)){
bRet = FindNextFile(hFile,&fd);
continue;
}//if
//如果是类别
if(fd.dwFileAttributes == FILE_ATTRIBUTE_DIRECTORY){
string sDir = fd.cFileName;
CurDir+="\\";
CurDir+=sDir;
//存储所有层次目录
string CurDir1=CurDir;
CurDir1+="\\";
Class.push_back(CurDir1);
ClassNum++;
DirectoryResearve((char* )CurDir.c_str(),N);
}
//如果是文件
else{
string sFullFilePath = sNoSuffixDir;
sFullFilePath += fd.cFileName;
CurDocId++;
//处理该文件
if(PatNum<2000 && ((CurDocId%2)==N) ){
PatNum++;
}
else{
remove(sFullFilePath.c_str());
}
cout<<CurDocId<<endl;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -