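// precnhierdata.cpp
// (The listing was extracted from a web code viewer; its "font size" control label is not part of the source.)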
bRet = FindNextFile(hFile,&fd);
//若是横向遍历,则当前根目录不变
CurDir=lpszDir;
}//while
FindClose(hFile);
return TRUE;
}
/******************************************************************************
FUNCTION fn_bScanDirectory()
Recursively scans the directory tree rooted at lpszDir.

For each sub-directory found, its path (with a trailing backslash) is appended
to the global Class list, ClassNum is incremented, and the sub-directory is
scanned recursively. For each file found, its full path is appended to
vEachDirFiles, the file is passed to ReadOneFile(), the containing directory
(with trailing backslash) is appended to PatClass, and PatNum and CurDocId are
incremented.

Parameters:
  lpszDir - directory to scan, given without a trailing backslash.
  outfile - forwarded unchanged to ReadOneFile() for every file, including
            files in sub-directories.

Returns FALSE when lpszDir is null, empty, or not accessible; TRUE otherwise
(including when the directory search itself cannot be started).

NOTE(review): the code uses strcmp()/std::string on fd.cFileName, so it
presumably targets an ANSI (non-UNICODE) build - confirm project settings.
******************************************************************************/
BOOL fn_bScanDirectory(char* lpszDir,char *outfile){
    // Validate the pointer before building any std::string from it:
    // constructing a string from a null char* is undefined behaviour.
    if(!lpszDir||(strlen(lpszDir)==0)){
        return FALSE;
    }
    if(_access(lpszDir,0)==-1)
        return FALSE;

    // Directory path with a trailing separator. It prefixes every file path
    // and is also recorded as the class label of files directly inside it.
    string sNoSuffixDir = lpszDir;
    sNoSuffixDir += "\\";

    // Search pattern matching every entry of the directory.
    string sDirFiles = sNoSuffixDir;
    sDirFiles += "*.*";

    WIN32_FIND_DATA fd;
    HANDLE hFile = FindFirstFile(sDirFiles.c_str(),&fd);
    if(hFile==INVALID_HANDLE_VALUE){
        // Nothing to enumerate; do not hand an invalid handle to FindClose().
        return TRUE;
    }

    // Visit every entry of the directory specified by lpszDir.
    BOOL bRet = TRUE;
    while(bRet){
        const bool isDotEntry =
            (strcmp(fd.cFileName,".")==0)||(strcmp(fd.cFileName,"..")==0);

        if(!isDotEntry){
            // dwFileAttributes is a bitmask: test the directory bit rather
            // than comparing for equality, otherwise directories that carry
            // any additional attribute would be processed as files.
            if(fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY){
                // Entry is a category (sub-directory).
                string subDir = lpszDir;
                subDir += "\\";
                subDir += fd.cFileName;

                // Record every level of the directory hierarchy.
                Class.push_back(subDir + "\\");
                ClassNum++;

                // Descend, forwarding the caller's outfile so nested files
                // are handled the same way as top-level ones.
                fn_bScanDirectory(const_cast<char*>(subDir.c_str()),outfile);
            }
            else{
                // Entry is a file.
                string sFullFilePath = sNoSuffixDir;
                sFullFilePath += fd.cFileName;
                vEachDirFiles.push_back(sFullFilePath);
                cout<<CurDocId<<" "<<endl;

                // Process the file, then record which directory it belongs to.
                ReadOneFile(sFullFilePath,outfile);
                PatClass.push_back(sNoSuffixDir);
                PatNum++;
                CurDocId++;
            }
        }

        // Move on to the next sibling entry of the same directory.
        bRet = FindNextFile(hFile,&fd);
    }

    FindClose(hFile);
    return TRUE;
}
// Program entry point.
//
// Pipeline, as implemented below:
//   1. Initialise the ICTCLAS word segmenter and load stop words from
//      "stoplist.txt" (lower-cased) into the global StopWord container.
//   2. Scan the corpus directory DirFile with fn_bScanDirectory(); while it
//      runs, the global temfile is open on "MatFile.txt".
//   3. Write one class label per document (<DirFile>.mat.rlabel), the sorted
//      vocabulary (<DirFile>.mat.clabel), the sparse term-frequency matrix
//      (<DirFile>.mat) and the directory hierarchy (<DirFile>.mat.hlabel).
//   4. Split RAWTEXT / PatClass by document index parity: odd indices go to
//      the ".Train" files, even indices to the ".Test" files.
//
// Loop counters i and j, the buffers str and word, the iterator it, intvec and
// all containers/counters used here are declared outside this function.
// argc/argv are unused.
int main(int argc, char* argv[])
{
    fstream stopfile,outfile,
            clablefile,rlablefile,hlablefile,prefile1,prefile2,
            topicfile1,topicfile2;
    vector<string> vec;
    string DirFile="手机-adj",MatFile,CLableFile,RLableFile,HLableFile,Root,
           PreFile1,PreFile2,TopicFile1,TopicFile2;

    MatFile=DirFile+".mat";
    CLableFile=DirFile+".mat.clabel";
    RLableFile=DirFile+".mat.rlabel";
    HLableFile=DirFile+".mat.hlabel";
    PreFile1=DirFile+".Train.txt";
    TopicFile1=DirFile+".Train.topic.txt";
    PreFile2=DirFile+".Test.txt";
    TopicFile2=DirFile+".Test.topic.txt";

    WordMat.clear();
    CVweight.clear();
    CVterm.clear();
    PatClass.clear();

    // Initialise the word-segmentation module.
    ICTCLAS_Init();
    ICTCLAS_SetOperType(1);

    // Load the stop-word list first.
    // Looping on the extraction itself (instead of on eof()) terminates when
    // the file cannot be opened and also keeps the final word when the file
    // does not end with whitespace.
    // NOTE(review): str is declared outside this function; the extraction is
    // unbounded, so an over-long token could overflow it - confirm its size.
    stopfile.open("stoplist.txt",ios::in);
    while(stopfile>>str){
        // strlwr() lower-cases in place; no self-copy is needed.
        strlwr(str);
        // Insert only when the word is not stored yet.
        if(!StopWord.count(str)){
            StopWord.insert(str);
        }
    }
    stopfile.close();

    // Record the root of the directory hierarchy.
    Root=DirFile;
    Root += "\\";
    Class.push_back(Root);
    ClassNum++;

    // temfile is open while the directory is scanned. The original comment
    // states that Begin and End markers delimit each document in it.
    temfile.open("MatFile.txt",ios::out);

    // Read the corpus directory by directory. A writable buffer is passed
    // because the parameter is a non-const char*.
    char noOutfile[] = " ";
    fn_bScanDirectory(const_cast<char*>(DirFile.c_str()),noOutfile);
    temfile.close();

    // Write the class (row label) file: one entry per document.
    rlablefile.open(RLableFile.c_str(),ios::out);
    for(i=0;i<PatNum;i++){
        rlablefile<<PatClass[i]<<endl;
    }
    rlablefile.close();

    // Copy the vocabulary into a vector and sort it so that
    // fn_iBinarySearch() can be used for lookups below.
    it=Word.begin();
    for(;it!=Word.end();++it){
        vec.push_back(*it);
    }
    sort(vec.begin(),vec.end(),less<string>());

    // Write the word (column label) file.
    clablefile.open(CLableFile.c_str(),ios::out);
    for(i=0;i<WordNum;i++){
        clablefile<<vec[i].c_str()<<endl;
    }
    clablefile.close();

    // Convert the token stream into a term-frequency matrix.
    // Per-document frequency counters, zero-initialised; the vector releases
    // its storage automatically (the malloc'd buffer was never freed).
    vector<long int> WordFreq(static_cast<size_t>(WordNum),0L);
    long int AllItem=0;   // total number of non-zero matrix entries

    fstream temfile1;
    temfile1.open("MatFile.txt",ios::in);
    temfile1>>word;
    for(i=0;i<PatNum;i++){
        CVterm.push_back(intvec);
        CVweight.push_back(intvec);
        temfile1>>word;
        // Consume tokens until the "[End]" marker. Stopping when extraction
        // fails prevents an endless loop on a truncated file.
        while(strcmp(word,"[End]")!=0 && (temfile1>>word)){
            // Look up the index of the current word in the sorted vocabulary.
            long int no=fn_iBinarySearch(vec,word);
            if(no>=0 && no<static_cast<long int>(WordFreq.size())){
                WordFreq[no]++;
            }
        }
        // Move the non-zero counts into the sparse row for document i and
        // reset the counters for the next document.
        for(j=0;j<WordNum;j++){
            if(WordFreq[j]>0){
                CVterm[i].push_back(j);
                CVweight[i].push_back(WordFreq[j]);
                WordFreq[j]=0;
                AllItem++;
            }
        }
    }
    temfile1.close();

    // Write the term-frequency file: header line, then one line per document.
    outfile.open(MatFile.c_str(),ios::out);
    outfile<<PatNum<<" "<<WordNum<<" "<<AllItem<<endl;
    for(i=0;i<PatNum;i++){
        // A row is written only when the document is non-empty.
        // NOTE(review): the header and the rlabel file still count empty
        // documents - confirm the consumer tolerates the missing rows.
        if(CVterm[i].size()>0){
            for(j=0;j<CVterm[i].size();j++){
                // Term ids are 1-based in the output ("han" format according
                // to the original comment).
                outfile<<CVterm[i][j]+1<<" "<<CVweight[i][j]<<" ";
            }
            outfile<<endl;
        }
    }
    outfile.close();

    // Write the hierarchy (directory) label file.
    hlablefile.open(HLableFile.c_str(),ios::out);
    for(i=0;i<Class.size();i++){
        hlablefile<<Class[i]<<endl;
    }
    hlablefile.close();

    ICTCLAS_Exit();

    // Write the processed text: even document indices go to the ".Test" file,
    // odd indices to the ".Train" file. Entries are separated by a blank line
    // and each one is terminated by an "[End]" line.
    prefile1.open(PreFile1.c_str(),ios::out);
    prefile2.open(PreFile2.c_str(),ios::out);
    for(i=0;i<PatNum;i++){
        if(i % 2 ==0){
            // Output only when the document is non-empty.
            if(RAWTEXT[i].size()>0){
                if(i/2>0)
                    prefile2<<endl<<endl;
                prefile2<<RAWTEXT[i];
                prefile2<<endl<<"[End]";
            }
        }
        else{
            // Output only when the document is non-empty.
            if(RAWTEXT[i].size()>0){
                if(i/2>0)
                    prefile1<<endl<<endl;
                prefile1<<RAWTEXT[i];
                prefile1<<endl<<"[End]";
            }
        }
    }
    prefile1.close();
    prefile2.close();

    // Write the topic files with the same parity split. The first entry of
    // each file has no leading newline; later entries are newline-separated.
    topicfile1.open(TopicFile1.c_str(),ios::out);
    topicfile2.open(TopicFile2.c_str(),ios::out);
    for(i=0;i<PatNum;i++){
        if(i % 2 ==0){
            if(i/2==0){
                topicfile2<<PatClass[i];
            }
            else{
                topicfile2<<endl<<PatClass[i];
            }
        }
        else{
            if(i/2==0){
                topicfile1<<PatClass[i];
            }
            else{
                topicfile1<<endl<<PatClass[i];
            }
        }
    }
    topicfile1.close();
    topicfile2.close();

    return 0;
}
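// (Residue from the web code viewer; not part of the program.)
// Keyboard shortcuts: copy code - Ctrl + C; search code - Ctrl + F;
// full-screen mode - F11; toggle theme - Ctrl + Shift + D; show shortcuts - ?;
// increase font size - Ctrl + =; decrease font size - Ctrl + -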