📄 eigenvectorselect.cpp
字号:
cout<<"vec/eivfile can not open "<<endl;
continue;
}
fout<<evevector[k].totalword<<endl;
int kwdnum_count = kwdnum;
for(map<double,vector<string>,greater<double> >::const_iterator
itc2=evevector[k].wordmap.begin();itc2!=evevector[k].wordmap.end();itc2++){
//cout<<itc->first<<" ";
for(vector<string>::const_iterator itcvs =itc2->second.begin();itcvs!=itc2->second.end();itcvs++){
if(kwdnum_count-- >0){
// fout<<itc->second<<" "<<itc->first/totaldocnum<<endl;
//cout<<*itcvs<<" ";
allwords.insert(pair<string,unsigned int>(*itcvs,0));
fout<<*itcvs<<" "<<itc2->first<<endl;
}else{
break;
}
}
//cout<<endl;
if(kwdnum_count<=0) break;
}
fout.close();
}
string totalname = vecfiledir + "\\chi\\_all_words.lst";
ofstream ftotal(totalname.c_str());
if(!ftotal){
cout<<"无法创建总词库文件!"<<endl;
}else{
//创建更新版本信息
string versionfile = vecfiledir + "\\_all_version.lst";
string tmpversion="";
ifstream fversion(versionfile.c_str());
if(fversion){
fversion>>tmpversion;
fversion.close();
//版本同步
ftotal<<allwords.size()<<" "<<tmpversion<<endl;
}else{
ofstream fversion_o(versionfile.c_str());
if(fversion_o){
//生成新的版本号,用一个变量的地址作版本号
fversion_o<<"#ver"<<&tmpversion<<endl;
fversion_o.close();
}
ftotal<<allwords.size()<<" #ver"<<&tmpversion<<endl;
}
//输出编号特征列表及对应编号
unsigned int wordindex=0;
for(map<string,unsigned int>::iterator itca=allwords.begin();itca!=allwords.end();itca++){
ftotal<<itca->first<<endl;
itca->second = wordindex++;
}
ftotal.close();
}
return true;
}
//-----------------------------------------------------------//
// Purpose: Run the selected feature-selection method over every
//          class and refresh the global word table.
// Parameters:
//   (in)  const string& dicextname   abbreviation of the feature-selection
//                                    method; "df" and "chi" are handled,
//                                    any other value selects nothing
//         unsigned int kwdnum        number of features to extract per class
//         const string& vecfiledir   directory holding the class files; the
//                                    class names come from
//                                    FindClassNames(namevec, vecfiledir, "vec")
//   (in/out) map<string,unsigned int>& allwords
//                                    global word table used for numbering.
//                                    It is NOT cleared first. In the "df"
//                                    branch every entry is renumbered
//                                    0..n-1 in key order.
// Files ("df" branch):
//   writes vecfiledir + "\\df\\_all_words.lst"
//   reads  vecfiledir + "\\_all_version.lst"; (re)creates it when it is
//          missing or contains no version token
// "chi" branch: all work is delegated to CHIEigentVector.
// Returns: allwords.size(); 0 when the "df" word list cannot be created.
// Called as: free function
// Date: March 2006
//----------------------------------------------------------//
unsigned int G_UpdateKeyWords(const string& dicextname,map<string,unsigned int> &allwords,unsigned int kwdnum,const string& vecfiledir){
    set<string> namevec;
    // Collect the names of all classes.
    FindClassNames(namevec,vecfiledir,"vec");

    if(dicextname == "df"){
        // Per-class extraction; each call is handed the shared word table.
        for(set<string>::const_iterator itc=namevec.begin();itc!=namevec.end();++itc){
            IDFEigentVector(*itc,allwords,kwdnum,vecfiledir);
        }

        const string totalname = vecfiledir + "\\df\\_all_words.lst";
        const string versionfile = vecfiledir + "\\_all_version.lst";
        ofstream ftotal(totalname.c_str());
        if(!ftotal){
            cout<<"无法创建总词库文件!"<<endl;
            return 0;
        }

        // Read the current version token, if any. The reader is scoped so the
        // file is closed again before it may be rewritten below.
        string tmpversion;
        {
            ifstream fversion(versionfile.c_str());
            if(fversion){
                fversion>>tmpversion;
            }
        }

        ftotal<<allwords.size()<<" ";
        if(!tmpversion.empty()){
            // Keep the word list in sync with the existing version.
            ftotal<<tmpversion<<endl;
        }else{
            // No usable version (file missing, or present but without a
            // token). An empty token must never reach the header line:
            // G_LoadKeyWords reads "<count> <version>" and would otherwise
            // consume the first word as the version. Generate a fresh one;
            // as before, the address of a local variable serves as the id.
            ofstream fversion_o(versionfile.c_str());
            if(fversion_o){
                fversion_o<<"#ver"<<&tmpversion<<endl;
            }
            ftotal<<"#ver"<<&tmpversion<<endl;
        }

        // Emit the words in key order and assign each its line index.
        unsigned int wordindex=0;
        for(map<string,unsigned int>::iterator itca=allwords.begin();itca!=allwords.end();++itca){
            ftotal<<itca->first<<endl;
            itca->second = wordindex++;
        }
        ftotal.close();
    }else if(dicextname == "chi"){
        // CHI has a dedicated routine that handles all classes at once.
        // Its result was never consulted, so it is intentionally not stored.
        CHIEigentVector(namevec,allwords,kwdnum,vecfiledir);
    }
    return allwords.size();
}
//-----------------------------------------------------------//
// Purpose: Re-read the word list at start-up or after switching
//          the feature-selection method.
// Parameters:
//   (in)  const string& dicextname   abbreviation of the feature-selection
//                                    method (names the sub-directory)
//         unsigned int kwdnum        not used by this function
//         const string& vecfiledir   directory holding the class files
//   Files read:
//         vecfiledir + "\\_all_version.lst"
//         vecfiledir + "\\" + dicextname + "\\_all_words.lst"
//   (out) map<string,unsigned int>& allwords
//                                    cleared, then filled with word -> index
//                                    in file order
// Returns: number of word tokens read from the list; 0 when either
//          file cannot be opened.
// Note:    a version mismatch only prints a warning and loading
//          continues (the early return for that case is disabled in
//          the code). The original header refers to
//          ssps.UpdateAllData() for the update needed in that case.
// Called as: free function
// Date: March 2006
//----------------------------------------------------------//
unsigned int G_LoadKeyWords(const string& dicextname,map<string,unsigned int> &allwords,unsigned int kwdnum,const string& vecfiledir){
    allwords.clear();

    const string listpath = vecfiledir + "\\" + dicextname+ "\\_all_words.lst";
    const string verpath = vecfiledir + "\\_all_version.lst";

    ifstream listin(listpath.c_str());
    if(!listin){
        cout<<"无法打开关键词文件!"<<endl;
        return 0;
    }

    // Header line: "<declared word count> <version>". The declared count
    // is parsed to advance the stream but is not checked.
    int declared = 0;
    string listversion;
    listin>>declared>>listversion;

    ifstream verin(verpath.c_str());
    if(!verin){
        cout<<"版本无法确认,需要更新训练信息"<<endl;
        return 0;
    }
    string diskversion;
    verin>>diskversion;
    verin.close();
    if(!(listversion == diskversion)){
        // Warn only; the list is still loaded.
        cout<<"版本太旧,需要更新训练信息"<<endl;
    }

    // Every remaining token is a word; its index is its position in the file.
    unsigned int loaded = 0;
    for(string token; listin>>token; ++loaded){
        allwords.insert(map<string,unsigned int>::value_type(token,loaded));
    }
    return loaded;
}
//-----------------------------------------------------------//
// Purpose: Vectorise the training or test documents.
// Parameters:
//   (in)  const string& dicextname   abbreviation of the feature-selection
//                                    method (names the output sub-directory)
//         const string& vecfiledir   directory holding the class files; the
//                                    class names come from
//                                    FindClassNames(namevec, vecfiledir, "tmw")
//         const map<string,unsigned int>& allwords
//                                    word -> index table (read only)
//   File input:  vecfiledir + "\\" + <class> + ".tmw"
//                each line: a leading number followed by
//                "<word> <number>" pairs
//   File output: vecfiledir + "\\" + dicextname + "\\" + <class> + ".tmi"
//                one line per input line: the leading number, then
//                " <index>:<number>" for every word found in allwords;
//                words not in the table are dropped
// Returns: number of classes for which both files could be opened.
// Note:    the output file is opened (and therefore truncated) before
//          the input is checked, so a class whose .tmw cannot be read
//          ends up with an empty .tmi.
// Called as: free function
// Date: March 2006
//----------------------------------------------------------//
unsigned int G_UpdateDocs(const string& dicextname,const map<string,unsigned int> &allwords,const string& vecfiledir){
    set<string> namevec;
    FindClassNames(namevec,vecfiledir,"tmw");

    unsigned int updatedclasscount = 0;
    for(set<string>::const_iterator itc=namevec.begin();itc!=namevec.end();++itc){
        const string fromname = vecfiledir + "\\"+(*itc)+".tmw";
        const string toname = vecfiledir + "\\" + dicextname + "\\" +(*itc)+".tmi";
        ifstream f_from(fromname.c_str());
        ofstream f_to(toname.c_str());
        if(!f_to){
            cout<<"无法创建"<<*itc<<"的向量文件!"<<endl;
            continue;
        }
        if(!f_from){
            // Report the actual problem: it is the input that is unreadable.
            cout<<"无法打开"<<*itc<<"的tmw文件!"<<endl;
            continue;
        }
        updatedclasscount++;

        string linetmp;
        while(getline(f_from,linetmp)){
            stringstream sstmp(linetmp);
            // Initialised so that a line without a leading number yields 0
            // instead of an indeterminate value on pre-C++11 libraries.
            unsigned int wordcount = 0;
            sstmp>>wordcount;
            f_to<<wordcount;

            string wordtmp;
            while(sstmp>>wordtmp>>wordcount){
                const map<string,unsigned int>::const_iterator itcm = allwords.find(wordtmp);
                if(itcm!=allwords.end()){
                    f_to<<" "<<itcm->second<<":"<<wordcount;
                }
            }
            // '\n' rather than endl: no need to flush after every document.
            f_to<<'\n';
        }
        f_from.close();
        f_to.close();
    }
    return updatedclasscount;
}
//-----------------------------------------------------------//
// Purpose: Write, for every class, the indexed form of its .vec
//          file (per the original header: the number of documents
//          in which each feature occurs within the class).
// Parameters:
//   (in)  const string& dicextname   abbreviation of the feature-selection
//                                    method (names the output sub-directory)
//         const string& vecfiledir   directory holding the class files; the
//                                    class names come from
//                                    FindClassNames(namevec, vecfiledir, "vec")
//         map<string,unsigned int>& allwords
//                                    word -> index table; only looked up here
//   File input:  vecfiledir + "\\" + <class> + ".vec"
//                a leading integer followed by "<word> <number>" pairs
//   File output: vecfiledir + "\\" + dicextname + "\\_all_ids.lst"
//                one line per class: "<class> <leading integer>" then
//                " <index>:<number>" for every word found in allwords
// Returns: 1 on success, 0 when the output file cannot be created.
//          Classes whose .vec file cannot be opened are skipped.
// Called as: free function
// Date: March 2006
//----------------------------------------------------------//
unsigned int G_UpdateIDVector(const string& dicextname,map<string,unsigned int> &allwords,const string& vecfiledir){
    set<string> classes;
    FindClassNames(classes,vecfiledir,"vec");

    const string outpath = vecfiledir+"\\"+dicextname+"\\_all_ids.lst";
    ofstream idsout(outpath.c_str());
    if(!idsout){
        cout<<"vei file can not open "<<endl;
        return 0;
    }

    for(set<string>::const_iterator cls=classes.begin();cls!=classes.end();++cls){
        const string& classname = *cls;
        const string inpath = vecfiledir+"\\"+classname+".vec";
        ifstream vecin(inpath.c_str());
        if(!vecin){
            cout<<"vec file can not open "<<endl;
            continue;
        }

        // Line prefix: class name and the file's leading integer.
        int total = 0;
        vecin>>total;
        idsout<<classname<<" "<<total;

        // Remaining content: "<word> <number>" pairs; unknown words are skipped.
        string word;
        unsigned int number = 0;
        while(vecin>>word>>number){
            const map<string,unsigned int>::const_iterator hit = allwords.find(word);
            if(hit==allwords.end()){
                continue;
            }
            idsout<<" "<<hit->second<<":"<<number;
        }
        vecin.close();
        idsout<<endl;
    }
    idsout.close();
    return 1;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -