// eigenvectorselect.cpp
/***************************************************************
* 工程: 自然语言处理综合系统
* 作者: 李赟(liyun@nlu.caai.cn)
* 修改者 李卫
* 描述: 特征抽取的部分实现
* 主要函数:IDFEigentVector 、FindClassNames 、CHIEigentVector
G_UpdateKeyWords、G_UpdateIDVector、G_UpdateDocs
G_LoadKeyWords 等
* 版本: 1.0
* 修改: 遍历文件夹下所有目录内文件
* 参考文献:
**************************************************************/
#include "EigenVectorSelect.h"
//-----------------------------------------------------------//
// Purpose: Document-frequency (DF) feature selection for a single class;
//          the selected features are written out as words.
// Parameters:
//   (in)  classname   name of the class to process
//         kwdnum      maximum number of features to select for this class
//         vecfiledir  directory that holds the class files
//         input file:  vecfiledir+"\\"+classname+".vec"
//                      (total document count, then "word doc-count" pairs)
//   (out) allwords    global word table used later for numbering; every
//                     selected word is inserted with value 0 (entries that
//                     already exist keep their value)
//         output file: vecfiledir+"\\df\\"+classname+".vei"
//                      (total document count, then "word doc-count" lines)
// Returns: true on success; false when either file cannot be opened, or
//          when the input holds a zero document count or no word entries.
// Approach: words are grouped by document count, the groups are visited
//          from the highest count down, and the first kwdnum words are kept.
//          Words sharing a count keep their input-file order.
// Usage:   free (global) function
// Date:    March 2006
//----------------------------------------------------------//
bool IDFEigentVector(const std::string &classname,std::map<std::string,unsigned int> &allwords,unsigned int kwdnum,const std::string& vecfiledir){
    // Sorting table: key = document count (descending), value = words having that count.
    typedef std::map<unsigned int,std::vector<std::string>,std::greater<unsigned int> > DfTable;

    // Feature list before selection.
    const std::string ifilename = vecfiledir+"\\"+classname+".vec";
    // Feature list after selection.
    const std::string ofilename = vecfiledir+"\\df\\"+classname+".vei";

    std::ifstream fin(ifilename.c_str());
    std::ofstream fout(ofilename.c_str());
    if(!fin || !fout){
        std::cout<<"vec/eivfile can not open "<<std::endl;
        return false;
    }

    // Initialised so a failed header read is reported as "no documents"
    // instead of reading an indeterminate value.
    unsigned int totaldocnum = 0;
    fin>>totaldocnum;

    // Load every "word doc-count" pair into the sorting table.
    DfTable wordmap;
    unsigned int count = 0;
    std::string word;
    unsigned int docnum = 0;
    while(fin>>word>>docnum){
        ++count;
        wordmap[docnum].push_back(word);
    }
    fin.close();

    if(totaldocnum==0 || count==0){
        return false;
    }

    // First output line: total document count.
    fout<<totaldocnum<<std::endl;

    // Emit the top-DF words. The quota stays unsigned (no narrowing to int)
    // and both loops stop as soon as it is used up.
    unsigned int remaining = kwdnum;
    for(DfTable::const_iterator itc = wordmap.begin();
        remaining>0 && itc!=wordmap.end(); ++itc){
        const std::vector<std::string>& words = itc->second;
        for(std::vector<std::string>::const_iterator itcvs = words.begin();
            remaining>0 && itcvs!=words.end(); ++itcvs){
            // Register the word in the table used to generate feature ids.
            allwords.insert(std::pair<std::string,unsigned int>(*itcvs,0));
            fout<<*itcvs<<" "<<itc->first<<std::endl;
            --remaining;
        }
    }
    fout.close();
    return true;
}
//-----------------------------------------------------------//
// 功能: 寻找某目录包含的训练类别名称
// 参数:
// (入口)const string& vecfiledir 类别文件路径
// const string& extstr 扩展名
// 文件输入vecfiledir目录下的*.extstr,(文件名)
// (出口) set<string>& namevec 返回类别名称
// 用于编号的总词表,执行该函数时更新
// 返回: 找到的类别数,错误返回0
// 主要思路:所有训练类别按“类别名称.特定扩展名”形式存放在在一个目录下,
// 根据指定文件后缀寻找文件名(不包含扩展名)
// 调用方法: 全局函数
// 日期: 2006年3月
//----------------------------------------------------------//
unsigned short FindClassNames(set<string>& namevec,const string& vecfiledir,const string& extstr){
string FileNameTmp="";
unsigned short Filecount =0;
namevec.clear();
long hFile;
string tmpName1=vecfiledir+"\\*."+extstr;
struct _finddata_t TrainFile;
if((hFile = _findfirst(tmpName1.c_str(), &TrainFile )) == -1L){
// cout<<"路径无法访问!"<<endl;
return 0;
}
do
{
FileNameTmp=TrainFile.name;
if(FileNameTmp=="." || FileNameTmp=="..") continue;
int findpoint;
if((findpoint = FileNameTmp.rfind("."))<=0
||FileNameTmp.substr(findpoint,FileNameTmp.size()-findpoint) !=string(string(".")+extstr)
){
continue;
}
FileNameTmp = FileNameTmp.substr(0,findpoint);
namevec.insert(FileNameTmp);
Filecount++;
// printf(">");
}while(! _findnext( hFile, &TrainFile ) );
_findclose( hFile );
return Filecount;
}
//-----------------------------------------------------------//
// 功能: CHI特征抽取并以词的形式保存特征文件(针对所有类别)
// 参数:
// (入口)const set<string>& namevec,当前类别名称列表
// unsigned int kwdnum 该类别需要抽取的的特征数
// const string& vecfiledir 类别文件存放路径
// 文件输入vecfiledir+"\\"+(*itc)+".vec"
// (出口) map<string,unsigned int> &allwords,
// 用于编号的总词表,执行该函数时更新
// 文件输出vecfiledir+"\\chi\\"+evevector[k].classname+".vei";
// vecfiledir + "\\chi\\_all_words.lst";
// vecfiledir + "\\_all_version.lst";
// 返回: 正常true 错误false
// 主要思路:CHI特征抽取
// 调用方法:全局函数
// 日期: 2006年3月
//----------------------------------------------------------//
bool CHIEigentVector(const set<string>& namevec,map<string,unsigned int> &allwords,unsigned int kwdnum,const string& vecfiledir){
unsigned int allclassdocnum =0;
vector<EVenty> evevector;
//遍历所有类别未抽取的特征列表文件,并记录到vector<EVenty>中
//每个EVenty对应一个类别。内部以特征的音序升序排列
for(set<string>::const_iterator itc=namevec.begin();itc!=namevec.end();itc++){
string ifilename = vecfiledir+"\\"+(*itc)+".vec";
ifstream fin(ifilename.c_str());
if(!fin){
cout<<"vecfile can not open "<<endl;
continue;
}
evevector.push_back(EVenty());
vector<EVenty>::reverse_iterator iteve = evevector.rbegin();
iteve->classname = *itc;
fin>>iteve->totalword;
allclassdocnum += iteve->totalword;
pair<unsigned int,string> wordpairtmp;
unsigned int count =0;
while(fin>>wordpairtmp.second>>wordpairtmp.first){
count++;
iteve->wordspair.push_back(wordpairtmp);
}
cout<<"class:"<<iteve->classname<<"wordnum"<<count;
fin.close();
}
if(evevector.size() <= 1){
cout<<"not enough classes"<<endl;
return false;
}else{
cout<<"classes num "<<evevector.size()<<"totaldocnum"<<allclassdocnum<<endl;
//getchar();
}
unsigned int count =0;
string minword ="",minword_old="";
//根据CHI的要求生成ABCD4个值(chivalue[0-3])并计算chivalue的相关值
//由于计算牵涉到vector<EVenty>中多个EVenty(多个类别的有序的特征列表),
//这里采用了最小词对齐方法,每次从多个EVenty的当前词中选取一最小的计算
//下次去掉已经计算过的词,再重复上面的步骤,直到所有EVenty都处理完
while(1){
for(unsigned int i=0;i<evevector.size();i++){
//该evevector已到结尾
if(evevector[i].isstop == true) continue;
//更新chivalue后已计算过的最小特征词,开始下一轮选最小词
if(minword_old ==evevector[i].wordspair[evevector[i].curindex].second) { //count the result
double chivalue =
evevector[i].chivalue[0] * evevector[i].chivalue[4]
- evevector[i].chivalue[2] * evevector[i].chivalue[3] ;
// do sth
if(chivalue >0 && evevector[i].chivalue[0] >3){
//分母
double chivalue2 =
(evevector[i].chivalue[0] + evevector[i].chivalue[2])
*(evevector[i].chivalue[1] + evevector[i].chivalue[3])
*(evevector[i].chivalue[0] + evevector[i].chivalue[1])
*(evevector[i].chivalue[2] + evevector[i].chivalue[3]) ;
if(chivalue2 == 0) chivalue2 = 1;
//参见CHI相关公式
double chivalue3 = allclassdocnum * chivalue * chivalue / chivalue2 ;
chivalue3 *= log((float)evevector[i].chivalue[0]);
//chivalue3 *= evevector[i].chivalue[0];
//if(chivalue3 >1e+13)
//cout<<minword_old<<" "
// <<evevector[i].classname<<evevector[i].chivalue[0]<<" "<<
// evevector[i].chivalue[1]<<" "<<evevector[i].chivalue[2]<<" "<<evevector[i].chivalue[3]<<" "
//<<chivalue<<" "<<chivalue2<<" "<<chivalue3<<endl;
//MAP排序表,形式为key权重 value对应的词列表,按Key从大到小排序
map<double,vector<string>,greater<double> > ::iterator itmvd =
evevector[i].wordmap.find(chivalue3);
if(itmvd==evevector[i].wordmap.end()){
pair<double,vector<string> > tmpwordpair;
tmpwordpair.first = chivalue3;
tmpwordpair.second.push_back(minword_old);
evevector[i].wordmap.insert(tmpwordpair);
}else{
itmvd->second.push_back(minword_old);
}
}//end of if(chivalue >0 && evevector[i].chivalue[0] >3)...
//切换到下一个特征,开始下一轮选最小词
if(evevector[i].curindex >=evevector[i].wordspair.size()-1){
evevector[i].isstop = true;
continue;
}else{
evevector[i].curindex ++;
}
}//end of minword_old ==evevector[i].w ....
//look for min word
//如果找到更小的最小词,则更新
evevector[i].chivalue[0] =evevector[i].chivalue[1] =evevector[i].chivalue[2] =evevector[i].chivalue[3] =0;
if(minword == "" || minword > evevector[i].wordspair[evevector[i].curindex].second) {
minword = evevector[i].wordspair[evevector[i].curindex].second;
}
} //end of for
// no more words of all entrys
//while1的结束条件,没有特征需要处理
if(minword_old == minword){
cout<<"search end ,total wordnum ="<<count<<endl;
break; //break from while 1
}else{
//清空最小词,开始下一轮寻找
minword_old = minword;
minword = "";
count ++;
//cout<<minword_old<<endl;
//cout<<">";
}
for(unsigned int i2=0;i2<evevector.size();i2++){
unsigned int docinclass =0;
if(minword_old == evevector[i2].wordspair[evevector[i2].curindex].second)
docinclass = evevector[i2].wordspair[evevector[i2].curindex].first;
for(unsigned int j=0;j<evevector.size();j++){
evevector[j].chivalue[(i2 ==j)?0:1] += docinclass;
evevector[j].chivalue[(i2 ==j)?2:3] += (evevector[i2].totalword - docinclass);
}
}
}//end of while 1
//根据排序输出文件结果并更新排序编号特征列表
for(unsigned int k=0;k<evevector.size();k++){
string ofilename = vecfiledir+"\\chi\\"+evevector[k].classname+".vei";
ofstream fout(ofilename.c_str());
if(!fout){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -