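// new_ssps.cpp  (file title taken from the code-viewer page; the viewer's "font size:" label followed it and is not part of the source)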
{
if(!CIndexcom[z].WCount) continue;
fputc(z,fwptr);
fwrite(&CIndexcom[z].WCount,2,1,fwptr);//count
NWords+=CIndexcom[z].WCount;
offset=ftell(fword);
fwrite((void *)&offset,4,1,fwptr);//offset
Tptr=CIndexcom[z].WList;
while(Tptr)
{
fputc(Tptr->Len,fword);
fwrite(Tptr->sCIYU,Tptr->Len,1,fword);
fwrite(Tptr->pos,8,1,fword);
freep=Tptr;
Tptr=Tptr->ptrNext;
free(freep->sCIYU);
free(freep->pos);
free(freep);
}//end of while
CIndexcom[z].WList = NULL;
CIndexcom[z].WCount = 0;
}//end of double for;
for(int x=0;x<94;x++)
for(int y=0;y<94;y++)
{
if(!WIndexcom[x][y].WCount) continue;
fputc(x+0xa1,fwptr);
fputc(y+0xa1,fwptr);//Chinese ch
fwrite(&WIndexcom[x][y].WCount,2,1,fwptr);//count
NWords+=WIndexcom[x][y].WCount;
offset=ftell(fword);
fwrite((void *)&offset,4,1,fwptr);//offset
Tptr=WIndexcom[x][y].WList;
while(Tptr)
{
fputc(Tptr->Len,fword);
fwrite(Tptr->sCIYU,Tptr->Len,1,fword);
fwrite(Tptr->pos,8,1,fword);
freep=Tptr;
Tptr=Tptr->ptrNext;
free(freep->sCIYU);
free(freep->pos);
free(freep);
}//end of while
WIndexcom[x][y].WList = NULL;
WIndexcom[x][y].WCount = 0;
}//end of double for;
// printf("Total %d words in dictionary.\n",NWords);
fclose(fwptr);
fclose(fword);
fclose(fptr);
return 0;
}
//-----------------------------------------------------------//
// Purpose: count one occurrence of a feature word in resmap.
//          A word seen for the first time is stored with count 1;
//          every later call for the same word adds 1 to its count.
// Params:
//   (in)  start   first byte of the word inside the input text
//         len     number of bytes to take (the text may continue past
//                 the word, so the length must be given explicitly)
//   (out) resmap  feature word -> occurrence count table, updated in place
// Returns: nothing. The call is a no-op when start is NULL or len is 0.
//-----------------------------------------------------------//
void CSSPS::insertwordtomap(const unsigned char*start,unsigned int len, map<string,unsigned int>&resmap)const{
    if(start==NULL || len==0)
        return;

    // The key ends at the first NUL byte if one occurs within the first
    // len bytes; otherwise it is exactly len bytes long.
    const void *firstNul = memchr(start, 0, len);
    const size_t keyLen = (firstNul != NULL)
        ? (size_t)((const unsigned char *)firstNul - start)
        : (size_t)len;

    // Building the key as a string (instead of a new[]'d scratch buffer)
    // means nothing is leaked if the map allocation throws.
    const string key((const char *)start, keyLen);

    // operator[] value-initialises a missing count to 0, so the increment
    // yields 1 for a new word and count+1 for a known one, with one lookup.
    ++resmap[key];
}
//-----------------------------------------------------------//
// 功能: 一篇文本的单次快速分词的同时获取文本count
// 参数:
// (入口)sBuffer 输入文本
// (出口)resmap 一个特征词于词数的对应表
// 返回:该文本中有效的总词数
//----------------------------------------------------------//
unsigned int CSSPS::GetVecInDoc(const unsigned char* sBuffer,map<string,unsigned int>&resmap)const{
int pointer=0;
int x;
int y;
int ascii_flag;
int flag=0;
struct aWORDdic* Wptr;
int count;
int totalwordcount=0;
struct aWORDdic* DPos=NULL;
int lenofsBuffer=strlen((const char*)sBuffer);
while(pointer<lenofsBuffer) {
if(sBuffer[pointer] & 128) //首字节表明是汉字词
{
ascii_flag=0;
if(flag==1)
{
//totalwordcount++;
flag=0;
}
x=sBuffer[pointer]-0xa1;
y=sBuffer[pointer+1]-0xa1;
if(x<0||y<0 ||x>=94||y>=94 )
{
pointer+=2;
continue;
}
Wptr=WIndexcom[x][y].WList;
count=WIndexcom[x][y].WCount;
DPos=NULL;
if(!Wptr) {
pointer+=2;
continue;
}
}
else //首字节表明是ASCII
{
flag=1;
ascii_flag=1;
x=sBuffer[pointer];
Wptr=CIndexcom[x].WList;
count=CIndexcom[x].WCount;
DPos=NULL;
if(!Wptr) {
pointer+=1;
continue;
}
}
unsigned char LastLen=2;
int iCompareResult;
while(1) {
if(!Wptr)
{
if(ascii_flag)
{
pointer+=1;
break;
}
if(LastLen>2){
insertwordtomap((const unsigned char *)(sBuffer+pointer),LastLen,resmap);
totalwordcount++;
}
pointer+=LastLen;
break;
}
iCompareResult=strncmp((char *)(sBuffer+pointer),(const char *)Wptr->sCIYU,Wptr->Len);
if(iCompareResult<0) {
if(ascii_flag)
{
pointer++;
break;
}
if(LastLen>2){
insertwordtomap((const unsigned char *)(sBuffer+pointer),LastLen,resmap);
totalwordcount++;
}
pointer+=LastLen;
break;
}
if(iCompareResult==0) {
LastLen=Wptr->Len;
DPos=Wptr;
Wptr=Wptr->ptrNext;
}
if(iCompareResult>0)
{
DPos=Wptr;
Wptr=Wptr->ptrNext;
}
}//end of while(1)
}//end of while(pointer<...)
return totalwordcount;
}
//-----------------------------------------------------------//
// Purpose: segment every training file of one class in a single pass and
//          collect the word statistics used later for feature extraction.
// Params:
//   (in) TrainFileDir      directory holding the training text files
//        ClassName         class name, used as the output file base name
//        ResultVecFileDir  directory that receives the output files
// Output files (both keyed by word):
//   <ResultVecFileDir>/<ClassName>.tmw  one line per document:
//        total word count, then "\t<word>\t<count>" for each word
//   <ResultVecFileDir>/<ClassName>.vec  first line: number of documents;
//        then "<word>\t<number of documents containing it>" per line
// Calls:   GetVecInDoc, insertwordtomap
// Returns: 1 on success; 0 when an output file cannot be written, the
//          training directory is rejected, the working directory cannot be
//          saved/restored, or no document could be read.
//-----------------------------------------------------------//
unsigned int CSSPS::CountDf(const char *TrainFileDir,const char *ClassName,const char *ResultVecFileDir){
    // A std::string must not be built from a null pointer, so the output
    // names cannot be formed without both of these.
    if(ResultVecFileDir==NULL || ClassName==NULL){
        printf("输出文件无法写入!");
        return 0;
    }
    const string OutFileName = string(ResultVecFileDir)+DIRSEPSTRING+string(ClassName);

    const string tmwFileName = OutFileName+".tmw";
    ofstream fout_tmw(tmwFileName.c_str());
    if(!fout_tmw){
        printf("输出文件无法写入!");
        return 0;
    }

    // Remember the working directory so it can be restored after the
    // directory walk (presumably CStatDir changes it -- TODO confirm).
    char pszCurrentPATH[_MAX_PATH];
    if(getcwd(pszCurrentPATH,_MAX_PATH)==NULL)
        return 0;   // buffer contents are unusable, restoring would be impossible

    // Enumerate the training files.
    CStatDir statdir;
    if (!statdir.SetInitDir(TrainFileDir))
    {
        puts("训练语料目录不存在。");
        return 0;
    }
    statdir.BeginBrowse("*.*");

    // Process each document.
    map<string,unsigned int> resmap;    // word -> number of documents containing it
    unsigned int Filecount=0;
    const int nFileCount = statdir.m_vsFileName.size();
    for(int i=0;i<nFileCount;i++)
    {
        const string tmpName = statdir.m_vsFileName[i];
        ifstream Fin(tmpName.c_str());
        if(!Fin){
            printf("the file is not opened\n");
            continue;
        }

        // Read the whole file (up to the first NUL byte) in one go.
        string StrBuffer;
        getline(Fin,StrBuffer,'\0');

        map<string,unsigned int> resmap_one;    // word -> count inside this document
        const unsigned int totalwordcount = GetVecInDoc((const unsigned char *)(StrBuffer.c_str()),resmap_one);

        fout_tmw<<totalwordcount;
        for(map<string,unsigned int>::const_iterator itc=resmap_one.begin();itc!=resmap_one.end();++itc){
            // Each distinct word of the document adds exactly 1 to resmap,
            // which makes resmap a document-frequency table.
            insertwordtomap((const unsigned char *)(itc->first.c_str()),itc->first.size(),resmap);
            fout_tmw<<"\t"<<itc->first<<"\t"<<itc->second;
        }
        fout_tmw<<'\n';
        Filecount++;
    }

    // Go back to the directory we started in.
    if (_chdir(pszCurrentPATH) != 0)
        return 0;

    if(!Filecount) return 0;

    const string resultvecfile = OutFileName+".vec";
    ofstream fout(resultvecfile.c_str());
    if(!fout){
        printf("无法输出训练结果!");
        return 0;
    }
    fout<<Filecount<<'\n';
    for(map<string,unsigned int>::const_iterator itc=resmap.begin();itc!=resmap.end();++itc){
        fout<<itc->first<<"\t"<<itc->second<<'\n';
    }

    fout_tmw.close();
    return 1;
}
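//****************************************************************//
// Purpose: produce the DF format of a string, i.e.
//          total word count + word ID + word count
// Params:  (in)  instr  the input string
//          (out) a string in the specific output format
// Calls:   getVec
//******************************************************************//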
//string CSSPS::CountOneDf(string & instr){
//
//
//}
//-----------------------------------------------------------//
// Purpose: turn a string into its vector text form.
// Params:
//   (in) strin  the input text
// Calls:   GetVecInDoc
// Returns: one line "<total word count> <word id>:<count> ..." ending in
//          a newline. Words found in the text but missing from allwords
//          are left out of the line.
//-----------------------------------------------------------//
string CSSPS::GetVecStr(const string&strin)const{
    stringstream line;
    map<string,unsigned int> docCounts;

    // The total comes first on the line; docCounts is filled as a side effect.
    const unsigned int totalWords=GetVecInDoc((const unsigned char*)(strin.c_str()),docCounts);
    line<<totalWords;

    map<string,unsigned int>::const_iterator word=docCounts.begin();
    while(word!=docCounts.end()){
        // Translate the word into its id via allwords; skip unknown words.
        const map<string,unsigned int>::const_iterator known=allwords.find(word->first);
        if(known!=allwords.end()){
            line<<" "<<known->second<<":"<<word->second;
        }
        word++;
    }

    line<<endl;
    return line.str();
}
//-----------------------------------------------------------//
// Purpose: read a text file and turn its content into vector text form.
// Params:
//   (in) filename  file name including its path
// Calls:   GetVecStr
// Returns: the line produced by GetVecStr for the file content, or ""
//          (after printing a message) when the file cannot be opened.
//-----------------------------------------------------------//
string CSSPS::GetVecStrFromFile(const string&filename)const{
    ifstream input(filename.c_str());
    if(!input.fail()){
        // Read everything up to the first NUL byte (or end of file).
        string text;
        getline(input,text,'\0');
        return GetVecStr(text);
    }

    cout<<"file can not open"<<endl;
    return string();
}
bool CSSPS::exist(unsigned char* word,unsigned char len,unsigned char* pos){
unsigned char head[2];
head[0] = *word;
head[1] = *(word + 1);
if(head[1] & 128){
int x = head[0] - 0xa1;
int y = head[1] - 0xa1;
if(x<0||x>94||y<0||y>94){
cout<<"this is not a word or a asc code\n";
return false;
}
aWORDdic* wordlist = WIndexcom[x][y].WList;
while(wordlist != NULL){
if(!strncmp((char*)wordlist->sCIYU,(const char *)word,len)){
strcpy((char*)pos,(const char*)wordlist->pos);
return true;
}
else {
wordlist = wordlist->ptrNext;
continue;
}
}
return false;
}
else{
int z = head[1] ;
aWORDdic* wordlist = CIndexcom[z].WList;
while(wordlist != NULL){
if(!strncmp((char *)wordlist->sCIYU,(const char *)word,len)){
pos = wordlist->pos;
return true;
}
else {
wordlist = wordlist->ptrNext;
continue;
}
}
return false;
}
}
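// (Code-viewer page footer, not part of the source: keyboard-shortcut legend --
//  copy code Ctrl+C, search code Ctrl+F, full screen F11, toggle theme Ctrl+Shift+D,
//  show shortcuts ?, larger font Ctrl+=, smaller font Ctrl+-.)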