📄 crf.cpp
字号:
trim_line(line); x_num=atoi(line); for(i=0;i<x_num;i++) { fgets(line,MAXSTRLEN-1,fp); trim_line(line); vector<char *> columns; split_string(line,"\t",columns); char *q=x_str.push_back(columns[0]); index=ffmap[atoi(columns[1])]; if(index!=-1) xindex.insert(make_pair(q,index)); } memcpy(fmap,&new_fmap[0],sizeof(int)*new_fmap.size()); fmap_size=new_fmap.size(); fclose(fp); //rewrite first part fout=fopen(model_file,"w"); if(!fout) { printf("can not open model file: %s\n",model_file); return; } //write version fprintf(fout,"version\t%d\n",version); //write templets for(i=0;i<templets.size();i++) { templet &cur_templet=templets[i]; for(j=0;j<cur_templet.x.size();j++) fprintf(fout,"%s%%x[%d,%d]",cur_templet.words[j].c_str(),cur_templet.x[j].first,cur_templet.x[j].second); fprintf(fout,"%s",cur_templet.words[j].c_str()); for(j=0;j<cur_templet.y.size();j++) fprintf(fout,"%%y[%d]",cur_templet.y[j]); fprintf(fout,"\n"); } fprintf(fout,"\n"); //write y fprintf(fout,"%d\n",ysize); for(i=0;i<tags.size();i++) fprintf(fout,"%s\n",tags[i]); fprintf(fout,"\n"); //write x fprintf(fout,"%d\n\n",cols); fprintf(fout,"%d\n",xindex.size()); map<char*, int, str_cmp>::iterator it; for(it = xindex.begin(); it != xindex.end(); it++) fprintf(fout,"%s\t%d\n",it->first,it->second); fprintf(fout,"\n"); fclose(fout); } //write lambda, fmap fout=fopen(model_file,"a"); if(!fout) { printf("can not open model file: %s\n",model_file); return; } //write fmap fprintf(fout,"%d\n",fmap_size); for(i=0;i<fmap_size;i++) fprintf(fout,"%d\n",fmap[i]); fprintf(fout,"\n"); fprintf(fout,"%d\n",lambda_size); for(i=0;i<lambda_size;i++) fprintf(fout,"%lf\n",lambda[i]); fclose(fout); }}void CRF::shrink_feature(){ int i,j,k,ii; if(freq_thresh<=0) { for(i=0;i<fmap_tmp.size();i++) fmap_tmp[i]=i; return; } map<int, int> old2new; int new_lambda_size = 0; int new_mapping_size=0; char temp[MAXSTRLEN]; map<char*, int, str_cmp>::iterator it; int findex=0;//base feature index for(i=0;i<lambda_size;i++) { if(i<x_freq.size() && x_freq[i]>0) findex=i; if(fmap_tmp[i]<freq_thresh) { x_freq[findex]-=fmap_tmp[i]; fmap_tmp[i]=-1;// mapping current feature index to null }else{ fmap_tmp[i]=new_lambda_size++;// mapping current feature index to new_lambda_size } } vector<int> new_fmap; lambda_size=new_lambda_size; for (it= xindex.begin(); it!= xindex.end();) { char *key=it->first; catch_string(key,":",temp); int index=atoi(temp); int gram_num=templets[index].y.size(); if (x_freq[it->second] >0) { old2new.insert(make_pair<int, int>(it->second, new_mapping_size)); new_fmap.insert(new_fmap.end(),fmap_tmp.begin()+it->second,fmap_tmp.begin()+it->second+pow(double(ysize),gram_num)); it->second = new_mapping_size; new_mapping_size += pow(double(ysize),gram_num); ++it; }else{ xindex.erase(it++); } } fmap_tmp=new_fmap; map<int, int>::iterator iter; freelist<int> temp_clique_feature; temp_clique_feature.set_size(PAGESIZE*16); clique_feature.free(); for(i=0;i<sequences_tmp.size();i++) { sequence &seq=sequences_tmp[i]; for(j=0;j<seq.node_num;j++) { node &nod=seq.nodes[j]; for(k=0;k<nod.clique_num;k++) { if(!nod.cliques[k]) continue; clique &cli=*nod.cliques[k]; vector<int> newf; for(ii=0;ii<cli.feature_num;ii++) { iter = old2new.find(cli.fvector[ii]); if(iter != old2new.end()) newf.push_back(iter->second); } int *f; if(newf.size()) f=temp_clique_feature.push_back(&newf[0],newf.size()); else f=NULL; cli.fvector=f; cli.feature_num=newf.size(); } } } clique_feature.clear(); for(i=0;i<sequences_tmp.size();i++) { sequence &seq=sequences_tmp[i]; for(j=0;j<seq.node_num;j++) { node &nod=seq.nodes[j]; for(k=0;k<nod.clique_num;k++) { if(!nod.cliques[k]) continue; clique &cli=*nod.cliques[k]; if(cli.feature_num) { int *f=clique_feature.push_back(cli.fvector,cli.feature_num); cli.fvector=f; }else{ cli.fvector=NULL; } } } } return;}void CRF::tag(vector<vector<string> > &table, vector<vector<string> > &best_tag){ vector<double> sequencep; vector<vector<double> > nodep; tag(table,best_tag,sequencep,nodep);}void CRF::tag(vector<vector<string> > &table, vector<vector<string> > &best_tag,vector<double> &sequencep, vector<vector<double> > &nodep){ sequence seq; generate_sequence(table,seq); crf_thread &cur_thread=threads.front(); cur_thread.build_lattice(seq); cur_thread.viterbi(seq); int i,j; for(i=0;i<cur_thread.best_path.size();i++) { vector<string> cur_tag(seq.node_num); for(j=0;j<seq.node_num;j++) cur_tag[j]=tags[cur_thread.best_path[i][j]]; best_tag.push_back(cur_tag); } if(margin) { double z; cur_thread.forward_backward(seq,z); cur_thread.node_margin(seq,nodep,z); sequencep.resize(cur_thread.best_path.size()); for(i=0;i<cur_thread.best_path.size();i++) { cur_thread.assign_tag(seq,cur_thread.best_path[i]); double c=cur_thread.path_cost(seq); sequencep[i]=exp(c-z); } }}void CRF::generate_sequence(vector<vector<string> > &table,sequence &seq){ int i,j,k; int xcols=cols-1; int rows=table.size(); char s[1024]; char s1[1024]; char s2[1024]; node* nod=nodes.alloc(rows); seq.node_num=rows; seq.nodes=nod; vector<vector<vector<string> > > ext_table(table.size());//split each unit by " " for(i=0;i<table.size();i++){ ext_table[i].resize(table[i].size()); for(j=0;j<table[i].size();j++){ char unit[1000]; strcpy(unit,table[i][j].c_str()); vector<char *> units; split_string(unit," ",units); for(k=0;k<units.size();) { if(!units[k][0]) units.erase(units.begin()+k); else k++; } ext_table[i][j].resize(units.size()); for(k=0;k<units.size();k++) ext_table[i][j][k]=units[k]; } } for(i=0;i<rows;i++) { nod[i].key=0;//random initialize vector<clique*> clisp;//features that affect on current nodes vector<int> feature_vector; for(j=0;j<templets.size();j++) { //get first y's offset templet &pat=templets[j]; if(pat.y[0]+i<0) continue; if(pat.x.size()>0){//if has x int index1,index2; bool has_xstring=true;//false, if unit="" vector<int> xid(pat.x.size(),0);//xid=(0,0): 0 th units + 0 th units vector<int> xtop(pat.x.size(),0); //get xtop for(k=0;k<pat.x.size();k++) { index1=pat.x[k].first+i; index2=pat.x[k].second; if(index1<0) { xtop[k]=1; }else if(index1>=rows){ xtop[k]=1; }else if(!ext_table[index1][index2].size()){//no string here xtop[k]=0; has_xstring=false; break; }else{ xtop[k]=ext_table[index1][index2].size(); } } if(has_xstring) { xid.back()=-1; while(1) { //increase and check whether stop for(k=xid.size()-1;k>=0 && xid[k]+1 == xtop[k];k--) xid[k]=0; if(k<0) break;//stop xid[k]++; sprintf(s, "%d", j); strcat(s,":"); //get x for(k=0;k<pat.x.size();k++) { strcat(s,pat.words[k].c_str()); strcat(s,"//"); index1=pat.x[k].first+i; index2=pat.x[k].second; if(index1<0) { index1=-index1-1; strcpy(s1,"B_"); sprintf(s2,"%d",index1); strcat(s1,s2);//B_0 for example }else if(index1>=rows){ index1-=rows; strcpy(s1,"E_"); sprintf(s2,"%d",index1); strcat(s1,s2);//E_0 for example }else{ strcpy(s1,ext_table[index1][index2][xid[k]].c_str()); } strcat(s,s1); strcat(s,"//"); } strcat(s,pat.words[k].c_str()); //x obtained, insert x map<char *, int, str_cmp>::iterator it; it=xindex.find(s); if(it!=xindex.end()) feature_vector.push_back(it->second); } } }else{//else , no x sprintf(s, "%d", j); strcat(s,":"); strcat(s,pat.words[0].c_str()); //x obtained, insert x map<char *, int, str_cmp>::iterator it; it=xindex.find(s); if(it!=xindex.end()) feature_vector.push_back(it->second); } if(pat.end_of_group) {//creat new clique clique cli; vector<node*> ns; for(k=0;k<pat.y.size();k++) ns.push_back(nod+i+pat.y[k]); node ** np=clique_node.push_back(&ns[0],ns.size()); cli.nodes=np; cli.node_num=ns.size(); if(feature_vector.size()) { int *f=clique_feature.push_back(&feature_vector[0],feature_vector.size()); cli.fvector=f; }else{ cli.fvector=NULL; } cli.feature_num=feature_vector.size(); cli.groupid=templets[j].groupid; cli.key=0;//random initialize clique *new_clique=cliques.push_back(&cli,1); clisp.push_back(new_clique); feature_vector.clear(); } } //set node -> clique if(clisp.size()) nod[i].cliques = node_clique.push_back(&clisp[0],clisp.size()); else nod[i].cliques = NULL; nod[i].clique_num =clisp.size(); } nodes.free(); node_clique.free(); cliques.free(); clique_node.free(); clique_feature.free();}bool CRF::load_model(char *model_file){ int i; char line[MAXSTRLEN]; FILE *fp=fopen(model_file,"r"); if(!fp) { printf("model file: %s not found\n",model_file); return false; } //check version fgets(line,MAXSTRLEN-1,fp); trim_line(line); if(!strncmp(line,"version\t",strlen("version\t"))) { char *p=strstr(line,"\t"); version=atoi(p); }else{ version=40; } fclose(fp); fp=fopen(model_file,"r"); if(version!=40) fgets(line,MAXSTRLEN-1,fp);//skip version line //load templates printf("model version: 0.%d\n",version); while(fgets(line,MAXSTRLEN-1,fp)) { trim_line(line); if(!add_templet(line)) break; } set_order(); printf("template number: %d \n",templets.size()); //get ysize fgets(line,MAXSTRLEN-1,fp); trim_line(line); ysize=atoi(line); tags.resize(ysize); for(i=0;i<ysize;i++) { fgets(line,MAXSTRLEN-1,fp); trim_line(line); char *q=tag_str.push_back(line); tags[i]=q; } printf("tags number: %d \n",ysize); set_group(); //get cols fgets(line,MAXSTRLEN-1,fp); fgets(line,MAXSTRLEN-1,fp); trim_line(line); cols=atoi(line); //load x int index; int x_num; fgets(line,MAXSTRLEN-1,fp); fgets(line,MAXSTRLEN-1,fp); trim_line(line); x_num=atoi(line); for(i=0;i<x_num;i++) { fgets(line,MAXSTRLEN-1,fp); trim_line(line); vector<char *> columns; split_string(line,"\t",columns); char *q=x_str.push_back(columns[0]); index=atoi(columns[1]); xindex.insert(make_pair(q,index)); } if(version!=40) { //load fmap fgets(line,MAXSTRLEN-1,fp);//skip enter fgets(line,MAXSTRLEN-1,fp); trim_line(line); int mapping_size=atoi(line); fmap_size=mapping_size; fmap=new int[fmap_size]; for(i=0;i<fmap_size;i++) { fgets(line,MAXSTRLEN-1,fp); trim_line(line); fmap[i]=atoi(line); } } //load lambda fgets(line,MAXSTRLEN-1,fp);//skip enter fgets(line,MAXSTRLEN-1,fp); trim_line(line); lambda_size=atoi(line); if(version==40) {//create fmap fmap_size=lambda_size; fmap=new int[fmap_size]; for(i=0;i<fmap_size;i++) fmap[i]=i; } lambda=new double[lambda_size]; for(i=0;i<lambda_size;i++) { fgets(line,MAXSTRLEN-1,fp); trim_line(line); lambda[i]=atof(line); } printf("%d lambda loaded\n",i); fclose(fp); path_num=pow((double)ysize,order+1); node_anum=pow((double)ysize,order);//alpha(beta) number of each node head_offset=-log((double)ysize)*order; return true;}void CRF::unload()//unload the sequence data to file to set memory free{ FILE *fp=fopen("__data1","wb"); fwrite(work_space,sizeof(char),work_size/2,fp); fclose(fp); fp=fopen("__data2","wb"); fwrite(work_space+work_size/2,sizeof(char),work_size-work_size/2,fp); fclose(fp);}void CRF::load()//unload the sequence data to file to set memory free{ FILE *fp=fopen("__data1","rb"); fread(work_space,sizeof(char),work_size/2,fp); fclose(fp); fp=fopen("__data2","rb"); fread(work_space+work_size/2,sizeof(char),work_size-work_size/2,fp); fclose(fp);}void CRF::adjust_data(){ int i,j,k; lambda_size=0; for(i=0;i<fmap_size;i++) { if(fmap[i]!=-1 && lambda[fmap[i]]!=0) { if(algorithm==AP_ALGORITHM||algorithm==PA_ALGORITHM)//copy lambda gradient[lambda_size]=lambda[fmap[i]]; fmap[i]=lambda_size++; } else fmap[i]=-1; } //adjust cliques if(chain_type==SIMPLE_CHAIN) { for(i=0;i<path_num && fmap[transit+i]<0;i++); if(i==path_num){ for(i=0;i<path_num;i++) fmap[transit+i]=-2; } } for(i=0;i<sequence_num;i++){ if(chain_type==GENERAL_CHAIN){ sequence &seq=sequences[i]; for(j=0;j<seq.node_num;j++){ node &nod=seq.nodes[j]; for(int k=0;k<nod.clique_num;k++){ if(!nod.cliques[k]) continue; clique &cli=*nod.cliques[k]; for(int ii=0;ii<cli.feature_num;){ int jj; for(jj=0;jj<templet_group[cli.groupid].size() && fmap[cli.fvector[ii]+jj]<0;jj++);//==-1 || ==-2 if(jj==templet_group[cli.groupid].size()){ for(jj=0;jj<templet_group[cli.groupid].size();jj++) fmap[cli.fvector[ii]+jj]=-2;//ready to remove corresponding feature string for(jj=ii;jj<cli.feature_num-1;jj++) cli.fvector[jj]=cli.fvector[jj+1]; cli.feature_num--; }else{ ii++; } } if(cli.feature_num==0) cli.fvector=NULL; } } }else if(chain_type==FIRST_CHAIN||chain_type==SIMPLE_CHAIN){ sequence1 &seq1=sequence1s[i]; for(j=0;j<seq1.vertex_num;j++){ vertex &vtx=seq1.vertexes[j]; for(k=0;k<vtx.feature_num;) { int ii; for(ii=0;ii<ysize && fmap[vtx.fvector[k]+ii]<0;ii++);//==-1 || ==-2 if(ii==ysize){ for(ii=0;ii<ysize;ii++) fmap[vtx.fvector[k]+ii]=-2; for(ii=k;ii<vtx.feature_num-1;ii++) vtx.fvector[ii]=vtx.fvector[ii+1]; vtx.feature_num--; }else{ k++; } } if(vtx.feature_num==0) vtx.fvector=NULL; if(j && chain_type==FIRST_CHAIN){ edge &e=seq1.edges[j-1]; for(k=0;k<e.feature_num;) { int ii; for(ii=0;ii<path_num && fmap[e.fvector[k]+ii]<0;ii++);//==-1 || ==-2 if(ii==path_num){ for(ii=0;ii<path_num;ii++) fmap[e.fvector[k]+ii]=-2; for(ii=k;ii<e.feature_num-1;ii++) e.fvector[ii]=e.fvector[ii+1]; e.feature_num--; }else{ k++; } } if(e.feature_num==0) e.fvector=NULL; } } } } if(algorithm==AP_ALGORITHM||algorithm==PA_ALGORITHM)//copy lambda memcpy(lambda,gradient,sizeof(double)*lambda_size);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -