⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crf.cpp

📁 pocket_crf_0.45
💻 CPP
📖 第 1 页 / 共 3 页
字号:
			trim_line(line);			x_num=atoi(line);			for(i=0;i<x_num;i++)			{				fgets(line,MAXSTRLEN-1,fp);				trim_line(line);				vector<char *> columns;				split_string(line,"\t",columns);				char *q=x_str.push_back(columns[0]);				index=ffmap[atoi(columns[1])];				if(index!=-1)					xindex.insert(make_pair(q,index));			}			memcpy(fmap,&new_fmap[0],sizeof(int)*new_fmap.size());			fmap_size=new_fmap.size();			fclose(fp);			//rewrite first part			fout=fopen(model_file,"w");			if(!fout)			{				printf("can not open model file: %s\n",model_file);				return;			}						//write version			fprintf(fout,"version\t%d\n",version);			//write templets			for(i=0;i<templets.size();i++)			{				templet &cur_templet=templets[i];				for(j=0;j<cur_templet.x.size();j++)					fprintf(fout,"%s%%x[%d,%d]",cur_templet.words[j].c_str(),cur_templet.x[j].first,cur_templet.x[j].second);				fprintf(fout,"%s",cur_templet.words[j].c_str());				for(j=0;j<cur_templet.y.size();j++)					fprintf(fout,"%%y[%d]",cur_templet.y[j]);				fprintf(fout,"\n");			}			fprintf(fout,"\n");			//write y			fprintf(fout,"%d\n",ysize);			for(i=0;i<tags.size();i++)				fprintf(fout,"%s\n",tags[i]);			fprintf(fout,"\n");			//write x			fprintf(fout,"%d\n\n",cols);			fprintf(fout,"%d\n",xindex.size());			map<char*, int, str_cmp>::iterator it;			for(it = xindex.begin(); it != xindex.end(); it++)				fprintf(fout,"%s\t%d\n",it->first,it->second);			fprintf(fout,"\n");			fclose(fout);		}		//write lambda, fmap		fout=fopen(model_file,"a");		if(!fout)		{			printf("can not open model file: %s\n",model_file);			return;		}		//write fmap		fprintf(fout,"%d\n",fmap_size);		for(i=0;i<fmap_size;i++)			fprintf(fout,"%d\n",fmap[i]);		fprintf(fout,"\n");		fprintf(fout,"%d\n",lambda_size);		for(i=0;i<lambda_size;i++)			fprintf(fout,"%lf\n",lambda[i]);		fclose(fout);	}}void CRF::shrink_feature(){	int i,j,k,ii;	if(freq_thresh<=0)	{		for(i=0;i<fmap_tmp.size();i++)			fmap_tmp[i]=i;		return;	}    map<int, int> old2new;    int new_lambda_size = 0;	int new_mapping_size=0;	char temp[MAXSTRLEN];	map<char*, int, str_cmp>::iterator it;	int findex=0;//base feature index	for(i=0;i<lambda_size;i++)	{		if(i<x_freq.size() && x_freq[i]>0)			findex=i;		if(fmap_tmp[i]<freq_thresh)		{			x_freq[findex]-=fmap_tmp[i];			fmap_tmp[i]=-1;//	mapping current feature index to null		}else{			fmap_tmp[i]=new_lambda_size++;// mapping current feature index to new_lambda_size 		}	}	vector<int> new_fmap;	lambda_size=new_lambda_size;    for (it= xindex.begin(); it!= xindex.end();)	{		char *key=it->first;		catch_string(key,":",temp);		int index=atoi(temp);		int gram_num=templets[index].y.size();		if (x_freq[it->second] >0)		{			old2new.insert(make_pair<int, int>(it->second, new_mapping_size));			new_fmap.insert(new_fmap.end(),fmap_tmp.begin()+it->second,fmap_tmp.begin()+it->second+pow(double(ysize),gram_num));			it->second = new_mapping_size;			new_mapping_size += pow(double(ysize),gram_num);			++it;		}else{			xindex.erase(it++);		}    }	fmap_tmp=new_fmap;	map<int, int>::iterator iter;	freelist<int> temp_clique_feature;	temp_clique_feature.set_size(PAGESIZE*16);	clique_feature.free();	for(i=0;i<sequences_tmp.size();i++)	{		sequence &seq=sequences_tmp[i];		for(j=0;j<seq.node_num;j++)		{			node &nod=seq.nodes[j];			for(k=0;k<nod.clique_num;k++)			{				if(!nod.cliques[k])					continue;				clique &cli=*nod.cliques[k];				vector<int> newf;				for(ii=0;ii<cli.feature_num;ii++)				{					iter = old2new.find(cli.fvector[ii]);					if(iter != old2new.end())						newf.push_back(iter->second);				}				int *f;				if(newf.size())					f=temp_clique_feature.push_back(&newf[0],newf.size());				else					f=NULL;				cli.fvector=f;				cli.feature_num=newf.size();			}		}	}	clique_feature.clear();	for(i=0;i<sequences_tmp.size();i++)	{		sequence &seq=sequences_tmp[i];		for(j=0;j<seq.node_num;j++)		{			node &nod=seq.nodes[j];			for(k=0;k<nod.clique_num;k++)			{				if(!nod.cliques[k])					continue;				clique &cli=*nod.cliques[k];				if(cli.feature_num)				{						int *f=clique_feature.push_back(cli.fvector,cli.feature_num);					cli.fvector=f;				}else{					cli.fvector=NULL;				}			}		}	}    return;}void CRF::tag(vector<vector<string> > &table, vector<vector<string> > &best_tag){	vector<double> sequencep;	vector<vector<double> > nodep;	tag(table,best_tag,sequencep,nodep);}void CRF::tag(vector<vector<string> > &table, vector<vector<string> > &best_tag,vector<double> &sequencep, vector<vector<double> > &nodep){	sequence seq;	generate_sequence(table,seq);	crf_thread &cur_thread=threads.front();	cur_thread.build_lattice(seq);	cur_thread.viterbi(seq);		int i,j;	for(i=0;i<cur_thread.best_path.size();i++)	{		vector<string> cur_tag(seq.node_num);		for(j=0;j<seq.node_num;j++)			cur_tag[j]=tags[cur_thread.best_path[i][j]];		best_tag.push_back(cur_tag);	}		if(margin)	{		double z;		cur_thread.forward_backward(seq,z);		cur_thread.node_margin(seq,nodep,z);		sequencep.resize(cur_thread.best_path.size());		for(i=0;i<cur_thread.best_path.size();i++)		{			cur_thread.assign_tag(seq,cur_thread.best_path[i]);			double c=cur_thread.path_cost(seq);			sequencep[i]=exp(c-z);		}	}}void CRF::generate_sequence(vector<vector<string> > &table,sequence &seq){	int i,j,k;	int xcols=cols-1;	int rows=table.size();	char s[1024];	char s1[1024];	char s2[1024];	node* nod=nodes.alloc(rows);	seq.node_num=rows;	seq.nodes=nod;	vector<vector<vector<string> > > ext_table(table.size());//split each unit by " "	for(i=0;i<table.size();i++){		ext_table[i].resize(table[i].size());		for(j=0;j<table[i].size();j++){			char unit[1000];			strcpy(unit,table[i][j].c_str());			vector<char *> units;			split_string(unit," ",units);			for(k=0;k<units.size();)			{				if(!units[k][0])					units.erase(units.begin()+k);				else					k++;			}			ext_table[i][j].resize(units.size());			for(k=0;k<units.size();k++)				ext_table[i][j][k]=units[k];		}	}	for(i=0;i<rows;i++)	{		nod[i].key=0;//random initialize		vector<clique*> clisp;//features that affect on current nodes		vector<int> feature_vector;		for(j=0;j<templets.size();j++)		{			//get first y's offset			templet &pat=templets[j];			if(pat.y[0]+i<0)				continue;			if(pat.x.size()>0){//if has x				int index1,index2;				bool has_xstring=true;//false, if unit=""				vector<int> xid(pat.x.size(),0);//xid=(0,0): 0 th units + 0 th units				vector<int> xtop(pat.x.size(),0);				//get xtop				for(k=0;k<pat.x.size();k++)				{					index1=pat.x[k].first+i;					index2=pat.x[k].second;					if(index1<0)					{						xtop[k]=1;					}else if(index1>=rows){						xtop[k]=1;					}else if(!ext_table[index1][index2].size()){//no string here						xtop[k]=0;						has_xstring=false;						break;					}else{						xtop[k]=ext_table[index1][index2].size();					}				}				if(has_xstring)				{					xid.back()=-1;					while(1)					{						//increase and check whether stop						for(k=xid.size()-1;k>=0 && xid[k]+1 == xtop[k];k--)							xid[k]=0;						if(k<0)							break;//stop						xid[k]++;						sprintf(s, "%d", j);						strcat(s,":");												//get x						for(k=0;k<pat.x.size();k++)						{							strcat(s,pat.words[k].c_str());							strcat(s,"//");							index1=pat.x[k].first+i;							index2=pat.x[k].second;							if(index1<0)							{								index1=-index1-1;								strcpy(s1,"B_");								sprintf(s2,"%d",index1);								strcat(s1,s2);//B_0 for example							}else if(index1>=rows){								index1-=rows;								strcpy(s1,"E_");								sprintf(s2,"%d",index1);								strcat(s1,s2);//E_0 for example							}else{								strcpy(s1,ext_table[index1][index2][xid[k]].c_str());							}							strcat(s,s1);							strcat(s,"//");						}												strcat(s,pat.words[k].c_str());						//x obtained, insert x						map<char *, int, str_cmp>::iterator it;						it=xindex.find(s);						if(it!=xindex.end())							feature_vector.push_back(it->second);					}				}			}else{//else , no x				sprintf(s, "%d", j);				strcat(s,":");				strcat(s,pat.words[0].c_str());				//x obtained, insert x				map<char *, int, str_cmp>::iterator it;				it=xindex.find(s);				if(it!=xindex.end())					feature_vector.push_back(it->second);			}			if(pat.end_of_group)			{//creat new clique				clique cli;				vector<node*> ns;				for(k=0;k<pat.y.size();k++)					ns.push_back(nod+i+pat.y[k]);				node ** np=clique_node.push_back(&ns[0],ns.size());				cli.nodes=np;				cli.node_num=ns.size();				if(feature_vector.size())				{					int *f=clique_feature.push_back(&feature_vector[0],feature_vector.size());					cli.fvector=f;				}else{					cli.fvector=NULL;				}				cli.feature_num=feature_vector.size();				cli.groupid=templets[j].groupid;				cli.key=0;//random initialize				clique *new_clique=cliques.push_back(&cli,1);				clisp.push_back(new_clique);				feature_vector.clear();			}		}		//set node -> clique		if(clisp.size())			nod[i].cliques = node_clique.push_back(&clisp[0],clisp.size());		else			nod[i].cliques = NULL;		nod[i].clique_num =clisp.size();	}	nodes.free();	node_clique.free();	cliques.free();	clique_node.free();	clique_feature.free();}bool CRF::load_model(char *model_file){	int i;	char line[MAXSTRLEN];	FILE *fp=fopen(model_file,"r");	if(!fp)	{		printf("model file: %s not found\n",model_file);		return false;	}	//check version	fgets(line,MAXSTRLEN-1,fp);	trim_line(line);	if(!strncmp(line,"version\t",strlen("version\t")))	{		char *p=strstr(line,"\t");		version=atoi(p);	}else{		version=40;	}	fclose(fp);	fp=fopen(model_file,"r");	if(version!=40)		fgets(line,MAXSTRLEN-1,fp);//skip version line	//load templates	printf("model version: 0.%d\n",version);	while(fgets(line,MAXSTRLEN-1,fp))	{		trim_line(line);		if(!add_templet(line))			break;	}	set_order();	printf("template number: %d \n",templets.size());	//get ysize	fgets(line,MAXSTRLEN-1,fp);	trim_line(line);	ysize=atoi(line);	tags.resize(ysize);	for(i=0;i<ysize;i++)	{		fgets(line,MAXSTRLEN-1,fp);		trim_line(line);		char *q=tag_str.push_back(line);		tags[i]=q;	}	printf("tags number: %d \n",ysize);	set_group();	//get cols	fgets(line,MAXSTRLEN-1,fp);	fgets(line,MAXSTRLEN-1,fp);	trim_line(line);	cols=atoi(line);	//load x	int index;	int x_num;		fgets(line,MAXSTRLEN-1,fp);	fgets(line,MAXSTRLEN-1,fp);	trim_line(line);	x_num=atoi(line);	for(i=0;i<x_num;i++)	{		fgets(line,MAXSTRLEN-1,fp);		trim_line(line);		vector<char *> columns;		split_string(line,"\t",columns);		char *q=x_str.push_back(columns[0]);		index=atoi(columns[1]);		xindex.insert(make_pair(q,index));	}	if(version!=40)	{		//load fmap		fgets(line,MAXSTRLEN-1,fp);//skip enter		fgets(line,MAXSTRLEN-1,fp);		trim_line(line);		int mapping_size=atoi(line);		fmap_size=mapping_size;		fmap=new int[fmap_size];		for(i=0;i<fmap_size;i++)		{			fgets(line,MAXSTRLEN-1,fp);			trim_line(line);			fmap[i]=atoi(line);		}	}	//load lambda	fgets(line,MAXSTRLEN-1,fp);//skip enter	fgets(line,MAXSTRLEN-1,fp);	trim_line(line);	lambda_size=atoi(line);	if(version==40)	{//create fmap		fmap_size=lambda_size;		fmap=new int[fmap_size];		for(i=0;i<fmap_size;i++)			fmap[i]=i;	}	lambda=new double[lambda_size];	for(i=0;i<lambda_size;i++)	{		fgets(line,MAXSTRLEN-1,fp);		trim_line(line);		lambda[i]=atof(line);	}	printf("%d lambda loaded\n",i);	fclose(fp);	path_num=pow((double)ysize,order+1);	node_anum=pow((double)ysize,order);//alpha(beta) number of each node	head_offset=-log((double)ysize)*order;	return true;}void CRF::unload()//unload the sequence data to file to set memory free{	FILE *fp=fopen("__data1","wb");	fwrite(work_space,sizeof(char),work_size/2,fp);	fclose(fp);	fp=fopen("__data2","wb");	fwrite(work_space+work_size/2,sizeof(char),work_size-work_size/2,fp);	fclose(fp);}void CRF::load()//unload the sequence data to file to set memory free{	FILE *fp=fopen("__data1","rb");	fread(work_space,sizeof(char),work_size/2,fp);	fclose(fp);	fp=fopen("__data2","rb");	fread(work_space+work_size/2,sizeof(char),work_size-work_size/2,fp);	fclose(fp);}void CRF::adjust_data(){	int i,j,k;	lambda_size=0;	for(i=0;i<fmap_size;i++)	{		if(fmap[i]!=-1 && lambda[fmap[i]]!=0)		{			if(algorithm==AP_ALGORITHM||algorithm==PA_ALGORITHM)//copy lambda				gradient[lambda_size]=lambda[fmap[i]];			fmap[i]=lambda_size++;		}		else			fmap[i]=-1;	}	//adjust cliques	if(chain_type==SIMPLE_CHAIN)	{		for(i=0;i<path_num && fmap[transit+i]<0;i++);		if(i==path_num){			for(i=0;i<path_num;i++)				 fmap[transit+i]=-2;		}	}	for(i=0;i<sequence_num;i++){		if(chain_type==GENERAL_CHAIN){			sequence &seq=sequences[i];			for(j=0;j<seq.node_num;j++){				node &nod=seq.nodes[j];				for(int k=0;k<nod.clique_num;k++){					if(!nod.cliques[k])						continue;					clique &cli=*nod.cliques[k];					for(int ii=0;ii<cli.feature_num;){						int jj;						for(jj=0;jj<templet_group[cli.groupid].size() && fmap[cli.fvector[ii]+jj]<0;jj++);//==-1 || ==-2						if(jj==templet_group[cli.groupid].size()){							for(jj=0;jj<templet_group[cli.groupid].size();jj++)								fmap[cli.fvector[ii]+jj]=-2;//ready to remove corresponding feature string							for(jj=ii;jj<cli.feature_num-1;jj++)								cli.fvector[jj]=cli.fvector[jj+1];							cli.feature_num--;						}else{							ii++;						}					}					if(cli.feature_num==0)						cli.fvector=NULL;				}			}		}else if(chain_type==FIRST_CHAIN||chain_type==SIMPLE_CHAIN){			sequence1 &seq1=sequence1s[i];			for(j=0;j<seq1.vertex_num;j++){				vertex &vtx=seq1.vertexes[j];				for(k=0;k<vtx.feature_num;)				{					int ii;					for(ii=0;ii<ysize && fmap[vtx.fvector[k]+ii]<0;ii++);//==-1 || ==-2					if(ii==ysize){						for(ii=0;ii<ysize;ii++)							fmap[vtx.fvector[k]+ii]=-2;						for(ii=k;ii<vtx.feature_num-1;ii++)							vtx.fvector[ii]=vtx.fvector[ii+1];						vtx.feature_num--;					}else{						k++;					}				}				if(vtx.feature_num==0)					vtx.fvector=NULL;				if(j && chain_type==FIRST_CHAIN){					edge &e=seq1.edges[j-1];					for(k=0;k<e.feature_num;)					{						int ii;						for(ii=0;ii<path_num && fmap[e.fvector[k]+ii]<0;ii++);//==-1 || ==-2						if(ii==path_num){							for(ii=0;ii<path_num;ii++)								fmap[e.fvector[k]+ii]=-2;							for(ii=k;ii<e.feature_num-1;ii++)								e.fvector[ii]=e.fvector[ii+1];							e.feature_num--;						}else{							k++;						}					}					if(e.feature_num==0)						e.fvector=NULL;				}			}		}	}	if(algorithm==AP_ALGORITHM||algorithm==PA_ALGORITHM)//copy lambda		memcpy(lambda,gradient,sizeof(double)*lambda_size);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -