📄 crf.h

📁 pocket_crf_0.45
💻 H
字号:
/****************************************************************************************************************//*	Pocket CRF 0.45/*	Copyright(c) 2005-2008 Taku Kudo <taku@chasen.org>/****************************************************************************************************************/#ifndef CRF_H#define CRF_H#include <map>#include <vector>#include <string>#include <stdio.h>#include "freelist.h"#include "crf_thread.h"using namespace std;const int SIMPLE_CHAIN=0;const int FIRST_CHAIN=1;//first order markov chainconst int GENERAL_CHAIN=2;const int CRF_ALGORITHM=0;const int AP_ALGORITHM=1;const int PA_ALGORITHM=2;const int SPEED_PRIOR=0;const int MEM_PRIOR=1;class str_length{	public:		size_t operator()(const char *str) const  {return strlen(str)+1;}};typedef freelist<char, str_length> charlist;typedef struct templet{	vector<string> words;	vector<pair<int,int> > x;	vector<int> y;	bool end_of_group;	int groupid;}templet;/*define the templet "abc%x[1,0]xyz%x[-1,0]y[0]" in templet filehere,words={"abc","xyz",""}x={[1,0],[-1,0]}y={0}templets share the same y are clustered into one group:%x[0,0]%y[0]			groupid=0	end_of_group=true%x[0,0]%y[-1]%y[0]		groupid=1	end_of_group=falsex[-2,0]%y[-1]%y[0]		groupid=1	end_of_group=true*/class templet_cmp{public:	bool operator()(const templet &s, const templet &t) const	{		for(int i=0;i<s.y.size();i++)		{			if(s.y[i]<t.y[i])				return false;			else if(s.y[i]>t.y[i])				return true;		}		return false;	}};class str_cmp{public:	bool operator()(const char* s, const char* t) const 	{		return strcmp(s,t)<0;	}};template <class T>class inverse_cmp{	public:	bool operator()(T s, T t) const 	{		return s>t;	}};struct node;typedef struct clique{	int *fvector;	int feature_num;	node **nodes;	int node_num;	int groupid;	int key;}clique;/*for a concrete x, CRF use all templets for matching, and creates one clique for each templet groupfvector[i] ~ fvector[i] + ysize^node_num is the lambda space for the i th templet in the group(ysize is the tag number, for "BIO" tagging, ysize=3)feature_num is the group size.nodes is the tokens linked by the clique, e.g. for "y[-1]y[0]" clique, the previous and current tokens are linkednode_num is the number of nodes, e.g. it is 2 for "y[-1]y[0]" cliquekey: if the nodes are tagged, then clique.key is the index of the corresponding lambda in the clique's lambda spacee.g. for "y[-1]y[0]" clique, nodes are tagged as "IB", thenkey = 2*3^1+0*3^0 = 6, i.e. r*f_i = lambda[fvector[i]+6] (f_i=1 here)*/typedef struct node{public:	clique **cliques;	int clique_num;	int key;}node;/*cliques are the cliques linked with current node*/typedef struct sequence{	node* nodes;	int node_num;}sequence;/*node is the structure of token, path is the edge.For "BIO" tagging, a sequence "x1...xn" contains n nodes,if the order of CRF is 1, i.e. uses bigram features at most, thenthe path number is n * 3 * 3B	  .			.I	  .			.O	  .			.	node_i	node_{i+1}  corresponding paths: (i_B,{i+1}_B),(i_B,{i+1}_I),...(i_O,{i+1}_O)*//*following structures used for 1st order markov chain only*/typedef struct vertex{	int key;	int feature_num;	int *fvector;}vertex;typedef struct edge{	int feature_num;	int *fvector;}edge;typedef struct sequence1{	int vertex_num;	vertex *vertexes;	edge *edges;}sequence1;class crf_thread;class CRF{friend class crf_thread;public:	CRF();	CRF(double C_, int freq_thresh_, bool margin_, int nbest_);//old api	bool set_para(char *para_name, char *para_value);	~CRF();	bool learn(char* templet_file, char *training_file, char *model_file);	bool load_model(char *model_file);	void tag(vector<vector<string> > &table, vector<vector<string> > &best_tag,vector<double> &sequencep, vector<vector<double> > &nodep);	void tag(vector<vector<string> > &table, vector<vector<string> > &best_tag);	//see readme.htmlprivate:	bool load_templet(char *templet_file);	//load templets from file.	bool add_templet(char *line);	//add templets, line is the use defined templet string, cur_group is the number of group currently formed.	bool set_order();	//set groupid,end_of_group,order	void set_group();	//after templets loading, set templet_group	void set_chain_type();	//set chain_type	bool check_training(char *training_file);	//check the training file , get x cols, tag number	bool generate_features(char *training_file);	//generate features according to templates and training file	void compress();	void adjust_data();	//adjust fmap, lambda_size	vector<templet> templets;	//templates	vector<vector<vector<int> > > templet_group;	//templet_group[i] is the i th group	//templet_group[i].size() is the number of parameters number of i th group (size of the parameter space)	//e.g. for "y[-1]y[0]" group, its size is 3*3=9	//templet_group[i][j] is the indexes of the paths that affect the i th group, j th parameter	//for 2 order CRF, ("y[-2]y[-1]y[0]"), group[i] is "y[-2]y[0]" group, tagged with IO, 	//group[i][1*3^1+2*3^0]=group[i][5]	//then paths that affected the 5th parameter are the {5,14,23} th paths	//group[i][5]={5,14,23}	int gsize;//templet_group.size()	//x	map<char *, int, str_cmp> xindex;//<"1:Confidence", 132> > 132 th x	vector<int> x_freq;//x_freq[i] is the frequency of the i th x	charlist x_str;//space storing x	vector<char *>tags;//sorted in alphabet order	charlist tag_str;//space storing tags		double *lambda;//parameters	size_t lambda_size;//parameter size	int cols;//cols of training file	int ysize;//tags.size()	int order;//order of CRF	double sigma;//sigma||\lambda||^2	double l1;//l1 factor	int freq_thresh;//see readme.html	int max_iter;//max iteration number	double eta;//controls the iteration accuracy	double* gradient;//gradient=d L(\lambda) / d \lambda	double fvalue;//L(\lambda)	//L(\lambda)=\sum_{sequences} [ z(\lambda) - \sum_{f \in y} \lambda * f(x,y) ]	//z(\lambda)=\log{\sum_y exp(\sum_{f \in y} \lambda * f(x,y)) }	vector<sequence> sequences_tmp;	sequence *sequences;	sequence1 *sequence1s;	int sequence_num;	freelist<node> nodes;//storing all nodes	freelist<clique> cliques;//storing all cliques	freelist<node*> clique_node;//storing clique linked nodes' addresses	freelist<clique*> node_clique;//storing nodes linked cliques' addresses	freelist<int> clique_feature;//clique->feature		bool add_x(vector<char *> &table);//add x from table to xindex	bool insert_x(char *x, int & index);//insert x to xindex, return its index	void shrink_feature();//eliminate the x with frequency less than freq_thresh	void write_model(char *model_file,bool first_part);	//test code	void generate_sequence(vector<vector<string> > &table, sequence &seq);	//generate seq according to user's input table	bool margin;	//true if margin probabilities for each node calculation is required	int nbest;	//output nbest results	//global temporary variables	int path_num;//=pow((double)ysize,order+1);	int node_anum;//pow((double)ysize,order),alpha(beta) number of each node	double head_offset;//=log((double)ysize)*order; node_{-1}'s prior possibility	vector<int> fmap_tmp;	int *fmap;	int fmap_size;	int version;	vector<crf_thread> threads;	unsigned int thread_num;	char *work_space;	unsigned long work_size;	void unload();	void load();	int chain_type;	int transit;	int max_seq_size;	double *transit_buff;	int algorithm;	int total_nodes;	int depth;	int prior;};#endif
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -