📄 featuregen.cpp

📁 Hieu Xuan Phan & Minh Le Nguyen 利用CRF统计模型写的可用于英文命名实体识别、英文分词的工具（开放源码）。CRF模型最早由Lafferty提出
💻 CPP
字号:
/* * Copyright (C) 2004 - 2005 by *     Hieu Xuan Phan & Minh Le Nguyen {hieuxuan, nguyenml}@jaist.ac.jp *     Graduate School of Information Science, *     Japan Advanced Institute of Science and Technology (JAIST) * * featuregen.cpp - this file is part of FlexCRFs. * * Begin:	Dec. 15, 2004 * Last change:	Oct. 29, 2005 * * FlexCRFs is a free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published * by the Free Software Foundation; either version 2 of the License, * or (at your option) any later version. * * FlexCRFs is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with FlexCRFs; if not, write to the Free Software Foundation, * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */#include <ctype.h>#include "../../../include/featuregen.h"#include "../../../include/data.h"#include "../../../include/strtokenizer.h"#include "../../../include/dictionary.h"using namespace std;// generating featuresvoid featuregen::gen_features() {    features.clear();    sfeatures.clear();    efeatures.clear();        if (!pdata || !pdict) {	return;    }        if (!(pdata->ptrndata)) {	return;    }        dataset::iterator datait;    sequence::iterator seqit;        feature f;        // scan over all data sequences    for (datait = pdata->ptrndata->begin(); datait != pdata->ptrndata->end(); datait++) {		// scan over all observations in each data sequence	int pos = 0;	for (seqit = datait->begin(); seqit != datait->end(); seqit++) {	    	    // generating edge feature (type 1)	    if (pos > 0) {		// for both first- and second-order Markov		f.efeature1_init(seqit->label, (*datait)[pos - 1].label);		f.strid2idx(fmap);		if (f.idx == -1) {		    // new feature, thus add to the feature list		    add_feature(f);		    // add the new edge feature to the vector of edge features		    efeatures.push_back(f);		}	    }	    	    // create edge feature (type 2)	    if (popt->order == SECOND_ORDER && pos > 0) {		f.efeature2_init(seqit->label2order, (*datait)[pos - 1].label2order);		f.strid2idx(fmap);		if (f.idx == -1) {		    // new feature, thus add to the feature list		    add_feature(f);		    // add the new edge feature to the vector of edge features		    efeatures.push_back(f);		}	    }	    	    // generating state features	    // scan over all context predicates	    vector<int>::iterator cpit;	    for (cpit = (seqit->cps).begin(); cpit != (seqit->cps).end(); cpit++) {				// do not generate too rare features		map<int, element>::iterator dictit;		map<int, pair<int, int> >::iterator labelit, label2orderit;				int create_sfeature2 = 0;				dictit = pdict->dict.find(*cpit);		if(dictit != pdict->dict.end()) {		    if (dictit->second.count <= popt->cp_rare_threshold) {			continue;		    }		    		    		    int f_rare_threshold = popt->f_rare_threshold;		    		    if (popt->multiple_f_rare_thresholds) {			// using multiple rare thresholds for features			mapcpint2str::iterator cpmapit;			cpmapit = pdata->pcpi2s->find(*cpit);			if (cpmapit != pdata->pcpi2s->end()) {			    if (isdigit(cpmapit->second[0])) {				f_rare_threshold = cpmapit->second[0] - '0';			    } else if (cpmapit->second.size() >= 2 && cpmapit->second[0] == '#' 						    && isdigit(cpmapit->second[1])) {				f_rare_threshold = cpmapit->second[1] - '0';			    }			}		    }				    labelit = dictit->second.lb_cnt_fidxes.find(seqit->label);		    if (labelit != dictit->second.lb_cnt_fidxes.end()){			if (labelit->second.first <= f_rare_threshold) {			    continue;			}		    }		    		    		    label2orderit = dictit->second.lb2order_cnt_fidxes.find(seqit->label2order);		    if (label2orderit != dictit->second.lb2order_cnt_fidxes.end()) {			if (label2orderit->second.first > f_rare_threshold) {			    create_sfeature2 = 1;			}		    }		}						// create a state feature (type 1)		f.sfeature1_init(seqit->label, *cpit);		f.strid2idx(fmap);		if (f.idx == -1) {		    // new feature, add to the feature list		    add_feature(f);		    labelit->second.second = f.idx;		    dictit->second.chosen = 1;		}				if (create_sfeature2) {		    mapcpint2str::iterator cpmapit;		    cpmapit = pdata->pcpi2s->find(*cpit);		    if (cpmapit != pdata->pcpi2s->end()) {			if (cpmapit->second[0] != '#') {			    create_sfeature2 = 0;			}		    }		}				if (popt->order == SECOND_ORDER && create_sfeature2) {		    // create a state feature (type 2)		    f.sfeature2_init(seqit->label2order, *cpit);		    f.strid2idx(fmap);		    if (f.idx == -1) {			// new feature, add to the feature list			add_feature(f);			label2orderit->second.second = f.idx;		    }		}	    }	    	    pos++;	}    }        // update the number of features    if (popt) {	popt->num_features = features.size();    }}// write all features to filevoid featuregen::write_features(FILE * fout) {    list<feature>::iterator fit;        // write number of features    fprintf(fout, "%d\n", features.size());        for (fit = features.begin(); fit != features.end(); fit++) {	fprintf(fout, "%s\n", (fit->to_string(*(pdata->pcpi2s), *(pdata->plbi2s), *(pdata->plb2to1))).c_str());    }        fprintf(fout, "##################################################\n");}// read features from filevoid featuregen::read_features(FILE * fin) {    features.clear();    efeatures.clear();        string line;    char buff[BUFF_SIZE_SHORT];        fgets(buff, BUFF_SIZE_SHORT - 1, fin);    int num_features = atoi(buff);        for (int i = 0; i < num_features; i++) {	fgets(buff, BUFF_SIZE_SHORT - 1, fin);	line = buff;		strtokenizer tok(line, " ");	if (tok.count_tokens() != 3) {	    continue;	}    		// create a new feature by parsing the line	feature f(line, *(pdata->pcps2i), *(pdata->plbs2i), *(pdata->plb2to1s2i));		map<string, int>::iterator it;	it = fmap.find(f.strid);	if (it == fmap.end()) {	    // insert to the feature map	    fmap.insert(pair<string, int>(f.strid, f.idx));	    features.push_back(f);	    	    if (f.ftype == EDGE_FEATURE1 || f.ftype == EDGE_FEATURE2) {		efeatures.push_back(f);	    }	}    }    fgets(buff, BUFF_SIZE_SHORT - 1, fin);	// read the line ###...        // update the number of features    if (popt) {	popt->num_features = features.size();    }}// scan all features at position posvoid featuregen::start_scan_features_at(sequence & seq, int pos) {    start_scan_sfeatures_at(seq, pos);    start_scan_efeatures();}// have more features?int featuregen::has_next_feature() {    return (has_next_efeature() || has_next_sfeature());}// get the next featurevoid featuregen::next_feature(feature & f) {    if (has_next_efeature()) {	next_efeature(f);    } else if (has_next_sfeature()) {	next_sfeature(f);    } else {	// do nothing    }}// scan all state featuresvoid featuregen::start_scan_sfeatures_at(sequence & seq, int pos) {    sfeatures.clear();    sf_idx = 0;        // get the pointer to the observation at pos    obsr * pobsr = &seq[pos];        vector<int>::iterator cpit;    map<int, element>::iterator dictit;    map<int, pair<int, int> >::iterator labelit, label2orderit, label2orderit1;        feature sf;        // scan over all context predicates    for (cpit = pobsr->cps.begin(); cpit != pobsr->cps.end(); cpit++) {	dictit = pdict->dict.find(*cpit);		if (dictit == pdict->dict.end()) {	    continue;	}		if (!(dictit->second.is_scanned)) {	    // scan all first-order labels for state feature (type 1)	    for (labelit = dictit->second.lb_cnt_fidxes.begin(); 		    labelit != dictit->second.lb_cnt_fidxes.end(); labelit++) {		if (labelit->second.second >= 0) {		    sf.sfeature1_init(labelit->first, *cpit);		    sf.idx = labelit->second.second;		    		    if (popt->highlight_feature) {			// highlight feature			sf.val += (double)labelit->second.first / dictit->second.count;		    }		    		    dictit->second.cpfeatures.push_back(sf);		}	    }	    	    // scan all second-order labels for state feature (type 2)	    for (label2orderit = dictit->second.lb2order_cnt_fidxes.begin(); 		    label2orderit != dictit->second.lb2order_cnt_fidxes.end(); label2orderit++) {		if (label2orderit->second.second >= 0) {		    sf.sfeature2_init(label2orderit->first, *cpit);		    sf.idx = label2orderit->second.second;		    		    if (popt->highlight_feature) {			// highlight feature			int prev_label = label2orderit->first / popt->num_labels;			int sum_count = 0;			for (label2orderit1 = dictit->second.lb2order_cnt_fidxes.begin();				    label2orderit1 != dictit->second.lb2order_cnt_fidxes.end(); 			    	    label2orderit1++) {							    if (label2orderit1->second.second >= 0 &&					(label2orderit1->first / popt->num_labels) == prev_label) {				sum_count += label2orderit1->second.first;							    }			}		    			if (sum_count > 0) {			    sf.val += (double)label2orderit->second.first / sum_count;			    			}		    }		    		    dictit->second.cpfeatures.push_back(sf);		}	    }	    		    dictit->second.is_scanned = 1;	}		for (int i = 0; i < dictit->second.cpfeatures.size(); i++) {	    sfeatures.push_back(&(dictit->second.cpfeatures[i]));	}    }}// has more state features?int featuregen::has_next_sfeature() {    return (sf_idx < sfeatures.size());}// get the next state featurevoid featuregen::next_sfeature(feature & sf) {    sf = *(sfeatures[sf_idx++]);}// scan all edge featuresvoid featuregen::start_scan_efeatures() {    ef_idx = 0;}// have more edge featureint featuregen::has_next_efeature() {    return (ef_idx < efeatures.size());}// get the next edge featurevoid featuregen::next_efeature(feature & ef) {    ef = efeatures[ef_idx++];}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -