📄 option.cpp

📁 Hieu Xuan Phan & Minh Le Nguyen 利用CRF统计模型写的可用于英文命名实体识别、英文分词的工具（开放源码）。CRF模型最早由Lafferty提出
💻 CPP
字号:
/* * Copyright (C) 2004 - 2005 by *     Hieu Xuan Phan & Minh Le Nguyen {hieuxuan, nguyenml}@jaist.ac.jp *     Graduate School of Information Science, *     Japan Advanced Institute of Science and Technology (JAIST) * * option.cpp - this file is part of FlexCRFs. * * Begin:	Dec. 15, 2004 * Last change:	Nov. 06, 2005 * * FlexCRFs is a free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published * by the Free Software Foundation; either version 2 of the License, * or (at your option) any later version. * * FlexCRFs is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with FlexCRFs; if not, write to the Free Software Foundation, * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */#include <stdio.h>#include <string>#include "../../../include/option.h"#include "../../../include/strtokenizer.h"using namespace std;void option::read_and_parse(FILE * fin) {    char buff[BUFF_SIZE_SHORT];    string line;        while (fgets(buff, BUFF_SIZE_SHORT - 1, fin)) {	line = buff;    	// find '#' character: if found, the current line is a comment	int i = 0;	while (i < line.size() && (line[i] == ' ' || line[i] == '\t')) {	    i++;	}		if (i < line.size() && line[i] == '#') {	    continue;	}			strtokenizer tok(line, "= \t\r\n");	int len = tok.count_tokens();	if (len != 2) {	    // invalid, ignore this line	    continue;	}		string optstr = tok.next_token();	string optval = tok.next_token();		if (optstr == "traindata_file") {	    trndata_file = optval;		} else if (optstr == "testdata_file") {	    tstdata_file = optval;		} else if (optstr == "unlbldata_file") {	    ulbdata_file = optval;	    	} else if (optstr == "model_file") {	    model_file = optval;	    	} else if (optstr == "order") {	    order = atoi(optval.c_str());	    	} else if (optstr == "label_of_first_observation") {	    lfostr = optval;		} else if (optstr == "trainlog_file") {	    trainlog_file = optval;		} else if (optstr == "num_labels") {	    num_labels = atoi(optval.c_str());		} else if (optstr == "num_trnseqs") {	    num_trnseqs = atoi(optval.c_str());	    	} else if (optstr == "num_tstseqs") {	    num_tstseqs = atoi(optval.c_str());	    	} else if (optstr == "num_ulbseqs") {	    num_ulbseqs = atoi(optval.c_str());	    	} else if (optstr == "num_cps") {	    num_cps = atoi(optval.c_str());	    	} else if (optstr == "num_features") {	    num_features = atoi(optval.c_str());		} else if (optstr == "f_rare_threshold") {	    f_rare_threshold = atoi(optval.c_str());	    	} else if (optstr == "cp_rare_threshold") {	    cp_rare_threshold = atoi(optval.c_str());	    	} else if (optstr == "multiple_f_rare_thresholds") {	    multiple_f_rare_thresholds = atoi(optval.c_str());	} else if (optstr == "highlight_feature") {	    highlight_feature = atoi(optval.c_str());	    	} else if (optstr == "nbest") {	    nbest = atoi(optval.c_str());	    	} else if (optstr == "num_iterations") {	    num_iterations = atoi(optval.c_str());		} else if (optstr == "init_lambda_val") {	    init_lambda_val = atof(optval.c_str());			} else if (optstr == "sigma_square") {	    sigma_square = atof(optval.c_str());		} else if (optstr == "eps_for_convergence") {	    eps_for_convergence = atof(optval.c_str());		} else if (optstr == "m_for_hessian") {	    m_for_hessian = atoi(optval.c_str());	    		} else if (optstr == "debug_level") {	    debug_level = atoi(optval.c_str());		} else if (optstr == "is_scaling") {	    is_scaling = atoi(optval.c_str());		} else if (optstr == "is_logging") {	    is_logging = atoi(optval.c_str());		} else if (optstr == "evaluate_during_training") {	    evaluate_during_training = atoi(optval.c_str());			} else if (optstr == "chunk_evaluate_during_training") {	    chunk_evaluate_during_training = atoi(optval.c_str());	    	} else if (optstr == "chunktype") {	    if (optval == "IOB1" || optval == "iob1") {		chunktype = IOB1;	    }	    if (optval == "IOB2" || optval == "iob2") {		chunktype = IOB2;	    }	    if (optval == "IOE1" || optval == "ioe1") {		chunktype = IOE1;	    }	    if (optval == "IOE2" || optval == "ioe2") {		chunktype = IOE2;	    }	    	} else if (optstr == "chunk") {	    strtokenizer tok(optval, ":");	    int len = tok.count_tokens();	    if (len == 3) {		vector<string> tags;		for (int i = 0; i < len; i++) {		    tags.push_back(tok.token(i));		}		chunks.push_back(tags);	    }	} else if (optstr == "prevfixedlabels") {	    // examples:	    // for IOB2: prevfixedlabels=b-np:i-np|i-np	    // for IOB1: prevfixedlabels=b-np:i-np|b-np	    strtokenizer tok(optval, "|");	    if (tok.count_tokens() == 2) {		vector<string> cnt;				cnt.push_back(tok.token(1));				strtokenizer tok1(tok.token(0), ":");		for (int count = 0; count < tok1.count_tokens(); count++) {		    cnt.push_back(tok1.token(count));		}				prevfixedstrlabels.push_back(cnt);	    }	    	} else if (optstr == "nextfixedlabels") {	    // examples:	    // for IOE2: nextfixedlabels=i-np|i-np:e-np	    // for IOE1: nextfixedlabels=e-np|i-np:e-np	    strtokenizer tok(optval, "|");	    if (tok.count_tokens() == 2) {		vector<string> cnt;				cnt.push_back(tok.token(0));				strtokenizer tok1(tok.token(1), ":");		for (int count = 0; count < tok1.count_tokens(); count++) {		    cnt.push_back(tok1.token(count));		}				nextfixedstrlabels.push_back(cnt);	    }	} else {	}    }}void option::write_options(FILE * fout) {    fprintf(fout, "OPTION VALUES:\n\n");    fprintf(fout, "Model directory: %s\n", model_dir.c_str());    fprintf(fout, "Training data file: %s\n", trndata_file.c_str());    fprintf(fout, "Testing data file: %s\n", tstdata_file.c_str());    fprintf(fout, "Unlabeled data file: %s\n", ulbdata_file.c_str());    if (chunk_evaluate_during_training) {	if (chunktype == IOB1) {	    fprintf(fout, "Label representation: IOB1\n");	} else if (chunktype == IOB2) {	    fprintf(fout, "Label representation: IOB2\n");	} else if (chunktype == IOE1) {	    fprintf(fout, "Label representation: IOE1\n");	} else if (chunktype == IOE2) {	    fprintf(fout, "Label representation: IOE2\n");	}    }    fprintf(fout, "Model file: %s\n", model_file.c_str());    fprintf(fout, "Training log file (this one): %s\n\n", trainlog_file.c_str());                if (order == FIRST_ORDER) {	fprintf(fout, "First-order Markov CRFs\n\n");    } else if (order == SECOND_ORDER) {	fprintf(fout, "Second-order Markov CRFs\n\n");    }    fprintf(fout, "Number of labels: %d\n", num_labels);    fprintf(fout, "Number of training sequences: %d\n", num_trnseqs);    fprintf(fout, "Number of testing sequences: %d\n", num_tstseqs);    fprintf(fout, "Number of unlabeled sequences: %d\n", num_ulbseqs);    fprintf(fout, "Number of context predicates: %d\n", num_cps);    fprintf(fout, "Number of features: %d\n", num_features);    fprintf(fout, "Feature rare threshold: %d\n", f_rare_threshold);    fprintf(fout, "Context predicate rare threshold: %d\n", cp_rare_threshold);    fprintf(fout, "Using multiple rare thresholds for features: %d\n", multiple_f_rare_thresholds);    fprintf(fout, "Highlight feature: %d\n\n", highlight_feature);    fprintf(fout, "Number of training iterations: %d\n", num_iterations);    fprintf(fout, "Initial lambda value: %10.4f\n", init_lambda_val);    fprintf(fout, "Sigma square (for smoothing): %10.4f\n", sigma_square);    fprintf(fout, "Epsilon for L-BFGS convergence: %10.6f\n", eps_for_convergence);    fprintf(fout, "Number of approximated hessian matrixes: %d\n", m_for_hessian);}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -