⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 convertchunk.cpp

📁 Hieu Xuan Phan & Minh Le Nguyen 利用CRF统计模型写的可用于英文命名实体识别、英文分词的工具(开放源码)。CRF模型最早由Lafferty提出
💻 CPP
字号:
/* * Copyright (C) 2004 - 2005 by *     Hieu Xuan Phan & Minh Le Nguyen {hieuxuan, nguyenml}@jaist.ac.jp *     Graduate School of Information Science, *     Japan Advanced Institute of Science and Technology (JAIST) * * convertchunk.cpp - this file is part of FlexCRFs. * * Begin:	Nov. 26, 2005 * Last change:	Nov. 26, 2005 * * FlexCRFs is a free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published * by the Free Software Foundation; either version 2 of the License, * or (at your option) any later version. * * FlexCRFs is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with FlexCRFs; if not, write to the Free Software Foundation, * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */#include <map>#include "../../../include/convertchunk.h"using namespace std;string strtail(string & str) {    return str.substr(1, str.size() - 1);}void convert_IOB2_IOB1(int is_cap, int is_last, sequence & seq) {    map<int, int> fixedlabels;    map<int, int>::iterator obsrit;    char begin = is_cap ? 'B' : 'b';    char inside = is_cap ? 'I' : 'i';    string INSIDE = is_cap ? "I" : "i";    string str1, str2, istr, newlabel;    int i, len = seq.size();            for (i = 1; i < len; i++) {	int col1 = is_last ? seq[i - 1].size() - 1 : seq[i - 1].size() - 2;	int col2 = is_last ? seq[i].size() - 1 : seq[i].size() - 2;		str1 = seq[i - 1][col1];	str2 = seq[i][col2];		if (str2[0] != begin) {	    continue;	}		istr = INSIDE;	istr += strtail(str2);		if (str1 == str2 || str1 == istr) {	    fixedlabels.insert(pair<int, int>(i, i));	}    }        for (i = 0; i < len; i++) {	obsrit = fixedlabels.find(i);	if (obsrit != fixedlabels.end()) {	    continue;	}		int col = is_last ? seq[i].size() - 1 : seq[i].size() - 2;	if (seq[i][col][0] == begin) {	    // B- or b- => I- or i-	    newlabel = INSIDE;	    newlabel += strtail(seq[i][col]);	    seq[i][col] = newlabel;	}		    }}void convert_IOB1_IOB2(int is_cap, int is_last, sequence & seq) {    vector<int> firstlabels;    char begin = is_cap ? 'B' : 'b';    string BEGIN = is_cap ? "B" : "b";    char inside = is_cap ? 'I' : 'i';    string str1, str2, bstr, newlabel;    int i, len = seq.size();                for (i = 0; i < len; i++) {	int col2 = is_last ? seq[i].size() - 1 : seq[i].size() - 2;		str2 = seq[i][col2];		if (str2[0] != inside) {	    continue;	}		if (i == 0) {	    firstlabels.push_back(i);		} else {	    int col1 = is_last ? seq[i - 1].size() - 1 : seq[i - 1].size() - 2;	    str1 = seq[i - 1][col1];		    bstr = BEGIN;	    bstr += strtail(str2);	    	    if (str2 != str1 && str1 != bstr) {		firstlabels.push_back(i);	    }	}    }    for (i = 0; i < firstlabels.size(); i++) {	int col = is_last ? seq[firstlabels[i]].size() - 1 : seq[firstlabels[i]].size() - 2;		newlabel = BEGIN;	newlabel += strtail(seq[firstlabels[i]][col]);	seq[firstlabels[i]][col] = newlabel;    }}void convert_IOE2_IOE1(int is_cap, int is_last, sequence & seq) {    map<int, int> fixedlabels;    map<int, int>::iterator obsrit;    char inside = is_cap ? 'I' : 'i';    string INSIDE = is_cap ? "I" : "i";    char end = is_cap ? 'E' : 'e';    string str1, str2, istr, newlabel;    int i, len = seq.size();                for (i = 0; i < len - 1; i++) {	int col1 = is_last ? seq[i].size() - 1 : seq[i].size() - 2;	int col2 = is_last ? seq[i + 1].size() - 1 : seq[i + 1].size() - 2;	str1 = seq[i][col1];	str2 = seq[i + 1][col2];		if (str1[0] != end) {	    continue;	}		istr = INSIDE;	istr += strtail(str1);		if (str2 == str1 || str2 == istr) {	    fixedlabels.insert(pair<int, int>(i, i));	}    }        for (i = 0; i < len; i++) {	obsrit = fixedlabels.find(i);	if (obsrit != fixedlabels.end()) {	    continue;	}		int col = is_last ? seq[i].size() - 1 : seq[i].size() - 2;	if (seq[i][col][0] == end) {	    // E- or e- => I- or i-	    newlabel = INSIDE;	    newlabel += strtail(seq[i][col]);	    seq[i][col] = newlabel;	}		    }}void convert_IOE1_IOE2(int is_cap, int is_last, sequence & seq) {    vector<int> endlabels;        char end = is_cap ? 'E' : 'e';    string END = is_cap ? "E" : "e";    char inside = is_cap ? 'I' : 'i';    string str1, str2, estr, newlabel;    int i, len = seq.size();                for (i = 0; i < len; i++) {	int col1 = is_last ? seq[i].size() - 1 : seq[i].size() - 2;		str1 = seq[i][col1];		if (str1[0] != inside) {	    continue;	}		if (i == len - 1) {	    endlabels.push_back(i);		} else {	    int col2 = is_last ? seq[i + 1].size() - 1 : seq[i + 1].size() - 2;	    str2 = seq[i + 1][col2];		    estr = END;	    estr += strtail(str1);	    	    if (str2 != str1 && str2 != estr) {		endlabels.push_back(i);	    }	}    }    for (i = 0; i < endlabels.size(); i++) {	int col = is_last ? seq[endlabels[i]].size() - 1 : seq[endlabels[i]].size() - 2;		newlabel = END;	newlabel += strtail(seq[endlabels[i]][col]);	seq[endlabels[i]][col] = newlabel;    }}void convert_IOB2_IOE2(int is_cap, int is_last, sequence & seq) {    map<int, int> endlabels;    map<int, int>::iterator elbit;        char begin = is_cap ? 'B' : 'b';    char inside = is_cap ? 'I' : 'i';    string INSIDE = is_cap ? "I" : "i";    char end = is_cap ? 'E' : 'e';    string END = is_cap ? "E" : "e";        string str1, str2, istr, str, newlabel;    int i, len = seq.size();        for (i = 0; i < len; i++) {	int col1 = is_last ? seq[i].size() - 1 : seq[i].size() - 2;	str1 = seq[i][col1];		if (str1[0] != begin && str1[0] != inside) {	    continue;	}		if (i == len - 1) {	    endlabels.insert(pair<int, int>(i, i));	} else {	    int col2 = is_last ? seq[i + 1].size() - 1 : seq[i + 1].size() - 2;	    str2 = seq[i + 1][col2];	    	    istr = INSIDE;	    istr += strtail(str1);	    	    if (str1[0] == begin && str2 != istr) {		endlabels.insert(pair<int, int>(i, i));	    	    } else if (str1[0] == inside && str2 != str1) {		endlabels.insert(pair<int, int>(i, i));	    	    }	}    }        for (i = 0; i < len; i++) {	int col = is_last ? seq[i].size() - 1 : seq[i].size() - 2;	str = seq[i][col];		if (str[0] != begin && str[0] != inside) {	    continue;	}		elbit = endlabels.find(i);	if (elbit != endlabels.end()) {	    newlabel = END;	    newlabel += strtail(str);	    seq[i][col] = newlabel;		} else {	    newlabel = INSIDE;	    newlabel += strtail(str);	    seq[i][col] = newlabel;	}	    }} void convert_IOE2_IOB2(int is_cap, int is_last, sequence & seq) {    map<int, int> beginlabels;    map<int, int>::iterator blbit;        char begin = is_cap ? 'B' : 'b';    string BEGIN = is_cap ? "B" : "b";    char inside = is_cap ? 'I' : 'i';    string INSIDE = is_cap ? "I" : "i";    char end = is_cap ? 'E' : 'e';        string str1, str2, istr, str, newlabel;    int i, len = seq.size();        for (i = 0; i < len; i++) {	int col2 = is_last ? seq[i].size() - 1 : seq[i].size() - 2;	str2 = seq[i][col2];		if (str2[0] != end && str2[0] != inside) {	    continue;	}		if (i == 0) {	    beginlabels.insert(pair<int, int>(i, i));	} else {	    int col1 = is_last ? seq[i - 1].size() - 1 : seq[i - 1].size() - 2;	    str1 = seq[i - 1][col1];	    	    istr = INSIDE;	    istr += strtail(str2);	    	    if (str2[0] == end && str1 != istr) {		beginlabels.insert(pair<int, int>(i, i));	    	    } else if (str2[0] == inside && str1 != str2) {		beginlabels.insert(pair<int, int>(i, i));	    	    }	}    }        for (i = 0; i < len; i++) {	int col = is_last ? seq[i].size() - 1 : seq[i].size() - 2;	str = seq[i][col];		if (str[0] != end && str[0] != inside) {	    continue;	}		blbit = beginlabels.find(i);		if (blbit != beginlabels.end()) {	    newlabel = BEGIN;	    newlabel += strtail(str);	    seq[i][col] = newlabel;		} else {	    newlabel = INSIDE;	    newlabel += strtail(str);	    seq[i][col] = newlabel;	}	    }} void convert_IOB2_IOE1(int is_cap, int is_last, sequence & seq) {    convert_IOB2_IOE2(is_cap, is_last, seq);    convert_IOE2_IOE1(is_cap, is_last, seq);}void convert_IOE1_IOB2(int is_cap, int is_last, sequence & seq) {    convert_IOE1_IOE2(is_cap, is_last, seq);    convert_IOE2_IOB2(is_cap, is_last, seq);}void convert_IOB1_IOE1(int is_cap, int is_last, sequence & seq) {    convert_IOB1_IOB2(is_cap, is_last, seq);    convert_IOB2_IOE1(is_cap, is_last, seq);}void convert_IOE1_IOB1(int is_cap, int is_last, sequence & seq) {    convert_IOE1_IOB2(is_cap, is_last, seq);    convert_IOB2_IOB1(is_cap, is_last, seq);}void convert_IOB1_IOE2(int is_cap, int is_last, sequence & seq) {    convert_IOB1_IOB2(is_cap, is_last, seq);    convert_IOB2_IOE2(is_cap, is_last, seq);}void convert_IOE2_IOB1(int is_cap, int is_last, sequence & seq) {    convert_IOE2_IOB2(is_cap, is_last, seq);    convert_IOB2_IOB1(is_cap, is_last, seq);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -