📄 concatregexfeatures.java
字号:
package iitb.Model;import iitb.CRF.*;import java.util.regex.*;import java.util.*;import java.io.*;/** * ConcatRegexFeatures generates features by matching the token with the character patterns. * Character patterns are regular expressions for checking whether the token is capitalized word, * a number, small case word, whether the token contains any special characters and like. * It uses regular expression to match a sequence of character pattern and generates features * accordingly. * <P> * The feature generated here is whether a sequence of tokens has a particular sequence of given pattern or not. * For example, if a pattern is to mathc a capital word, then for two token context window, various features * generated are weither two token (bigram) sequence is having any of the following pattern or not: * (1) Capital, Capital * (2) Capital, Non-Capital * (3) Non-capital, Capital. * * You can use any window around the current token (segment) for creating regular expression based features. * Also, you can define your own patterns, by writing down the regular expression in a file, * whose format is specified below. * </p> * <p> * The object of this class should be wrap around {@link FeatureTypesEachLabel} as follows: * <pre> * new FeatureTypesEachLabel(model, new ConcreteConcatRegexFeatures(model,relSegmentStart, relSegmentEnd, maxMemory, patternFile)); * </pre> * </p> * A token in a token sequence has a index relative to the current token index, which is described below: * <pre> x0 x1 x2 x3 x4 x5 x6 x7 .... xn -4 -3 -2 -1 0 0 0 1 2 ... * </pre> * <p> * In above example, the current segment is from postion 4 to 6 with value of pos = 6 and prevPos = 3 in * startScanFeaturesAt() call of FeatureGenerator. * You can refer to any of the token relative to current position by using the index below the token sequence. * Thus, you can create a pattern concat features for any token sequence in the neighbourhood of the current token, * using relSegmentStart and relSegmentEnd. * For, example to create pattern for two tokens to the left of the current token, following is the parameters * to be passed to the constructor of the class: * </p> * <pre> * new FeatureTypesEachLabel(model, new ConcreteConcatRegexFeatures(model,-2, -1, maxMemory, patternFile)); * </pre> * * @author Imran Mansuri */ public class ConcatRegexFeatures extends FeatureTypes { /** * Various patterns are defined here. * First dimension of this two dimensional array is feature name and second value is the * regular expression pattern to be matched against a token. You can add your own patterns * in this array. */ String patternString[][] = { {"isWord", "[a-zA-Z][a-zA-Z]+" }, {"singleCapLetterWithDot", "[A-Z]\\." }, {"singleCapLetter", "[A-Z]" }, {"isDigits", "\\d+" }, {"singleDot", "[.]" }, {"singleComma", "[,]" }, {"isSpecialCharacter", "[#;:\\-/<>'\"()&]"}, {"containsSpecialCharacters",".*[#;:\\-/<>'\"()&].*"}, {"isInitCapital", "[A-Z][a-z]+" }, {"isAllCapital", "[A-Z]+" }, {"isAllSmallCase", "[a-z]+" }, {"isAlpha", "[a-zA-Z]+" }, {"isAlphaNumeric", "[a-zA-Z0-9]+" }, {"endsWithDot", "\\p{Alnum}+\\." }, {"endsWithComma", "\\w+[,]" }, {"endsWithPunctuation", "\\w+[;:,.?!]" }, {"singlePunctuation", "\\p{Punct}" }, {"singleAmp", "[&]" }, {"containsDigit", ".*\\d+.*" }, {"singleDigit", "\\s*\\d\\s*" }, {"twoDigits", "\\s*\\d{2}\\s*" }, {"threeDigits", "\\s*\\d{3}\\s*" }, {"fourDigits", "\\s*\\(*\\d{4}\\)*\\s*" }, {"isNumberRange", "\\d+\\s*([-]{1,2}\\s*\\d+)?"}, {"isDashSeparatedWords", "(\\w[-])+\\w"}, {"isDashSeparatedSeq", "((\\p{Alpha}+|\\p{Digit}+)[-])+(\\p{Alpha}+|\\p{Digit}+)"}, {"isURL", "\\p{Alpha}+://(\\w+\\.)\\w+(:(\\d{2}|\\d{4}))?(/\\w+)*(/|(/\\w+\\.\\w+))?" }, {"isEmailId", "\\w+@(\\w+\\.)+\\w+" }, {"containsDashes", ".*--.*"} }; Pattern p[]; transient protected DataSequence data; protected int index, idbase, curId, window; protected int relSegmentStart, relSegmentEnd; protected int maxMemory; protected int left, right; /** * Constructs an object of ConcatRegexFeatures to be used to generate features for the token * sequence as specified. * You can specify the sequence of tokens on which the pattern has to be applied using relSegmentStart * and relSegmentEnd, which denotes segment boundries. * The maxMemory denotes the maximum segment size, for normal CRF the value of maxMemory is 1. * There are certain default patterns defined in the class. You can specify your own pattern in a file, and pass * the name of the file in this constructor. The file should begin with integer value for number of pattern in the * file. This should be follwoed by one pattern definition on each line. The first word is the name of the pattern * and second word is regular expression for the pattern. * * @param fgen a {@link Model} object * @param relSegmentStart index of the reltive position for left boundary * @param relSegmentEnd index of the reltive position for right boundary * @param maxMemory maximum size of a segment * @param patternFile file which contains the pattern definition */ public ConcatRegexFeatures(FeatureGenImpl fgen, int relSegmentStart, int relSegmentEnd, int maxMemory, String patternFile){ super(fgen); assert(relSegmentEnd >= relSegmentStart); this.relSegmentStart = relSegmentStart; this.relSegmentEnd = relSegmentEnd; this.maxMemory = maxMemory; window = getWindowSize(relSegmentStart, relSegmentEnd); idbase = (int) Math.pow(2, window); getPatterns(patternFile); assert(patternString != null); p = new Pattern[patternString.length]; for(int i = 0; i < patternString.length; i++){ //System.out.println("i"+ i +" " + patternString[i][1]); p[i] = Pattern.compile(patternString[i][1]); } } /** * @param relSegmentStart2 * @param relSegmentEnd2 * @return */ private int getWindowSize(int relSegmentStart, int relSegmentEnd) { if((sign(relSegmentEnd) == sign(relSegmentStart)) && relSegmentStart != 0) return relSegmentEnd - relSegmentStart + 1; else return relSegmentEnd - relSegmentStart + maxMemory; } /** * Constructs an object of ConcatRegexFeatures to be used to generate features for current token. * @param m a {@link Model} object * @param relSegmentStart index of the reltive position for left boundary * @param relSegmentEnd index of the reltive position for right boundary * @param maxMemory maximum size of a segment */ public ConcatRegexFeatures(FeatureGenImpl m, int relSegmentStart, int relSegmentEnd, int maxMemory){ super(m); assert(relSegmentEnd >= relSegmentStart); this.relSegmentStart = relSegmentStart; this.relSegmentEnd = relSegmentEnd; this.maxMemory = maxMemory; window = getWindowSize(relSegmentStart, relSegmentEnd); idbase = (int) Math.pow(2, window); assert(patternString != null); p = new Pattern[patternString.length]; for(int i = 0; i < patternString.length; i++){ //System.out.println("i"+ i +" " + patternString[i][1]); p[i] = Pattern.compile(patternString[i][1]); } } /** * Constructs an object of ConcatRegexFeatures to be used to generate features for current token. * @param m a {@link Model} object * @param relSegmentStart index of the reltive position for left boundary * @param relSegmentEnd index of the reltive position for right boundary */ public ConcatRegexFeatures(FeatureGenImpl m, int relSegmentStart, int relSegmentEnd){ this(m, relSegmentStart, relSegmentEnd, 1); } /** * Constructs an object of ConcatRegexFeatures to be used to generate features for current token. * @param m a {@link Model} object * @param relSegmentStart index of the reltive position for left boundary * @param relSegmentEnd index of the reltive position for right boundary * @param patternFile file which contains the pattern definition */ public ConcatRegexFeatures(FeatureGenImpl m, int relSegmentStart, int relSegmentEnd, String patternFile){ this(m, relSegmentStart, relSegmentEnd, 1, patternFile); } private int sign(int boundary){ if(boundary == 0) return 0; else if(boundary < 0) return -1; else return 1; } /** * Reads patterns to be matched from the file. * The format of the file is as follows: * The first line of the file is number of patterns, followed by a list of patterns one per line. * Each line describes a pattern's name and pattern string itself. * * @param patternFile name of the pattern file */ void getPatterns(String patternFile){ String line; String patterns[][]; try { BufferedReader in = new BufferedReader(new FileReader(patternFile)); int len = Integer.parseInt(in.readLine()); patterns = new String[len][2]; for(int k = 0; k < len; k++){ StringTokenizer strTokenizer = new StringTokenizer(in.readLine()); patterns[k][0] = strTokenizer.nextToken(); patterns[k][1] = strTokenizer.nextToken(); //System.out.println(patterns[k][0] + " " + patterns[k][1]); } }catch(IOException ioe){ System.err.println("Could not read pattern file : " + patternFile); ioe.printStackTrace(); return; } patternString = patterns; return; } /** * Initaites scanning of features in a sequence at specified position. * * @param data a training sequence * @param prevPos the previous label postion * @param pos Current token postion */ public boolean startScanFeaturesAt(DataSequence data, int prevPos, int pos){ assert(patternString != null); this.data = data; index = 0; if (relSegmentStart <= 0) { left = prevPos + 1 + relSegmentStart; } else { left = pos + relSegmentStart; } if (relSegmentEnd < 0) { right = prevPos + 1 + relSegmentEnd; } else { right = pos + relSegmentEnd; } if(!(left >= 0 && left < data.length() && right >= 0 && right < data.length())) index = patternString.length; //System.out.println("DataLength:" + data.length() + " segment(" + (prevPos+1) + "," + pos + ") rs(" +relSegmentStart + "," + relSegmentEnd + ") window(" + left + "," + right + ") idbase:" + idbase); advance(); return true; } /** * Returns true if there are any more feature(s) for the current scan. * */ public boolean hasNext() { return index < patternString.length; } /** * Generates the next feature for the current scan. * * @param f Copies the feature generated to the argument */ public void next(FeatureImpl f) { if(featureCollectMode()){ //This is a feature collection mode, so return id and name f.strId.name = patternString[index][0] + "_" + window + "_" + Integer.toBinaryString(curId); } /*//Return feature on token window int base = 1; f.strId.id = 0; for(int k = left; k <= right; k++){ boolean match = p[index].matcher((String)data.x(k)).matches(); f.strId.id += base * (match? 1:0); base = base * 2; } f.val = (f.strId.id > 0) ? 1:0; //In case of no match return 0 as feature value f.ystart = -1; f.strId.id += idbase * index++;*/ f.val = 1; f.strId.id = curId + idbase * index++; f.ystart = -1; advance(); } private void advance(){ curId = 0; while(curId <= 0 && index < patternString.length){ int base = 1; for(int k = left; k <= right; k++){ boolean match = p[index].matcher((String)data.x(k)).matches(); curId += base * (match? 1:0); base = base * 2; } if(curId > 0) break; index++; } } public int maxFeatureId(){ return idbase * (patternString.length - 1) + (idbase -1); //(maximum base i.e. most significat bits + maximum offset) } int offsetLabelIndependentId(FeatureImpl f) { return f.strId.id; }};
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -