📄 conllner2003sentence2tokensequence.java

📁 常用机器学习算法,java编写源代码,内含常用分类算法,包括说明文档
💻 JAVA
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).   http://www.cs.umass.edu/~mccallum/mallet   This software is provided under the terms of the Common Public License,   version 1.0, as published by http://www.opensource.org.  For further   information, see the file `LICENSE' included with this distribution. *//**    @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a> *//*	An error?  CoNLLTrue MalletTrue MalletPred	O O O	I-MISC B-MISC B-MISC	B-MISC B-MISC I-MISC	I-MISC B-MISC I-MISC	O O O	O O O	O O O*/package edu.umass.cs.mallet.share.mccallum.ner;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.types.*;import java.util.regex.*;public class ConllNer2003Sentence2TokenSequence extends Pipe{	static final String[] endings = new String[]	{"ing", "ed", "ogy", "s", "ly", "ion", "tion", "ity", "ies"};	static Pattern[] endingPatterns = new Pattern[endings.length];	// Indexed by {forward,backward} {0,1,2 offset} {ending char ngram index}	static final String[][][] endingNames = new String[2][3][endings.length];	{		for (int i = 0; i < endings.length; i++) {			endingPatterns[i] = Pattern.compile (".*"+endings[i]+"$");			for (int j = 0; j < 3; j++) {				for (int k = 0; k < 2; k++)					endingNames[k][j][i] = "W"+(k==1?"-":"")+j+"=<END"+endings[i]+">";			}		}	}	boolean saveSource = false;	boolean doConjunctions = false;	boolean doTags = true;	boolean doPhrases = true;	boolean doSpelling = false;	boolean doDigitCollapses = true;	boolean doDowncasing = false;		public ConllNer2003Sentence2TokenSequence ()	{		super (null, LabelAlphabet.class);	}	public ConllNer2003Sentence2TokenSequence (boolean extraFeatures)	{		super (null, LabelAlphabet.class);		if (!extraFeatures) {			doDigitCollapses = doConjunctions = doSpelling = doPhrases = doTags = false;			doDowncasing = true;		}	}		/* Lines look like this:		 -DOCSTART- -X- -X- O		 EU NNP I-NP I-ORG		 rejects VBZ I-VP O		 German JJ I-NP I-MISC		 call NN I-NP O		 to TO I-VP O		 boycott VB I-VP O		 British JJ I-NP I-MISC		 lamb NN I-NP O		 . . O O		 Peter NNP I-NP I-PER		 Blackburn NNP I-NP I-PER		 BRUSSELS NNP I-NP I-LOC		 1996-08-22 CD I-NP O		 The DT I-NP O		 European NNP I-NP I-ORG		 Commission NNP I-NP I-ORG		 said VBD I-VP O		 on IN I-PP O		 ...	*/	public Instance pipe (Instance carrier)	{		String sentenceLines = (String) carrier.getData();		String[] tokens = sentenceLines.split ("\n");		TokenSequence data = new TokenSequence (tokens.length);		LabelSequence target = new LabelSequence ((LabelAlphabet)getTargetAlphabet(), tokens.length);		boolean [][] ending = new boolean[3][endings.length];		boolean [][] endingp1 = new boolean[3][endings.length];		boolean [][] endingp2 = new boolean[3][endings.length];		StringBuffer source = saveSource ? new StringBuffer() : null;		String prevLabel = "NOLABEL";		Pattern ipattern = Pattern.compile ("I-.*");		String word, tag, phrase, label;		for (int i = 0; i < tokens.length; i++) {			if (tokens[i].length() != 0) {				String[] features = tokens[i].split (" ");				if (features.length != 4)					throw new IllegalStateException ("Line \""+tokens[i]+"\" doesn't have four elements");				word = features[0]; // .toLowerCase();				tag = features[1];				phrase = features[2];				label = features[3];			} else {				word = "-<S>-";				tag = "-<S>-";				phrase = "-<S>-";				label = "O";			}			// Transformations			if (doDigitCollapses) {				if (word.matches ("19\\d\\d"))					word = "<YEAR>";				else if (word.matches ("19\\d\\ds"))					word = "<YEARDECADE>";				else if (word.matches ("19\\d\\d-\\d+"))					word = "<YEARSPAN>";				else if (word.matches ("\\d+\\\\/\\d"))					word = "<FRACTION>";				else if (word.matches ("\\d[\\d,\\.]*"))					word = "<DIGITS>";				else if (word.matches ("19\\d\\d-\\d\\d-\\d--d"))					word = "<DATELINEDATE>";				else if (word.matches ("19\\d\\d-\\d\\d-\\d\\d"))					word = "<DATELINEDATE>";				else if (word.matches (".*-led"))					word = "<LED>";				else if (word.matches (".*-sponsored"))					word = "<LED>";			}			if (doDowncasing)				word = word.toLowerCase();			Token token = new Token (word);						// Word and tag unigram at current time			if (doSpelling) {				for (int j = 0; j < endings.length; j++) {					ending[2][j] = ending[1][j];					ending[1][j] = ending[0][j];					ending[0][j] = endingPatterns[j].matcher(word).matches();					if (ending[0][j]) token.setFeatureValue (endingNames[0][0][j], 1);				}			}			if (doTags) {				token.setFeatureValue ("T="+tag, 1);			}			if (doPhrases) {				token.setFeatureValue ("P="+phrase, 1);			}			if (true) {				// Change so each segment always begins with a "B-",				// even if previous token did not have this label.				String oldLabel = label;				if (ipattern.matcher(label).matches ()						&& (prevLabel.length() < 3		// prevLabel is "O"								|| !prevLabel.substring(2).equals (label.substring(2)))) {					label = "B" + oldLabel.substring(1);				}				prevLabel = oldLabel;			}			// Append			data.add (token);			//target.add (bigramLabel);			target.add (label);			//System.out.print (label + ' ');			if (saveSource) {				source.append (word); source.append (" ");				//source.append (bigramLabel); source.append ("\n");				source.append (label); source.append ("\n");			}		}		//System.out.println ("");		carrier.setData(data);		carrier.setTarget(target);		if (saveSource)			carrier.setSource(source);		return carrier;	}}
💿 文件大小 5351 K
👤 上传用户 lihuitao1987
📂 所属分类数学计算
🏷️ 相关标签

#java #机器学习 #分类算法 #文档
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -