conllner2003sentence2tokensequence.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 220 行

JAVA

220 行

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

/*
	An error?  CoNLLTrue MalletTrue MalletPred
	O O O
	I-MISC B-MISC B-MISC
	B-MISC B-MISC I-MISC
	I-MISC B-MISC I-MISC
	O O O
	O O O
	O O O
*/

package edu.umass.cs.mallet.share.casutton.ner; // Generated package name

import edu.umass.cs.mallet.base.pipe.*;
import edu.umass.cs.mallet.base.types.*;
import edu.umass.cs.mallet.base.extract.StringTokenization;
import edu.umass.cs.mallet.base.extract.StringSpan;
import java.util.regex.*;

/**
 * Reads a data file in CoNLL 2003 format, and makes some simple
 *  transformations.
 *
 * Unlike the version in <tt>mccallum.ner</tt>, does not expect fields in
 *  the data file for tags and phrases if those features are off.  Does
 *  not look for target field if isTargetProcessing() is false.
 *
 * Each input instance carries one sentence as a String, one token per
 *  line, with space-separated fields in the order: word, tag (only if
 *  tags are enabled), phrase (only if phrases are enabled), label (only
 *  if target processing is on).  An empty line is turned into a
 *  synthetic "-&lt;S&gt;-" token labelled "O".
 */
public class ConllNer2003Sentence2TokenSequence extends Pipe
{
	static final String[] endings = new String[]
	{"ing", "ed", "ogy", "s", "ly", "ion", "tion", "ity", "ies"};
	static Pattern[] endingPatterns = new Pattern[endings.length];
	// Indexed by {forward,backward} {0,1,2 offset} {ending char ngram index}
	static final String[][][] endingNames = new String[2][3][endings.length];

	// Filled once at class-load time.  These tables are static, so a static
	// initializer keeps them populated no matter how an instance is obtained
	// and avoids rebuilding them on every construction.
	static {
		for (int i = 0; i < endings.length; i++) {
			endingPatterns[i] = Pattern.compile (".*"+endings[i]+"$");
			for (int j = 0; j < 3; j++) {
				for (int k = 0; k < 2; k++)
					endingNames[k][j][i] = "W"+(k==1?"-":"")+j+"=<END"+endings[i]+">";
			}
		}
	}

	// Patterns for the digit/word collapses, compiled once instead of once
	// per token.  All are applied as whole-string matches.
	private static final Pattern YEAR = Pattern.compile ("19\\d\\d");
	private static final Pattern YEAR_DECADE = Pattern.compile ("19\\d\\ds");
	private static final Pattern YEAR_SPAN = Pattern.compile ("19\\d\\d-\\d+");
	// Digits, a literal backslash, a slash, one digit (e.g. "1\/2").
	private static final Pattern FRACTION = Pattern.compile ("\\d+\\\\/\\d");
	private static final Pattern DIGITS = Pattern.compile ("\\d[\\d,\\.]*");
	// NOTE(review): the trailing "--d" matches two literal dashes and the
	// letter 'd'; it looks like a typo for "\\d\\d".  Kept byte-for-byte;
	// DATELINE_DATE below covers the well-formed case with the same result.
	private static final Pattern DATELINE_DATE_ODD = Pattern.compile ("19\\d\\d-\\d\\d-\\d--d");
	private static final Pattern DATELINE_DATE = Pattern.compile ("19\\d\\d-\\d\\d-\\d\\d");
	private static final Pattern LED = Pattern.compile (".*-led");
	private static final Pattern SPONSORED = Pattern.compile (".*-sponsored");
	// Labels that continue (or, in raw CoNLL, implicitly start) a segment.
	private static final Pattern INSIDE_LABEL = Pattern.compile ("I-.*");

	boolean saveSource = true;
	boolean doConjunctions = false;
	boolean doTags = true;
	boolean doPhrases = true;
	boolean doSpelling = false;
	boolean doDigitCollapses = true;
	boolean doDowncasing = false;

	/** Creates a pipe that reads both the tag and the phrase field. */
	public ConllNer2003Sentence2TokenSequence ()
	{
		super (null, LabelAlphabet.class);
	}

	/**
	 * Creates a pipe that reads only the selected optional fields.
	 *
	 * @param useTags whether each line carries a tag field after the word
	 * @param usePhrases whether each line carries a phrase field
	 */
	public ConllNer2003Sentence2TokenSequence (boolean useTags, boolean usePhrases)
	{
		super (null, LabelAlphabet.class);
		this.doTags = useTags;
		this.doPhrases = usePhrases;
	}

	/* Lines look like this:

		 -DOCSTART- -X- -X- O

		 EU NNP I-NP I-ORG
		 rejects VBZ I-VP O
		 German JJ I-NP I-MISC
		 call NN I-NP O
		 to TO I-VP O
		 boycott VB I-VP O
		 British JJ I-NP I-MISC
		 lamb NN I-NP O
		 . . O O

		 Peter NNP I-NP I-PER
		 Blackburn NNP I-NP I-PER

		 BRUSSELS NNP I-NP I-LOC
		 1996-08-22 CD I-NP O

		 The DT I-NP O
		 European NNP I-NP I-ORG
		 Commission NNP I-NP I-ORG
		 said VBD I-VP O
		 on IN I-PP O
		 ...
	*/

	/**
	 * Converts one sentence, given as newline-separated token lines in the
	 * carrier's data, into a token sequence; when target processing is on,
	 * also builds the matching label sequence.
	 *
	 * The (possibly collapsed) words are concatenated, space-separated, into
	 * a text buffer that backs the token spans.  The buffer is always built;
	 * <tt>saveSource</tt> only decides whether it is also stored as the
	 * carrier's source.
	 *
	 * An "I-X" label is rewritten to "B-X" whenever the previous raw label is
	 * shorter than three characters or names a different type, so that every
	 * segment starts with a "B-" label.
	 *
	 * @param carrier instance whose data is the sentence String
	 * @return the same instance, with data (and target/source where
	 *         applicable) replaced
	 * @throws IllegalArgumentException if a non-empty line has fewer fields
	 *         than the enabled options require
	 */
	public Instance pipe (Instance carrier)
	{
		String sentenceLines = (String) carrier.getData();
		String[] tokens = sentenceLines.split ("\n");
		LabelSequence target = new LabelSequence ((LabelAlphabet)getTargetAlphabet(), tokens.length);
		// Always allocate the buffer: the token spans below index into it, so a
		// null buffer (the old behaviour when saveSource was off) cannot work.
		StringBuffer source = new StringBuffer ();
		TokenSequence data = new StringTokenization (source);

		String prevLabel = "NOLABEL";
		String word, tag = null, phrase = null, label = null;

		for (int i = 0; i < tokens.length; i++) {
			if (tokens[i].length() != 0) {
				String[] features = tokens[i].split (" ");
				if (features.length < expectedFieldCount ()) {
					throw new IllegalArgumentException ("Invalid line "+tokens[i]+" : expected word "
						+ (doTags ? ", tag" : "")
						+ (doPhrases ? ", phrase" : "")
						+ (isTargetProcessing () ? ", target" : "")
						+ ".");
				}
				int fieldIdx = 0;
				word = features[fieldIdx++]; // .toLowerCase();
				if (doTags) tag = features[fieldIdx++];
				if (doPhrases) phrase = features[fieldIdx++];
				if (isTargetProcessing ()) label = features[fieldIdx++];
			} else {
				// Empty line: emit a synthetic separator token.
				word = "-<S>-";
				tag = "-<S>-";
				phrase = "-<S>-";
				label = "O";
			}

			// Transformations
			if (doDigitCollapses)
				word = collapseDigits (word);
			if (doDowncasing)
				word = word.toLowerCase();

			// The span starts before the blank-line marker, so a separator
			// token covers the two newlines as well as its text.
			int start = source.length ();
			if (word.equals ("-<S>-")) source.append ("\n\n");
			source.append (word);
			source.append (" ");
			// End index excludes the trailing space just appended.
			Token token = new StringSpan (source, start, source.length () - 1);

			// Word and tag unigram at current time
			if (doSpelling) {
				for (int j = 0; j < endings.length; j++) {
					if (endingPatterns[j].matcher(word).matches())
						token.setFeatureValue (endingNames[0][0][j], 1);
				}
			}
			if (doTags) {
				token.setFeatureValue ("T="+tag, 1);
			}
			if (doPhrases) {
				token.setFeatureValue ("P="+phrase, 1);
			}
			data.add (token);

			if (isTargetProcessing ()) {
				// Change so each segment always begins with a "B-",
				// even if previous token did not have this label.
				String oldLabel = label;
				if (INSIDE_LABEL.matcher(label).matches ()
						&& (prevLabel.length() < 3		// prevLabel is "O"
								|| !prevLabel.substring(2).equals (label.substring(2)))) {
					label = "B" + oldLabel.substring(1);
				}
				// Compare against the raw label, not the rewritten one.
				prevLabel = oldLabel;
				target.add (label);
			}
		}

		carrier.setData(data);
		if (isTargetProcessing ()) carrier.setTarget(target);
		if (saveSource) carrier.setSource(source);
		return carrier;
	}

	/** Number of space-separated fields a non-empty line must have under the current options. */
	private int expectedFieldCount ()
	{
		int count = 1;	// the word itself
		if (doTags) count++;
		if (doPhrases) count++;
		if (isTargetProcessing ()) count++;
		return count;
	}

	/**
	 * Replaces a word by a placeholder when it matches one of the known
	 * year/number/date/suffix shapes; the first matching rule wins.
	 * Returns the word unchanged when no rule matches.
	 */
	private static String collapseDigits (String word)
	{
		if (YEAR.matcher(word).matches())
			return "<YEAR>";
		if (YEAR_DECADE.matcher(word).matches())
			return "<YEARDECADE>";
		if (YEAR_SPAN.matcher(word).matches())
			return "<YEARSPAN>";
		if (FRACTION.matcher(word).matches())
			return "<FRACTION>";
		if (DIGITS.matcher(word).matches())
			return "<DIGITS>";
		if (DATELINE_DATE_ODD.matcher(word).matches())
			return "<DATELINEDATE>";
		if (DATELINE_DATE.matcher(word).matches())
			return "<DATELINEDATE>";
		if (LED.matcher(word).matches())
			return "<LED>";
		// NOTE(review): "-sponsored" words share the "<LED>" placeholder in the
		// original code; unclear whether that is deliberate.  Kept as-is.
		if (SPONSORED.matcher(word).matches())
			return "<LED>";
		return word;
	}
}

conllner2003sentence2tokensequence.java - 源码说明

本页面展示了「mallet是自然语言处理、机器学习领域的一个开源项目。」中的 conllner2003sentence2tokensequence.java 源码文件，采用 Java 编程语言编写，共 220 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与mallet相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?