featuresinwindow.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 158 行

JAVA
158
字号
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).   http://www.cs.umass.edu/~mccallum/mallet   This software is provided under the terms of the Common Public License,   version 1.0, as published by http://www.opensource.org.  For further   information, see the file `LICENSE' included with this distribution. *//**	 Create new features from features (matching a regex within a window +/- the current position).	 For example, 	 <br><code>	 FeaturesInWindow p = new FeaturesInWindow("PREV-", -1, 1, Pattern.compile("POS-.*"), true)	 </code> <br>	 will create a pipe that adds a feature to the current position for each	 feature in the previous starting with "POS-".  So if the previous position	 has "POS-NN" we add "PREV-POS-NN".   The last argument to the constructor is	 currently ignored.  The alternative constructor matches all patterns, so: 	 <br><code>	 FeaturesInWindow p = new FeaturesInWindow(s, l, r);	 </code> <br>	 is equivalent to 	 <br><code>	 FeaturesInWindow p = new FeaturesInWindow("PREV-", -1, 1, Pattern.compile(".*"), true);	 </code> <br>	 but more efficient, since we don't actually check using the Pattern.    @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a> */package edu.umass.cs.mallet.base.pipe.tsf;import edu.umass.cs.mallet.base.types.TokenSequence;import edu.umass.cs.mallet.base.util.PropertyList;import edu.umass.cs.mallet.base.pipe.Pipe;import edu.umass.cs.mallet.base.types.Token;import edu.umass.cs.mallet.base.types.Instance;import java.io.*;import java.util.regex.*;public class FeaturesInWindow extends Pipe implements Serializable{	String namePrefix, namePrefixLeft;	int leftBoundary;	int rightBoundary;	Pattern featureRegex;	boolean includeBeginEndBoundaries;	boolean includeCurrentToken = false;	private static final int maxWindowSize = 20;	private static final PropertyList[] startfs = new PropertyList[maxWindowSize];	private static final PropertyList[] endfs = new PropertyList[maxWindowSize];		static {		initStartEndFs ();	}	private static void initStartEndFs ()	{		for (int i = 0; i < maxWindowSize; i++) {			startfs[i] = PropertyList.add ("<START"+i+">", 1.0, null);			endfs[i] = PropertyList.add ("<END"+i+">", 1.0, null);		}	}	/** @param namePrefix what to prepend to feature names		* @param leftBoundaryOffset left boundary of the window (e.g. -1 means		*                           include the previous word		* @param rightBoundaryOffset right boundary for this window (e.g. 1 means		*                           include the current position, but not the next		* @param featureRegex add only for features matching this (null = always match		* @param includeBeginEndBoundaries ignored		*/	public FeaturesInWindow (String namePrefix, int leftBoundaryOffset, int rightBoundaryOffset,													 Pattern featureRegex, boolean includeBeginEndBoundaries)	{		this.namePrefix = namePrefix;		this.leftBoundary = leftBoundaryOffset;		this.rightBoundary = rightBoundaryOffset;		this.featureRegex = featureRegex;		this.includeBeginEndBoundaries = includeBeginEndBoundaries;	}	/** 		equivalent to <br>		<code>		FeaturesInWindow((namePrefix, leftBoundaryOffset, rightBoundaryOffset, null, true);		</code>		*/	public FeaturesInWindow (String namePrefix, int leftBoundaryOffset, int rightBoundaryOffset)	{		this (namePrefix, leftBoundaryOffset, rightBoundaryOffset, null, true);	}		public Instance pipe (Instance carrier)	{		TokenSequence ts = (TokenSequence) carrier.getData();		int tsSize = ts.size();		PropertyList[] newFeatures = new PropertyList[tsSize];		for (int i = 0; i < tsSize; i++) {			Token t = ts.getToken (i);			PropertyList pl = t.getFeatures();			newFeatures[i] = pl;			for (int position = i + leftBoundary; position < i + rightBoundary; position++) {				if (position == i && !includeCurrentToken)					continue;				PropertyList pl2;				if (position < 0)					pl2 = startfs[-position];				else if (position >= tsSize)					pl2 = endfs[position-tsSize];				else					pl2 = ts.getToken(position).getFeatures ();				PropertyList.Iterator pl2i = pl2.iterator();				while (pl2i.hasNext()) {					pl2i.next();					String key = pl2i.getKey();					if (featureRegex == null || featureRegex.matcher(key).matches()) {						newFeatures[i] = PropertyList.add ((namePrefixLeft == null || position-i>0 ? namePrefix : namePrefixLeft)+key,																							 pl2i.getNumericValue(), newFeatures[i]);					}				}			}		}		for (int i = 0; i < tsSize; i++) {			// Put the new PropertyLists in place			ts.getToken (i).setFeatures (newFeatures[i]);		}		return carrier;	}	// Serialization 		private static final long serialVersionUID = 1;	private static final int CURRENT_SERIAL_VERSION = 0;		private void writeObject (ObjectOutputStream out) throws IOException {		out.writeInt (CURRENT_SERIAL_VERSION);		out.writeObject (namePrefix);		out.writeInt (leftBoundary);		out.writeInt (rightBoundary);		out.writeObject (featureRegex);		out.writeBoolean (includeBeginEndBoundaries);	}		private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {		int version = in.readInt ();		namePrefix = (String) in.readObject();		leftBoundary = in.readInt ();		rightBoundary = in.readInt ();		featureRegex = (Pattern) in.readObject();		includeBeginEndBoundaries = in.readBoolean();	}}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?