📄 simpletaggersentence2tokensequence.java
字号:
/* Copyright (C) 2003 University of Pennsylvania. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).http://www.cs.umass.edu/~mccallum/malletThis software is provided under the terms of the Common Public License,version 1.0, as published by http://www.opensource.org. For furtherinformation, see the file `LICENSE' included with this distribution. *//** @author Fernando Pereira <a href="mailto:pereira@cis.upenn.edu">pereira@cis.upenn.edu</a> Modified by Kuzman Ganchev to covert to TokenSequence rather than to FeatureVectorSequence. */package edu.umass.cs.mallet.base.pipe;import edu.umass.cs.mallet.base.types.*;import java.io.IOException;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;/** * Converts an external encoding of a sequence of elements with binary * features to a {@link TokenSequence}. If target processing * is on (training or labeled test data), it extracts element labels * from the external encoding to create a target {@link LabelSequence}. * Two external encodings are supported: * <ol> * <li> A {@link String} containing lines of whitespace-separated tokens.</li> * <li> a {@link String}<code>[][]</code>.</li> * </ol> * <p/> * Both represent rows of tokens. When target processing is on, the last token * in each row is the label of the sequence element represented by * this row. All other tokens in the row, or all tokens in the row if * not target processing, are the names of features that are on for * the sequence element described by the row. */public class SimpleTaggerSentence2TokenSequence extends Pipe { private boolean setTokensAsFeatures; /** * Creates a new * <code>SimpleTaggerSentence2TokenSequence</code> instance. * By default we include tokens as features. */ public SimpleTaggerSentence2TokenSequence () { super (Alphabet.class, LabelAlphabet.class); setTokensAsFeatures = true; } /** * creates a new <code>SimpleTaggerSentence2TokenSequence</code> instance * which includes tokens as features iff the supplied argument is true. */ public SimpleTaggerSentence2TokenSequence (boolean inc) { super (Alphabet.class, LabelAlphabet.class); setTokensAsFeatures = inc; } /** * Parses a string representing a sequence of rows of tokens into an * array of arrays of tokens. * * @param sentence a <code>String</code> * @return the corresponding array of arrays of tokens. */ private String[][] parseSentence (String sentence) { String[] lines = sentence.split ("\n"); String[][] tokens = new String[lines.length][]; for (int i = 0; i < lines.length; i++) tokens[i] = lines[i].split ("\\s"); return tokens; } /** returns the first String in the array or "" if the array has length 0. */ private String makeText(String[] in){ if (in.length>0) return in[0]; else return ""; } /** * Takes an instance with data of type String or String[][] and creates * an Instance of type TokenSequence. Each Token in the sequence is * gets the test of the line preceding it and once feature of value 1 * for each "Feature" in the line. For example, if the String[][] is * {{a,b},{c,d,e}} (and target processing is off) then the text would be * "a b" for the first token and "c d e" for the second. Also, the * features "a" and "b" would be set for the first token and "c", "d" and * "e" for the second. The last element in the String[] for the current * token is taken as the target (label), so in the previous example "b" * would have been the label of the first sequence. */ public Instance pipe (Instance carrier) { Object inputData = carrier.getData (); Alphabet features = getDataAlphabet (); LabelAlphabet labels; LabelSequence target = null; String [][] tokens; TokenSequence ts = new TokenSequence (); if (inputData instanceof String) tokens = parseSentence ((String) inputData); else if (inputData instanceof String[][]) tokens = (String[][]) inputData; else throw new IllegalArgumentException ("Not a String or String[][]; got " + inputData); FeatureVector[] fvs = new FeatureVector[tokens.length]; if (isTargetProcessing ()) { labels = (LabelAlphabet) getTargetAlphabet (); target = new LabelSequence (labels, tokens.length); } for (int l = 0; l < tokens.length; l++) { int nFeatures; if (isTargetProcessing ()) { if (tokens[l].length < 1) throw new IllegalStateException ("Missing label at line " + l + " instance " + carrier.getName ()); nFeatures = tokens[l].length - 1; target.add(tokens[l][nFeatures]); } else nFeatures = tokens[l].length; Token tok = new Token(makeText(tokens[l])); if (setTokensAsFeatures){ for (int f = 0; f < nFeatures; f++) tok.setFeatureValue(tokens[l][f], 1.0); } else { for (int f = 1; f < nFeatures; f++) tok.setFeatureValue(tokens[l][f], 1.0); } ts.add (tok); } carrier.setData (ts); if (isTargetProcessing ()) carrier.setTarget (target); return carrier; } // Serialization garbage private static final long serialVersionUID = 1; private static final int CURRENT_SERIAL_VERSION = 1; private void writeObject (ObjectOutputStream out) throws IOException { out.defaultWriteObject (); out.writeInt (CURRENT_SERIAL_VERSION); } private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject (); int version = in.readInt (); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -