stringeditfeaturevectorsequence.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 339 行

JAVA
339
字号
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

package edu.umass.cs.mallet.base.types;

import java.io.*;
import java.util.regex.*;
import java.util.HashMap;
import gnu.trove.TObjectIntHashMap;
import java.util.Set;
import java.util.Iterator;

// xxx A not very space-efficient version.  I'll compress it later.
/**
 * A feature-vector sequence that additionally carries the two strings it was
 * built from, each pre-split into delimiter-separated blocks, plus an optional
 * lexicon of known strings.
 *
 * <p>Positions passed to the per-position accessors
 * ({@link #getString1Char(int)}, {@link #getString1BlockAtIndex(int)} and the
 * string-2 equivalents) range over {@code 0 .. length+1}: the reported lengths
 * are the raw string length plus two, and character position {@code i} maps to
 * raw character {@code i - 1}.
 */
public class StringEditFeatureVectorSequence extends FeatureVectorSequence implements Serializable
{
  private static final char defaultDelimiter = ':';

  // Serialization of Instance
  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 0;
  /** Length marker written in place of an array that is {@code null}. */
  private static final int NULL_INTEGER = -1;

  /** Raw string lengths plus two (see class comment). */
  private int string1Length, string2Length;
  private String string1, string2;
  /** Result of splitting each string on the delimiter. */
  private String[] string1Blocks, string2Blocks;
  /** Set-like maps whose keys are the blocks of the respective string. */
  private TObjectIntHashMap string1Present, string2Present;
  /** Set-like map whose keys are the lexicon entries; empty if none were supplied. */
  private TObjectIntHashMap lexicon;
  /** For each position, the index of the block associated with it, or -1. */
  private int[] block1Indices, block2Indices;
  private char delim = ':';

  /** Builds a sequence using the default delimiter (':') and no lexicon. */
  public StringEditFeatureVectorSequence (FeatureVector[] featureVectors, String s1, String s2)
  {
    this (featureVectors, s1, s2, defaultDelimiter);
  }

  /** Builds a sequence using the given delimiter and no lexicon. */
  public StringEditFeatureVectorSequence (FeatureVector[] featureVectors, String s1, String s2, char delimiter)
  {
    this (featureVectors, s1, s2, delimiter, null);
  }

  /** Builds a sequence using the default delimiter (':') and the given lexicon. */
  public StringEditFeatureVectorSequence (FeatureVector[] featureVectors, String s1, String s2, HashMap lexic)
  {
    this (featureVectors, s1, s2, defaultDelimiter, lexic);
  }

  /**
   * Builds a sequence over the given feature vectors and string pair.
   *
   * @param featureVectors passed unchanged to the superclass constructor
   * @param s1 first string; must not be null
   * @param s2 second string; must not be null
   * @param delimiter character separating blocks within each string
   * @param lexic map whose keys (cast to String) form the lexicon; may be null,
   *        in which case the lexicon is empty
   */
  public StringEditFeatureVectorSequence (FeatureVector[] featureVectors, String s1, String s2, char delimiter, HashMap lexic)
  {
    super (featureVectors);
    this.delim = delimiter;

    // Only the keys of the supplied map are used; values are ignored.
    this.lexicon = new TObjectIntHashMap();
    if (lexic != null) {
      Set keys = lexic.keySet();
      Iterator iter = keys.iterator();
      while (iter.hasNext())
        this.lexicon.put((String) iter.next(), 1);
    }

    this.string1 = s1;
    this.string2 = s2;
    this.string1Length = s1.length() + 2;
    this.string2Length = s2.length() + 2;

    // NOTE(review): String.split interprets its argument as a regular
    // expression, so a delimiter that is a regex metacharacter (e.g. '|' or
    // '.') will not split literally, whereas the block-index computation
    // below compares characters literally. Left unchanged here.
    string1Blocks = string1.split("" + delim);
    string2Blocks = string2.split("" + delim);

    string1Present = buildPresenceMap (string1Blocks);
    string2Present = buildPresenceMap (string2Blocks);

    block1Indices = computeBlockIndices (string1, string1Length, string1Blocks.length, delim);
    block2Indices = computeBlockIndices (string2, string2Length, string2Blocks.length, delim);
  }

  /** Returns a map containing every element of {@code blocks} as a key (value 1). */
  private static TObjectIntHashMap buildPresenceMap (String[] blocks)
  {
    TObjectIntHashMap present = new TObjectIntHashMap();
    for (int i = 0; i < blocks.length; i++)
      present.put(blocks[i], 1);
    return present;
  }

  /**
   * Computes the position-to-block-index table for one string.
   *
   * Position 0 maps to block 0; a position {@code i} whose preceding raw
   * character ({@code s.charAt(i-1)}) is the delimiter maps to the next block
   * number; every other position, including the last, maps to -1.  When there
   * are no blocks at all the table is left zero-filled.
   */
  private static int[] computeBlockIndices (String s, int paddedLength, int numBlocks, char delimiter)
  {
    int[] indices = new int[paddedLength];
    if (numBlocks > 0) {
      int whichBlock = 0;
      indices[0] = whichBlock++;
      for (int i = 1; i < paddedLength - 1; i++)
        indices[i] = (s.charAt(i-1) == delimiter) ? whichBlock++ : -1;
      indices[paddedLength-1] = -1;
    }
    return indices;
  }

  /**
   * Shared implementation of {@link #getString1IndexOf} and
   * {@link #getString2IndexOf}: distance from {@code start} to the next
   * occurrence of {@code str}, or to the last character of {@code s} when
   * there is none; -1 if that distance is less than 1.
   */
  private static int relativeIndexOf (String s, String str, int start)
  {
    int found = s.indexOf(str, start);
    int distance = (found == -1) ? (s.length() - 1 - start) : (found - start);
    return (distance < 1) ? -1 : distance;
  }

  public String getString1() {
    return string1;
  }

  public String getString2() {
    return string2;
  }

  /** Returns the length of string 1 plus two. */
  public int getString1Length () {
    return string1Length;
  }

  /** Returns the length of string 2 plus two. */
  public int getString2Length () {
    return string2Length;
  }

  // End of Block
  /** Equivalent to {@code getString1EOBIndex(delimiter, 0)}. */
  public int getString1EOBIndex(String delimiter) {
    return getString1EOBIndex(delimiter, 0);
  }

  /** Equivalent to {@code getString1IndexOf(delimiter, start)}. */
  public int getString1EOBIndex(String delimiter, int start) {
    return getString1IndexOf(delimiter, start);
  }

  /**
   * Returns the block of string 1 associated with position {@code idx}, or
   * null if the position is out of range or has no block associated with it.
   */
  public String getString1BlockAtIndex(int idx) {
    if (idx < 0 || idx >= block1Indices.length)
      return null;
    int block = block1Indices[idx];
    if (block < 0 || block >= string1Blocks.length)
      return null;
    return string1Blocks[block];
  }

  /**
   * Returns the distance from {@code start} to the next occurrence of
   * {@code str} in string 1 (or to the last character of string 1 if there is
   * no further occurrence), or -1 if that distance is less than 1.
   */
  public int getString1IndexOf(String str, int start) {
    return relativeIndexOf (string1, str, start);
  }

  /**
   * Returns true if the whole of string 1 matches the regular expression
   * {@code patternStr} (uses {@link Matcher#matches()}, not a substring search).
   */
  public boolean isPresent1(String patternStr) {
    return Pattern.compile(patternStr).matcher(string1).matches();
  }

  /** Returns true if {@code str} is one of the blocks of string 1. */
  public boolean isPresentInString1(String str) {
    return string1Present.containsKey(str);
  }

  /**
   * Returns the character of string 1 at position {@code index}, i.e. raw
   * character {@code index - 1}, or {@code (char) 0} if that is out of range.
   */
  public char getString1Char(int index) {
    int raw = index - 1;
    if (raw < 0 || raw >= string1.length())
      return (char) 0;
    return string1.charAt(raw);
  }

  /** Equivalent to {@code getString2EOBIndex(delimiter, 0)}. */
  public int getString2EOBIndex(String delimiter) {
    return getString2EOBIndex(delimiter, 0);
  }

  /** Equivalent to {@code getString2IndexOf(delimiter, start)}. */
  public int getString2EOBIndex(String delimiter, int start) {
    return getString2IndexOf(delimiter, start);
  }

  /**
   * Returns the block of string 2 associated with position {@code idx}, or
   * null if the position is out of range or has no block associated with it.
   */
  public String getString2BlockAtIndex(int idx) {
    if (idx < 0 || idx >= block2Indices.length)
      return null;
    int block = block2Indices[idx];
    if (block < 0 || block >= string2Blocks.length)
      return null;
    return string2Blocks[block];
  }

  /** Returns true if {@code str} is one of the blocks of string 2. */
  public boolean isPresentInString2(String str) {
    return string2Present.containsKey(str);
  }

  /**
   * Returns the distance from {@code start} to the next occurrence of
   * {@code str} in string 2 (or to the last character of string 2 if there is
   * no further occurrence), or -1 if that distance is less than 1.
   */
  public int getString2IndexOf(String str, int start) {
    return relativeIndexOf (string2, str, start);
  }

  /**
   * Returns true if the whole of string 2 matches the regular expression
   * {@code patternStr} (uses {@link Matcher#matches()}, not a substring search).
   */
  public boolean isPresent2(String patternStr) {
    return Pattern.compile(patternStr).matcher(string2).matches();
  }

  /**
   * Returns the character of string 2 at position {@code index}, i.e. raw
   * character {@code index - 1}, or {@code (char) 0} if that is out of range.
   */
  public char getString2Char(int index) {
    int raw = index - 1;
    if (raw < 0 || raw >= string2.length())
      return (char) 0;
    return string2.charAt(raw);
  }

  /** Returns true if {@code str} is non-null and is a key of the lexicon. */
  public boolean isInLexicon(String str) {
    if (lexicon == null || str == null)
      return false;
    return lexicon.containsKey(str);
  }

  public String toString ()
  {
    StringBuffer sb = new StringBuffer ();
    sb.append (super.toString());
    sb.append ('\n');
    sb.append ("String 1: " + string1Length + " String 2: " + string2Length);
    return sb.toString();
  }

  /** Writes the array length (or NULL_INTEGER for null) followed by each element. */
  private static void writeStringArray (ObjectOutputStream out, String[] values) throws IOException
  {
    if (values == null) {
      out.writeInt(NULL_INTEGER);
      return;
    }
    out.writeInt(values.length);
    for (int i = 0; i < values.length; i++)
      out.writeObject(values[i]);
  }

  /** Writes the array length (or NULL_INTEGER for null) followed by each element. */
  private static void writeIntArray (ObjectOutputStream out, int[] values) throws IOException
  {
    if (values == null) {
      out.writeInt(NULL_INTEGER);
      return;
    }
    out.writeInt(values.length);
    for (int i = 0; i < values.length; i++)
      out.writeInt(values[i]);
  }

  /** Reads an array written by {@link #writeStringArray}; returns null for NULL_INTEGER. */
  private static String[] readStringArray (ObjectInputStream in) throws IOException, ClassNotFoundException
  {
    int size = in.readInt();
    if (size == NULL_INTEGER)
      return null;
    String[] values = new String[size];
    for (int i = 0; i < size; i++)
      values[i] = (String) in.readObject();
    return values;
  }

  /** Reads an array written by {@link #writeIntArray}; returns null for NULL_INTEGER. */
  private static int[] readIntArray (ObjectInputStream in) throws IOException
  {
    int size = in.readInt();
    if (size == NULL_INTEGER)
      return null;
    int[] values = new int[size];
    for (int i = 0; i < size; i++)
      values[i] = in.readInt();
    return values;
  }

  /**
   * Writes this class's fields in a fixed order: version, both lengths, both
   * strings, both block arrays, both presence maps, the lexicon, both block
   * index arrays, and the delimiter.  {@link #readObject} must read them back
   * in exactly this order.
   */
  private void writeObject (ObjectOutputStream out) throws IOException {
    out.writeInt (CURRENT_SERIAL_VERSION);
    out.writeInt (string1Length);
    out.writeInt (string2Length);
    out.writeObject (string1);
    out.writeObject (string2);
    writeStringArray (out, string1Blocks);
    writeStringArray (out, string2Blocks);
    out.writeObject (string1Present);
    out.writeObject (string2Present);
    out.writeObject (lexicon);
    writeIntArray (out, block1Indices);
    writeIntArray (out, block2Indices);
    out.writeChar (delim);
  }

  /**
   * Restores the fields written by {@link #writeObject}.
   *
   * Every value is assigned to the corresponding instance field.  (Earlier
   * code read several of them into same-named local variables, which left the
   * fields at their defaults after deserialization.)
   */
  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    // The version is read to stay aligned with the stream; it is not otherwise used.
    int version = in.readInt ();
    this.string1Length = in.readInt();
    this.string2Length = in.readInt();
    this.string1 = (String) in.readObject();
    this.string2 = (String) in.readObject();
    this.string1Blocks = readStringArray (in);
    this.string2Blocks = readStringArray (in);
    this.string1Present = (TObjectIntHashMap) in.readObject();
    this.string2Present = (TObjectIntHashMap) in.readObject();
    this.lexicon = (TObjectIntHashMap) in.readObject();
    this.block1Indices = readIntArray (in);
    this.block2Indices = readIntArray (in);
    this.delim = in.readChar();
  }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?