alphabet.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 285 行

JAVA
285
字号
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).   http://www.cs.umass.edu/~mccallum/mallet   This software is provided under the terms of the Common Public License,   version 1.0, as published by http://www.opensource.org.  For further   information, see the file `LICENSE' included with this distribution. *//**   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>*/package edu.umass.cs.mallet.base.types;import java.util.ArrayList;import java.io.*;import java.util.Iterator;import java.util.HashMap;import java.rmi.dgc.VMID;/** *  A mapping between integers and objects where the mapping in each * direction is efficient.  Integers are assigned consecutively, starting * at zero, as objects are added to the Alphabet.  Objects can not be * deleted from the Alphabet and thus the integers are never reused. * <p> * The most common use of an alphabet is as a dictionary of feature names * associated with a {@link edu.umass.cs.mallet.base.types.FeatureVector} in an * {@link edu.umass.cs.mallet.base.types.Instance}. In a simple document * classification usage, * each unique word in a document would be a unique entry in the Alphabet * with a unique integer associated with it.   FeatureVectors rely on * the integer part of the mapping to efficiently represent the subset of * the Alphabet present in the FeatureVector. * @see FeatureVector * @see Instance * @see edu.umass.cs.mallet.base.pipe.Pipe */public class Alphabet implements Serializable{    gnu.trove.TObjectIntHashMap map;    ArrayList entries;    boolean growthStopped = false;    Class entryClass = null;    VMID instanceId = new VMID();  //used in readResolve to identify persitent instances    public Alphabet (int capacity, Class entryClass)    {	this.map = new gnu.trove.TObjectIntHashMap (capacity);	this.entries = new ArrayList (capacity);	this.entryClass = entryClass;    }    public Alphabet (Class entryClass)    {	this (8, entryClass);    }    public Alphabet (int capacity)    {	this (capacity, null);    }    public Alphabet ()    {	this (8, null);    }    public Object clone ()    {	//try {	// Wastes effort, because we over-write ivars we create	Alphabet ret = new Alphabet ();	ret.map = (gnu.trove.TObjectIntHashMap) map.clone();	ret.entries = (ArrayList) entries.clone();	ret.growthStopped = growthStopped;	ret.entryClass = entryClass;	return ret;	//} catch (CloneNotSupportedException e) {	//e.printStackTrace();	//throw new IllegalStateException ("Couldn't clone InstanceList Vocabuary");	//}    }    /** Return -1 if entry isn't present. */    public int lookupIndex (Object entry, boolean addIfNotPresent)    {	if (entry == null)	    throw new IllegalArgumentException ("Can't lookup \"null\" in an Alphabet.");	if (entryClass == null)	    entryClass = entry.getClass();	else	    // Insist that all entries in the Alphabet are of the same	    // class.  This may not be strictly necessary, but will catch a	    // bunch of easily-made errors.	    if (entry.getClass() != entryClass)		throw new IllegalArgumentException ("Non-matching entry class, "+entry.getClass()+", was "+entryClass);	int retIndex = -1;	if (map.containsKey( entry )) {	    retIndex = map.get( entry );	}	else if (!growthStopped && addIfNotPresent) {	    retIndex = entries.size();	    map.put (entry, retIndex);	    entries.add (entry);	}	return retIndex;    }    public int lookupIndex (Object entry)    {	return lookupIndex (entry, true);    }    public Object lookupObject (int index)    {	return entries.get(index);    }    public Object[] toArray () {	return entries.toArray();    }    /**     * Returns an array containing all the entries in the Alphabet.     *  The runtime type of the returned array is the runtime type of in.     *  If in is large enough to hold everything in the alphabet, then it     *  it used.  The returned array is such that for all entries <tt>obj</tt>,     *  <tt>ret[lookupIndex(obj)] = obj</tt> .     */     public Object[] toArray (Object[] in) {	return entries.toArray (in);    }    // xxx This should disable the iterator's remove method...    public Iterator iterator () {	return entries.iterator();    }    public Object[] lookupObjects (int[] indices)    {	Object[] ret = new Object[indices.length];	for (int i = 0; i < indices.length; i++)	    ret[i] = entries.get(indices[i]);	return ret;    }    /**     * Returns an array of the objects corresponding to     * @param indices An array of indices to look up     * @param buf An array to store the returned objects in.     * @return An array of values from this Alphabet.  The runtime type of the array is the same as buf     */    public Object[] lookupObjects (int[] indices, Object[] buf)    {	for (int i = 0; i < indices.length; i++)	    buf[i] = entries.get(indices[i]);	return buf;    }    public int[] lookupIndices (Object[] objects, boolean addIfNotPresent)    {	int[] ret = new int[objects.length];	for (int i = 0; i < objects.length; i++)	    ret[i] = lookupIndex (objects[i], addIfNotPresent);	return ret;    }    public boolean contains (Object entry)    {	return map.contains (entry);    }    public int size ()    {	return entries.size();    }    public void stopGrowth ()    {	growthStopped = true;    }	public void startGrowth ()	{		growthStopped = false;	}	public boolean growthStopped ()	{		return growthStopped;	}    public Class entryClass ()    {	return entryClass;    }    /** Return String representation of all Alphabet entries, each	separated by a newline. */    public String toString()    {	StringBuffer sb = new StringBuffer();	for (int i = 0; i < entries.size(); i++) {	    sb.append (entries.get(i).toString());	    sb.append ('\n');	}	return sb.toString();    }    public void dump () { dump (System.out); }    public void dump (PrintStream out)    {      dump (new PrintWriter (new OutputStreamWriter (out), true));    }  public void dump (PrintWriter out)  {	for (int i = 0; i < entries.size(); i++) {	    out.println (i+" => "+entries.get (i));	}    }    public VMID getInstanceId() { return instanceId;} // for debugging    public void setInstanceId(VMID id) { this.instanceId = id; }    // Serialization    private static final long serialVersionUID = 1;    private static final int CURRENT_SERIAL_VERSION = 1;    private void writeObject (ObjectOutputStream out) throws IOException {	out.writeInt (CURRENT_SERIAL_VERSION);	out.writeInt (entries.size());	for (int i = 0; i < entries.size(); i++)	    out.writeObject (entries.get(i));	out.writeBoolean (growthStopped);	out.writeObject (entryClass);        out.writeObject(instanceId);    }    private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {	int version = in.readInt ();	int size = in.readInt();	entries = new ArrayList (size);	map = new gnu.trove.TObjectIntHashMap (size);	for (int i = 0; i < size; i++) {	    Object o = in.readObject();	    map.put (o, i);	    entries. add (o);	}	growthStopped = in.readBoolean();	entryClass = (Class) in.readObject();        if (version >0 ){ // instanced id added in version 1S            instanceId = (VMID) in.readObject();        }    }    private transient static HashMap deserializedEntries = new HashMap();    /**     * This gets called after readObject; it lets the object decide whether     * to return itself or return a previously read in version.     * We use a hashMap of instanceIds to determine if we have already read     * in this object.     * @return     * @throws ObjectStreamException     */    public Object readResolve() throws ObjectStreamException {	Object previous = deserializedEntries.get(instanceId);	if (previous != null){	    //System.out.println(" ***Alphabet ReadResolve:Resolving to previous instance. instance id= " + instanceId);	    return previous;	}	if (instanceId != null){	    deserializedEntries.put(instanceId, this);	}	//System.out.println(" *** Alphabet ReadResolve: new instance. instance id= " + instanceId);	return this;    }}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?