pagedinstancelist.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 679 行 · 第 1/2 页

JAVA
679
字号
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).   http://www.cs.umass.edu/~mccallum/mallet   This software is provided under the terms of the Common Public License,   version 1.0, as published by http://www.opensource.org.  For further   information, see the file `LICENSE' included with this distribution. */package edu.umass.cs.mallet.base.types;import java.util.BitSet;import java.util.List;import java.util.ArrayList;import java.util.Collections;import java.util.Iterator;import java.rmi.dgc.VMID;import edu.umass.cs.mallet.base.types.Labeling;import edu.umass.cs.mallet.base.pipe.Pipe;import edu.umass.cs.mallet.base.pipe.PipeOutputAccumulator;import edu.umass.cs.mallet.base.pipe.SerialPipes;import edu.umass.cs.mallet.base.pipe.TokenSequence2FeatureSequence;import edu.umass.cs.mallet.base.pipe.FeatureSequence2FeatureVector;import edu.umass.cs.mallet.base.pipe.Target2Label;import edu.umass.cs.mallet.base.pipe.iterator.PipeInputIterator;import edu.umass.cs.mallet.base.pipe.iterator.RandomTokenSequenceIterator;import edu.umass.cs.mallet.base.util.MalletLogger;import edu.umass.cs.mallet.base.util.PropertyList;import edu.umass.cs.mallet.base.util.Random;import edu.umass.cs.mallet.base.util.DoubleList;import edu.umass.cs.mallet.base.types.Instance;import java.util.logging.*;import java.io.*;import java.lang.Runtime;/**	 xxx .split() methods still unreliable	 An InstanceList which avoids OutOfMemoryErrors by saving Instances	 to disk when there is not enough memory to create a new	 Instance. It implements a fixed-size paging scheme, where each page	 on disk stores <code>instancesPerPage</code> Instances. So, while	 the number of Instances per pages is constant, the size in bytes of	 each page may vary. Using this class instead of InstanceList means	 the number of Instances you can store is essentially limited only	 by disk size (and patience).	 The paging scheme is optimized for the most frequent case of	 looping through the InstanceList from index 0 to n. If there are n	 instances, then instances 0->(n/size()) are stored together on page	 1, instances (n/size)+1 -> 2*(n/size) are on page 2, ... etc. This	 way, pages adjacent in the <code>instances</code> list will usually	 be in the same page.	 The paging scheme also tries to only keep one page in memory at a	 time. The justification for this is that the page size is near the	 limit of the maximum number of instances that can be kept in	 memory. Since we assume the frequent case is looping from instance	 0 to n, keeping other Instances in memory will be a waste of	 resources.	 	 About <code>instancesPerPage</code> -- If	 <code>instancesPerPage</code> = -1, then its value will be set	 automatically by the following: When the first OutOfMemoryError is	 thrown, count how many instances are currently in memory, then	 divide by two. This is a conservative estimate of how many Instance	 objects can fit in memory simultaneously. If you know this value	 beforehand, simply pass it to the constructor.	 NOTE: The event which causes an OutOfMemoryError is the	 instantiation of a new Instance, _not_ the addition of this	 Instance to an InstanceList. Therefore, if you want to avoid	 OutOfMemoryErrors, let PagedInstanceList instantiate the new	 Instance for you. IOW, do this:	 Pipe p = ...;	 PagedInstanceList ilist = new PagedInstanceList (p);	 ilist.add (data, target, name, source);	 Or This	 PipeInputIterator iter = ...;	 Pipe p = ...;	 PagedInstanceList ilist = new PagedInstanceList (p);	 ilist.add (iter);	 	 But Not This:	 Pipe p = ...;	 PagedInstanceList ilist = new PagedInstanceList (p);	 ilist.add (new Instance (data, target, name, source));	 If memory is low, the last example will throw an OutOfMemoryError	 before control has been passed to PagedInstanceList to catch the	 error.	 NOTE ALSO: To save write time, we do not write the same Instance to	 disk more than once, i.e., there are no dirty bits or	 write-throughs. Thus, this assumes that after an Instance has been	 passed through its Pipe, it is no longer modified. One way around	 this is to call PagedInstanceList.setInstance (Instance inst),	 which _will_ overwrite an Instance that has been paged to disk.	 	 @see InstanceList   @author Aron Culotta <a href="mailto:culotta@cs.umass.edu">culotta@cs.umass.edu</a> */public class PagedInstanceList extends InstanceList{	private static Logger logger = MalletLogger.getLogger(PagedInstanceList.class.getName());	/** number of instances to put in one page. if -1, determine at	 * first call to <code>swapOutExcept</code> */	int instancesPerPage;	/** directory to store swap files */	File swapDir;		/** inMemory.get(i) == true iff instances.get(i) is in memory, else 0 */	BitSet inMemory;	/**  pageNotInMemory.get(i) == true iff page i is not in memory,	 *  else 0 */	BitSet pageNotInMemory;	/** recommend garbage collection after every swap out? */	boolean collectGarbage = true;		/** uniquely identifies this InstanceList. Used in creating	 * serialized page name for swap files. */	VMID id = new VMID();	// CONSTRUCTORS		/** Creates a PagedInstanceList where "instancesPerPage" instances	 * are swapped to disk in directory "swapDir" if the amount of free	 * system memory drops below "minFreeMemory" bytes	 * @param pipe instance pipe	 * @param instancesPerPage number of Instances to store in each	 * page. If -1, determine at first call to	 * <code>swapOutExcept</code>	 * @param swapDir where the pages on disk live.	 */	public PagedInstanceList (Pipe pipe, int size, int instancesPerPage, File swapDir)	{		super (pipe, size);		inMemory = new BitSet();		pageNotInMemory = new BitSet();		this.instancesPerPage = instancesPerPage;		this.swapDir = swapDir;		try {			if (!swapDir.exists()) {				swapDir.mkdir();			}		} catch (SecurityException e) {			System.err.println ("No permission to make directory " + swapDir);			System.exit(-1);		}	}	public PagedInstanceList (Pipe pipe, int size) {		this (pipe, size, -1, new File ("."));	}	public PagedInstanceList (Pipe pipe) {		 		this (pipe, 10);	}	public PagedInstanceList () {				this (notYetSetPipe);	}	// SPLITTING AND SAMPLING METHODS	  /**   * Shuffles the elements of this list among several smaller   * lists. Overrides InstanceList.split to add instances in original   * order, to prevent thrashing.   * @param proportions A list of numbers (not necessarily summing to 1) which,   * when normalized, correspond to the proportion of elements in each returned   * sublist.   * @param r The source of randomness to use in shuffling.   * @return one <code>InstanceList</code> for each element of <code>proportions</code>   */	public InstanceList[] split (java.util.Random r, double[] proportions)	{    ArrayList shuffled = new ArrayList (size());		for (int i=0; i < size(); i++)			shuffled.add (new Integer (i));		Collections.shuffle (shuffled, r);    return splitInOrder(shuffled, proportions, this);	}	public InstanceList[] split (double[] proportions)	{		return split (new java.util.Random(System.currentTimeMillis()), proportions);	}  private static InstanceList[] splitInOrder (List instanceIndices, double[] proportions,                                              PagedInstanceList cloneMe) {    double[] maxind = new double[proportions.length]; 		System.arraycopy (proportions, 0, maxind, 0, proportions.length);		PagedInstanceList[] ret = new PagedInstanceList[proportions.length];		ArrayList[] splitIndices = new ArrayList[proportions.length];		DenseVector.normalize(maxind);		// Fill maxind[] with the highest instance index that should go in		// each corresponding returned InstanceList.		for (int i = 0; i < maxind.length; i++) {			// xxx Is it dangerous to share the featureSelection that comes with cloning?			ret[i] = (PagedInstanceList)cloneMe.cloneEmpty();			splitIndices[i] = new ArrayList();			if (i > 0)				maxind[i] += maxind[i-1];		}		for (int i = 0; i < maxind.length; i++)			maxind[i] = Math.rint (maxind[i] * instanceIndices.size());		int j = 0;		// This gives a slight bias toward putting an extra instance in the last InstanceList.		for (int i = 0; i < instanceIndices.size(); i++) {			while (i >= maxind[j])				j++;			splitIndices[j].add(new Integer(i));		}		// now sort each splitIndices so paging is reduced		for (int i=0; i < splitIndices.length; i++) {			Collections.sort(splitIndices[i]);		}		for (int i=0; i < cloneMe.size(); i++) {			Instance tmpi = null;			try { // try to read in instance i				tmpi = cloneMe.getInstance(i);			}			catch (OutOfMemoryError e) {				tmpi = null;				System.gc();				logger.warning ("Caught " + e + " while splitting InstanceList. Paging out instances in all lists and retrying...");				cloneMe.swapOutAll();				for (int x=0; x < ret.length; x++) {					if (ret[x].size() > 0) {						System.out.println ("Swapping out ilist " + x);						ret[x].swapOutAll();					}				}				System.gc();				try {					tmpi = cloneMe.getInstance(i);				}				catch (OutOfMemoryError ee) {					logger.warning ("Still can't free enough memory to read in instance " +													i + " while splitting. Try using smaller value for \"instancesPerPage\".");					System.exit(-1);				}							}			tmpi = tmpi.shallowCopy();			boolean found = false;			for (int ii=0; ii < splitIndices.length; ii++) {				if (splitIndices[ii].size()==0)					continue;				int index = ((Integer)splitIndices[ii].get(0)).intValue();				if (index==i) {					logger.info ("adding instance " + i + " to split ilist " + ii);					found = true; 					ret[ii].add (tmpi);				 	splitIndices[ii].remove(0);				}				if (!found)					throw new IllegalStateException ("Error splitting instances.");			} 		}			 	return ret;	}		  /** Returns a pair of new lists such that the first list in the   * pair contains every <code>m</code>th element of this list,   * starting with the first.  The second list contains all remaining   * elements. Overrides InstanceList.splitByModulo to use   * PagedInstanceLists.   */	public InstanceList[] splitByModulo (int m)	{		PagedInstanceList[] ret = new PagedInstanceList[2];		ret[0] = (PagedInstanceList)this.cloneEmpty();		ret[1] = (PagedInstanceList)this.cloneEmpty();		for (int i = 0; i < this.size(); i++) {			if (i % m == 0)				ret[0].instances.add (this.getInstance(i));			else				ret[1].instances.add (this.getInstance(i));		}		return ret;	}	/** Overridden to add samples in original order to reduce	 * thrashing. */	public InstanceList sampleWithReplacement (java.util.Random r, int numSamples)	{		PagedInstanceList ret = (PagedInstanceList)this.cloneEmpty();		ArrayList indices = new ArrayList (numSamples);		for (int i=0; i < numSamples; i++)			indices.add (new Integer (r.nextInt(size())));		Collections.sort (indices);		for (int i = 0; i < indices.size(); i++)			ret.instances.add (this.getInstance(((Integer)indices.get(i)).intValue()));		return ret;	}  /**   * Returns an <code>InstanceList</code> of the same size, where the instances come from the   * random sampling (with replacement) of this list using the given weights.   * The length of the weight array must be the same as the length of this list   * The new instances all have their weights set to one.   */  // added by Gary - ghuang@cs.umass.edu  public InstanceList sampleWithWeights(java.util.Random r, double[] weights)   {    if (weights.length != size())		  throw new IllegalArgumentException("length of weight vector must equal number of instances");		if (size() == 0)		  return cloneEmpty();				double sumOfWeights = 0;		for (int i = 0; i < size(); i++) {		  if (weights[i] < 0)				throw new IllegalArgumentException("weight vector must be non-negative");		  sumOfWeights += weights[i];		}		if (sumOfWeights <= 0)		  throw new IllegalArgumentException("weights must sum to positive value");				PagedInstanceList newList = new PagedInstanceList();		double[] probabilities = new double[size()];		double sumProbs = 0;		for (int i = 0; i < size(); i++) {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?