⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 instancelist.java

📁 这是一个matlab的java实现。里面有许多内容。请大家慢慢琢磨。
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).   http://www.cs.umass.edu/~mccallum/mallet   This software is provided under the terms of the Common Public License,   version 1.0, as published by http://www.opensource.org.  For further   information, see the file `LICENSE' included with this distribution. */package edu.umass.cs.mallet.base.types;import java.util.List;import java.util.ArrayList;import java.util.Collections;import java.util.Iterator;import edu.umass.cs.mallet.base.types.Labeling;import edu.umass.cs.mallet.base.pipe.Pipe;import edu.umass.cs.mallet.base.pipe.PipeOutputAccumulator;import edu.umass.cs.mallet.base.pipe.SerialPipes;import edu.umass.cs.mallet.base.pipe.TokenSequence2FeatureSequence;import edu.umass.cs.mallet.base.pipe.FeatureSequence2FeatureVector;import edu.umass.cs.mallet.base.pipe.Target2Label;import edu.umass.cs.mallet.base.pipe.iterator.PipeInputIterator;import edu.umass.cs.mallet.base.pipe.iterator.RandomTokenSequenceIterator;import edu.umass.cs.mallet.base.util.MalletLogger;import edu.umass.cs.mallet.base.util.PropertyList;import edu.umass.cs.mallet.base.util.Random;import edu.umass.cs.mallet.base.util.DoubleList;import edu.umass.cs.mallet.base.types.Instance;import java.util.logging.*;import java.io.*;/**	 A list of machine learning instances, typically used for training	 or testing of a machine learning algorithm.   <p>	 All of the instances in the list will have been passed through the	 same {@link edu.umass.cs.mallet.base.pipe.Pipe}, and thus must also share the same data and target Alphabets.   InstanceList keeps a reference to the pipe and the two alphabets.   <p>   The most common way of adding instances to an InstanceList is through   the <code>add(PipeInputIterator)</code> method. PipeInputIterators are a way of mapping general   data sources into instances suitable for processing through a pipe.     
As each {@link edu.umass.cs.mallet.base.types.Instance} is pulled from the PipeInputIterator, the InstanceList     copies the instance and runs the copy through its pipe (with resultant     destructive modifications) before saving the modified instance on its list.     This is the  usual way in which instances are transformed by pipes.     <p>     InstanceList also contains methods for randomly generating lists of     feature vectors; splitting lists into non-overlapping subsets (useful     for test/train splits), and iterators for cross validation.   @see Instance   @see Pipe   @see PipeInputIterator   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a> */public class InstanceList implements Serializable, PipeOutputAccumulator{	private static Logger logger = MalletLogger.getLogger(InstanceList.class.getName());	ArrayList instances;	DoubleList instanceWeights = null;	FeatureSelection featureSelection = null;	FeatureSelection[] perLabelFeatureSelection = null;	Pipe pipe;	Alphabet dataVocab, targetVocab;	Class dataClass = null;	Class targetClass = null;  /**   * Creates a list with the given pipe and initial capacity   * where all added instances are passed through the specified pipe.   * @param pipe The pipe through which all added instances will be passed.   */  // XXX not very useful, should perhaps be removed	public InstanceList (Pipe pipe, int capacity)	{		this.pipe = pipe;		this.instances = new ArrayList (capacity);	}  /**   * Creates a list with the given pipe.   * @param pipe The pipe through which all added instances will be passed.   
*/	public InstanceList (Pipe pipe)	{		this (pipe, 10);	}	/** <p>Creates a list which will not pass added instances through a pipe.</p>   *   * <p>Used in those infrequent circumstances when the <code>InstanceList</code>   * has no pipe, and objects containing vocabularies are entered   * directly into the <code>InstanceList</code>; for example, the creation of a   * random <code>InstanceList using <code>Dirichlet</code>s and   * <code>Multinomial</code>s.</p>   *   * @param dataVocab The vocabulary for added instances' data fields   * @param targetVocab The vocabulary for added instances' targets   */	public InstanceList (Alphabet dataVocab, Alphabet targetVocab)	{		this (null, 10);		this.dataVocab = dataVocab;		this.targetVocab = targetVocab;	}	private static class NotYetSetPipe extends Pipe	{		public Instance pipe (Instance carrier)	{			throw new UnsupportedOperationException (				"The InstanceList has yet to have its pipe set; "+				"this could happen by calling InstanceList.add(InstanceList)");		}	}	static final Pipe notYetSetPipe = new NotYetSetPipe();  /** Creates a list which must have its pipe set later. */	public InstanceList ()	{		this (notYetSetPipe);	}	/**   * Creates a list consisting of randomly-generated   * <code>FeatureVector</code>s.   
*/	// xxx Perhaps split these out into a utility class	public InstanceList (Random r,											 // the generator of all random-ness used here											 Dirichlet classCentroidDistribution,											 // includes a Alphabet											 double classCentroidAverageAlphaMean,											 // Gaussian mean on the sum of alphas											 double classCentroidAverageAlphaVariance,											 // Gaussian variance on the sum of alphas											 double featureVectorSizePoissonLambda,											 double classInstanceCountPoissonLambda,											 String[] classNames)	{		this (new SerialPipes (new Pipe[]	{			new TokenSequence2FeatureSequence (),			new FeatureSequence2FeatureVector (),			new Target2Label()}));		//classCentroidDistribution.print();		PipeInputIterator iter = new RandomTokenSequenceIterator (			r, classCentroidDistribution,			classCentroidAverageAlphaMean, classCentroidAverageAlphaVariance,			featureVectorSizePoissonLambda, classInstanceCountPoissonLambda,			classNames);		this.add (iter);	}	private static Alphabet dictOfSize (int size)	{		Alphabet ret = new Alphabet ();		for (int i = 0; i < size; i++)			ret.lookupIndex ("feature"+i);		return ret;	}	private static String[] classNamesOfSize (int size)	{		String[] ret = new String[size];		for (int i = 0; i < size; i++)			ret[i] = "class"+i;		return ret;	}	public InstanceList (Random r, Alphabet vocab, String[] classNames,											 int meanInstancesPerLabel)	{		this (r, new Dirichlet(vocab, 2.0),					30, 0,					10, meanInstancesPerLabel, classNames);	}			public InstanceList (Random r, int vocabSize, int numClasses)	{		this (r, new Dirichlet(dictOfSize(vocabSize), 2.0),					30, 0,					10, 20, classNamesOfSize(numClasses));	}	public InstanceList shallowClone ()	{		InstanceList ret = new InstanceList (pipe, instances.size());		for (int i = 0; i < instances.size(); i++)			ret.add ((Instance)instances.get(i));		if (instanceWeights == null)			ret.instanceWeights = null;		else			ret.instanceWeights = 
instanceWeights.cloneDoubleList();		return ret;	}	// Intentionally add some noise into the data.  // return the real random ratio	// added by Fuchun Peng, Sept. 2003	public double noisify(double ratio)	{//		ArrayList new_instances = new ArrayList( instances.size() );				assert(ratio >= 0 && ratio <= 1);		int instance_size = instances.size();		int noise_instance_num = (int)( ratio * instance_size);		java.util.Random r = new java.util.Random ();//		System.out.println(noise_instance_num + "/" + instance_size);		ArrayList randnumlist = new ArrayList(noise_instance_num);		for(int i=0; i<noise_instance_num; i++){			int randIndex = r.nextInt(instance_size);			//	System.out.println(i + ": " + randIndex );					Integer nn = new Integer(randIndex);				if(randnumlist.indexOf(nn) != -1){				i--;			}			else{				randnumlist.add(nn);			}		}			LabelAlphabet targets = (LabelAlphabet) pipe.getTargetAlphabet();		int realRandNum = 0;		for(int i=0; i<randnumlist.size(); i++){			int index = ((Integer)randnumlist.get(i)).intValue();			Instance inst = getInstance( index );			int randIndex = r.nextInt( targets.size() );//			System.out.println(i + ": " +  index +": " + inst.getTarget().toString()//						+ " : " + targets.lookupLabel(randIndex) );			String oldTargetStr = inst.getTarget().toString();			String newTargetStr = targets.lookupLabel(randIndex).toString();			if(!oldTargetStr.equals(newTargetStr)){				inst.unLock();					inst.setTarget(targets.lookupLabel(randIndex));				inst.setLock();								realRandNum ++;			}  //                      System.out.println(i + ": " +  index +": " + inst.getTarget().toString()   //                                              + " : " + targets.lookupObject(randIndex) );			instances.set(index, inst);		}				double realRatio = (double)realRandNum/instance_size;			return realRatio;	}	public InstanceList cloneEmpty ()	{		InstanceList ret = new InstanceList (pipe);		ret.instanceWeights = instanceWeights == null ? 
null : (DoubleList) instanceWeights.clone();		// xxx Should the featureSelection and perLabel... be cloned?		// Note that RoostingTrainer currently depends on not cloning its splitting.		ret.featureSelection = this.featureSelection;		ret.perLabelFeatureSelection = this.perLabelFeatureSelection;		ret.dataClass = this.dataClass;		ret.targetClass = this.targetClass;		ret.dataVocab = this.dataVocab;		ret.targetVocab = this.targetVocab;		return ret;	}  /**   * Shuffles the elements of this list among several smaller lists.   * @param proportions A list of numbers (not necessarily summing to 1) which,   * when normalized, correspond to the proportion of elements in each returned   * sublist.   * @param r The source of randomness to use in shuffling.   * @return one <code>InstanceList</code> for each element of <code>proportions</code>   */	public InstanceList[] split (java.util.Random r, double[] proportions)	{    ArrayList shuffled = (ArrayList) this.instances.clone();		Collections.shuffle (shuffled, r);    return splitInOrder(shuffled, proportions, this);	}	public InstanceList[] split (double[] proportions)	{		return split (new java.util.Random(System.currentTimeMillis()), proportions);	}  /** Chops this list into several sequential sublists.   * @param proportions A list of numbers corresponding to the proportion of   * elements in each returned sublist.   
* @return one <code>InstanceList</code> for each element of <code>proportions</code>
   */
	public InstanceList[] splitInOrder (double[] proportions)
	{
		// Delegate to the static helper with this list's own instances, in
		// their current order, and this list as the cloneMe argument.
		return splitInOrder(this.instances, proportions, this);
	}

	private static InstanceList[] splitInOrder (List instances, double[] proportions,
	                                            InstanceList cloneMe) {
		// Work on a copy so the caller's proportions array is left alone
		// (presumably DenseVector.normalize rescales its argument in place -- confirm).
		double[] maxind = new double[proportions.length];
		System.arraycopy (proportions, 0, maxind, 0, proportions.length);
		InstanceList[] ret = new InstanceList[proportions.length];
		DenseVector.normalize(maxind);
		// Fill maxind[] with the highest instance index that should go in
		// each corresponding returned InstanceList.
		for (int i = 0; i < maxind.length; i++) {
			// xxx Is it dangerous to share the featureSelection that comes with cloning?

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -