budgetedlearningcurveresultproducer.java

来自「wekaUT是 university texas austin 开发的基于wek」· Java 代码 · 共 1,354 行 · 第 1/3 页
JAVA
1,354 行
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    BudgetedLearningCurveResultProducer.java
 *    Copyright (C) 2005 Prem Melville
 *
 */

//////////////////////////////////
// WARNING: UNDER DEVELOPMENT
//////////////////////////////////

package weka.experiment;

import java.util.*;
import java.io.*;
import weka.classifiers.*;
import weka.core.*;

/**
 * Does an N-fold cross-validation, but generates a learning curve by
 * also varying the number of training examples. Creates a split that
 * uses increasingly larger fractions of the full training set from
 * the N fold but always using the same N-fold test set for
 * testing. If this is applied to an active learner, then the training
 * examples are selected actively by the learner from the pool of
 * unlabeled examples. If this is not used with an active learner, it
 * should produce the same results as the
 * LearningCurveCrossValidationResultProducer.
 *
 * <p>NOTE(review): the size helpers in this class (maxNumQueries,
 * lowerSize, stepSize) scale by the number of non-class attributes, so
 * the curve appears to be indexed by feature-value queries rather than
 * by whole instances -- confirm against doRun.
 *
 * @author Prem Melville (melville@cs.utexas.edu)
 */
public class BudgetedLearningCurveResultProducer
    implements ResultProducer, OptionHandler, AdditionalMeasureProducer {

    /**
     * The directory used when loading feature cost files on demand, null indicates
     * current directory. Defaults to the "featureCosts" sub-directory of the
     * JVM working directory (system property "user.dir").
     */
    protected File m_FeatureCostDirectory = new File(System.getProperty("user.dir")+"/featureCosts");

    /**
     * Costs for acquiring each feature. Filled by loadFeatureCosts() with one
     * entry per non-class attribute; null until a dataset has been set.
     */
    protected double []m_FeatureCosts = null;

    /** The dataset of interest */
    protected Instances m_Instances;

    /** The ResultListener to send results to */
    protected ResultListener m_ResultListener = new CSVResultListener();

    /** The number of folds in the cross-validation */
    protected int m_NumFolds = 10;

    /** Save raw output of split evaluators --- for debugging purposes */
    protected boolean m_debugOutput = false;

    /** The output zipper to use for saving raw splitEvaluator output */
    protected OutputZipper m_ZipDest = null;

    /** The destination output file/directory for raw output */
    protected File m_OutputFile = new File(
                                           new File(System.getProperty("user.dir")),
                                           "splitEvalutorOut.zip");

    /** The SplitEvaluator used to generate results */
    // Previous default, kept for reference: new ClassifierSplitEvaluator()
    protected SplitEvaluator m_SplitEvaluator = new FeatureCostSensitiveClassifierSplitEvaluator();

    /** The names of any additional measures to look for in SplitEvaluators */
    protected String [] m_AdditionalMeasures = null;

    /**
     * The minimum number of instances to use. If this is zero, the first
     * step will contain m_StepSize instances.
     * NOTE(review): lowerSize(double) scales a non-integral value by
     * (training-fold size x number of features), so the unit looks like
     * feature queries rather than instances -- confirm.
     */
    protected double m_LowerSize = 0;

    /**
     * The maximum number of instances to use. -1 indicates no maximum
     * (other than the total number of instances).
     * NOTE(review): maxNumQueries() ignores this value whenever explicit
     * plot points are set, and scales a non-integral value by
     * (training-fold size x number of features) -- confirm the intended unit.
     */
    protected double m_UpperSize = -1;

    /**
     * The number of instances to add at each step. stepSize(double) never
     * returns less than 1 for this value.
     */
    protected double m_StepSize = 1;

    /**
     * The specific points to plot, either integers representing specific numbers of training examples,
     * or decimal fractions representing percentages of the full training set
     */
    protected double[] m_PlotPoints;

    /** The current dataset size during stepping */
    protected int m_CurrentSize = 0;

    /** Cost to build current model */
    protected double m_Cost = 0.0;

    /** The name of the key field containing the dataset name */
    public static String DATASET_FIELD_NAME = "Dataset";

    /** The name of the key field containing the run number */
    public static String RUN_FIELD_NAME = "Run";

    /** The name of the key field containing the fold number */
    public static String FOLD_FIELD_NAME = "Fold";

    /** The name of the result field containing the timestamp */
    public static String TIMESTAMP_FIELD_NAME = "Date_time";

    /** The name of the result field containing the sample selection time */
    public static String SELECTION_TIME_FIELD_NAME = "Selection_time";

    /** The name of the key field containing the learning rate step number */
    public static String STEP_FIELD_NAME = "Total_instances";

    /** The name of the key field containing the fraction of total instances used */
    public static String FRACTION_FIELD_NAME = "Fraction_instances";

    /** The name of the field reporting the model cost (presumably the value of m_Cost -- confirm against doRun) */
    public static String COST_FIELD_NAME = "Model_cost";

    /** Extension for feature cost files; appended to the relation name by loadFeatureCosts() */
    public static String FEATURE_COST_FILE_EXTENSION = ".featureCosts";

    /** Indicates whether fractions or actual number of instances have been specified */
    protected boolean m_IsFraction = false;

    /**
     * Returns a string describing this result producer
     * @return a description of the result producer suitable for
     * displaying in the explorer/experimenter gui
     */
public String globalInfo() {	return "Performs a learning-curve cross validation run using a supplied "	    +"split evaluator. Trains on increasing subsets of the training data for each split, "	    +"repeatedly testing on the test set for that split after training on subsets of various sizes.";    }            /**     * Sets the dataset that results will be obtained for.     *     * @param instances a value of type 'Instances'.     */    public void setInstances(Instances instances) {    	m_Instances = instances;	loadFeatureCosts();    }                /**     * Returns the directory that will be searched for feature cost files when     * loading on demand.     *     * @return The cost file search directory.     */    public File getFeatureCostDirectory() {	return m_FeatureCostDirectory;    }        /**     * Sets the directory that will be searched for feature cost files when     * loading on demand.     *     * @param newDir The cost file search directory.     */    public void setFeatureCostDirectory(File newDir) {	if (newDir.isDirectory()) {	    m_FeatureCostDirectory = newDir;	} else {	    m_FeatureCostDirectory = new File(newDir.getParent());	}    }        //Read feature costs from file    protected void loadFeatureCosts(){	try{	    String costName = m_Instances.relationName() + FEATURE_COST_FILE_EXTENSION;	    File costFile = new File(getFeatureCostDirectory(), costName);	    if (!costFile.exists()) {		throw new Exception("Feature cost file doesn't exist: " + costFile);	    }	    int numFeatures = numFeatures();	    m_FeatureCosts = new double[numFeatures];	    	    BufferedReader bin = new BufferedReader(new FileReader(costFile));	    String line = bin.readLine();//read first line - ignore rest of file	    bin.close();	    StringTokenizer st = new StringTokenizer(line);	    int ctr = 0;//count number of costs in the file	    while(st.hasMoreTokens()) {		if(ctr < numFeatures){		    m_FeatureCosts[ctr] = Double.parseDouble(st.nextToken());		    ctr++;		}else{	
	    //throw new Exception("Feature cost file has too many costs.");   		    System.err.println("Feature cost file has too many costs: "+costFile);   		    System.exit(0);		}	    }	    if(ctr < numFeatures){		System.err.println("Feature cost file has too few costs: "+costFile);   		System.exit(0);		//throw new Exception("Feature cost file has too few costs.");   	    }	}catch (Exception e){	    System.err.println (e);	}	//DEBUG	Utils.printArray(m_FeatureCosts);    }        /**     * Sets the object to send results of each run to.     *     * @param listener a value of type 'ResultListener'     */    public void setResultListener(ResultListener listener) {	m_ResultListener = listener;    }        /**     * Set a list of method names for additional measures to look for     * in SplitEvaluators. This could contain many measures (of which only a     * subset may be produceable by the current SplitEvaluator) if an experiment     * is the type that iterates over a set of properties.     * @param additionalMeasures an array of measure names, null if none     */    public void setAdditionalMeasures(String [] additionalMeasures) {	m_AdditionalMeasures = additionalMeasures;	if (m_SplitEvaluator != null) {	    System.err.println("LearningCurveCrossValidationResultProducer: setting additional "			       +"measures for "			       +"split evaluator");	    m_SplitEvaluator.setAdditionalMeasures(m_AdditionalMeasures);	}    }    /**     * Returns an enumeration of any additional measure names that might be     * in the SplitEvaluator     * @return an enumeration of the measure names     */    public Enumeration enumerateMeasures() {	Vector newVector = new Vector();	if (m_SplitEvaluator instanceof AdditionalMeasureProducer) {	    Enumeration en = ((AdditionalMeasureProducer)m_SplitEvaluator).		
enumerateMeasures();	    while (en.hasMoreElements()) {		String mname = (String)en.nextElement();		newVector.addElement(mname);	    }	}	return newVector.elements();    }      /**     * Returns the value of the named measure     * @param measureName the name of the measure to query for its value     * @return the value of the named measure     * @exception IllegalArgumentException if the named measure is not supported     */    public double getMeasure(String additionalMeasureName) {	if (m_SplitEvaluator instanceof AdditionalMeasureProducer) {	    return ((AdditionalMeasureProducer)m_SplitEvaluator).		getMeasure(additionalMeasureName);	} else {	    throw new IllegalArgumentException("LearningCurveCrossValidationResultProducer: "					       +"Can't return value for : "+additionalMeasureName					       +". "+m_SplitEvaluator.getClass().getName()+" "					       +"is not an AdditionalMeasureProducer");	}    }      /**     * Gets a Double representing the current date and time.     * eg: 1:46pm on 20/5/1999 -> 19990520.1346     *     * @return a value of type Double     */    public static Double getTimestamp() {	Calendar now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));	double timestamp = now.get(Calendar.YEAR) * 10000	    + (now.get(Calendar.MONTH) + 1) * 100	    + now.get(Calendar.DAY_OF_MONTH)	    + now.get(Calendar.HOUR_OF_DAY) / 100.0	    + now.get(Calendar.MINUTE) / 10000.0;	return new Double(timestamp);    }      /**     * Prepare to generate results.     *     * @exception Exception if an error occurs during preprocessing.     */    public void preProcess() throws Exception {	if (m_SplitEvaluator == null) {	    throw new Exception("No SplitEvalutor set");	}	if (m_ResultListener == null) {	    throw new Exception("No ResultListener set");	}	m_ResultListener.preProcess(this);    }      /**     * Perform any postprocessing. When this method is called, it indicates     * that no more requests to generate results for the current experiment     * will be sent.  
   *     * @exception Exception if an error occurs     */    public void postProcess() throws Exception {	m_ResultListener.postProcess(this);	if (m_debugOutput) {	    if (m_ZipDest != null) {		m_ZipDest.finished();		m_ZipDest = null;	    }	}    }      /**     * Gets the keys for a specified run number. Different run     * numbers correspond to different randomizations of the data. Keys     * produced should be sent to the current ResultListener     *     * @param run the run number to get keys for.     * @exception Exception if a problem occurs while getting the keys     */    public void doRunKeys(int run) throws Exception {	int numExtraKeys;	if(m_IsFraction)	    numExtraKeys = 5;	else numExtraKeys = 4;		if (m_Instances == null) {	    throw new Exception("No Instances set");	}	if (m_ResultListener == null) {	    throw new Exception("No ResultListener set");	}	for (int fold = 0; fold < m_NumFolds; fold++) {	    int pointNum = 0;	    // For each subsample size	    if (m_PlotPoints != null) {		m_CurrentSize = plotPoint(0);	    }	    else if (m_LowerSize == 0) {		m_CurrentSize = stepSize(m_StepSize);	    } else {		m_CurrentSize = lowerSize(m_LowerSize);	    }	    while (m_CurrentSize <= maxNumQueries()) {		// Add in some fields to the key like run and fold number, dataset name		Object [] seKey = m_SplitEvaluator.getKey();		Object [] key = new Object [seKey.length + numExtraKeys];		key[0] = Utils.backQuoteChars(m_Instances.relationName());		key[1] = "" + run;		key[2] = "" + (fold + 1);		key[3] = "" + m_CurrentSize;		if(m_IsFraction) key[4] = "" + m_PlotPoints[pointNum];		System.arraycopy(seKey, 0, key, numExtraKeys, seKey.length);		if (m_ResultListener.isResultRequired(this, key)) {		    try {			m_ResultListener.acceptResult(this, key, null);		    } catch (Exception ex) {			// Save the train and test datasets for debugging purposes?			
throw ex;		    }		}		if (m_PlotPoints != null) {		    pointNum ++;		    m_CurrentSize = plotPoint(pointNum);		}		else {		    m_CurrentSize += stepSize(m_StepSize);		}	    }	}    }        /**      * Get the maximum number of queries base on the upperSize limit     * or maximum training set size from the n-fold CV      */    protected int maxNumQueries() {	if (m_UpperSize == -1 || m_PlotPoints != null)	    return ((int)(m_Instances.numInstances()*(1 - 1/((double)m_NumFolds))))*numFeatures();	else if(isInteger(m_UpperSize))	    return (int) m_UpperSize;	else return (int) (m_UpperSize*((int)(m_Instances.numInstances()*(1 - 1/((double)m_NumFolds))))*numFeatures());    }        //If lowersize is a fraction, compute as a fraction of total number of features.    protected int lowerSize(double lowerSize){	if(isInteger(lowerSize))	    return (int) lowerSize;	else return (int) (lowerSize * ((int)(m_Instances.numInstances()*(1 - 1/((double)m_NumFolds))))*numFeatures());    }        //If lowersize is a fraction, compute as a fraction of total number of features.    protected int stepSize(double stepSize){	int out;	if(isInteger(stepSize))	    out = (int) stepSize;	else out = (int) (stepSize * ((int)(m_Instances.numInstances()*(1 - 1/((double)m_NumFolds))))*numFeatures());	//step size must at least be 1	if(out<1) out=1;	return out;    }        //Returns the number of features in the datasets (excluding the class attribute)    protected int numFeatures(){	return (m_Instances.numAttributes() - 1);    }        /**     * Gets the results for a specified run number. Different run     * numbers correspond to different randomizations of the data. Results     * produced should be sent to the current ResultListener     *     * @param run the run number to get results for.     * @exception Exception if a problem occurs while getting the results     */
budgetedlearningcurveresultproducer.java - 源码说明

本页面展示了「wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器」中的 budgetedlearningcurveresultproducer.java 源码文件，采用 Java 编程语言编写，共 1,354 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与university相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?