rulestats.java

来自「wekaUT是 university texas austin 开发的基于wek」· Java 代码 · 共 898 行 · 第 1/2 页
JAVA
898 行
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    RuleStats.java *    Copyright (C) 2001 Xin Xu */package weka.classifiers.rules;import java.util.Enumeration;import java.util.Random;import weka.core.FastVector;import weka.core.Instances;import weka.core.Instance;import weka.core.Attribute;import weka.core.AttributeStats;import weka.core.Utils;/** * This class implements the statistics functions used in the  * propositional rule learner, from the simpler ones like count of * true/false positive/negatives, filter data based on the ruleset, etc. * to the more sophisticated ones such as MDL calculation and rule * variants generation for each rule in the ruleset. <p> * * Obviously the statistics functions listed above need the specific * data and the specific ruleset, which are given in order to instantiate * an object of this class. <p> *   * @author Xin Xu (xx5@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ */public class RuleStats{  /** The data on which the stats calculation is based */  private Instances m_Data;  /** The specific ruleset in question */  private FastVector m_Ruleset;  /** The simple stats of each rule */  private FastVector m_SimpleStats;  /** The set of instances filtered by the ruleset */  private FastVector m_Filtered;      /** The total number of possible conditions that could   *  appear in a rule */  private double m_Total;   /** The redundancy factor in theory description length */	  private static double REDUNDANCY_FACTOR = 0.5;      /** The theory weight in the MDL calculation */  private double MDL_THEORY_WEIGHT = 1.0;    /** The class distributions predicted by each rule */    private FastVector m_Distributions;  /** Default constructor */  public RuleStats(){    m_Data = null;    m_Ruleset = null;    m_SimpleStats = null;    m_Filtered = null;    m_Distributions = null;    m_Total = -1;  }      /**    * Constructor that provides ruleset and data    *   * @param data the data   * @param rules the ruleset   */  public RuleStats(Instances data, FastVector rules){    this();    m_Data = data;    m_Ruleset = rules;	  }  /**   * Set the number of all conditions that could appear    * in a rule in this RuleStats object, if the number set   * is smaller than 0 (typically -1), then it calcualtes   * based on the data store    *   * @param total the set number   */  public void setNumAllConds(double total){    if(total < 0)      m_Total = numAllConditions(m_Data);        else      m_Total = total;  }      /**    * Set the data of the stats, overwriting the old one if any    *   * @param data the data to be set   */  public void setData(Instances data){    m_Data = data;  }      /**    * Get the data of the stats    *   * @return the data    */  public Instances getData(){    return m_Data;  }      /**    * Set the ruleset of the stats, overwriting the old one if any    *         * @param rules the set of rules to be set   */  public void setRuleset(FastVector rules){    m_Ruleset = rules;  }      /**    * Get the ruleset of the stats    *         * @return the set of rules    */  public FastVector getRuleset(){    return m_Ruleset;  }  /**    * Get the size of the ruleset in the stats    *         * @return the size of ruleset    */  public int getRulesetSize(){    return m_Ruleset.size();  }    /**    * Get the simple stats of one rule, including 6 parameters:   * 0: coverage; 1:uncoverage; 2: true positive; 3: true negatives;   * 4: false positives; 5: false negatives   *   * @param index the index of the rule   * @return the stats   */  public double[] getSimpleStats(int index){    if((m_SimpleStats != null) && (index < m_SimpleStats.size()))      return (double[])m_SimpleStats.elementAt(index);	    return null;  }      /**   * Get the data after filtering the given rule   *   * @param index the index of the rule   * @return the data covered and uncovered by the rule   */  public Instances[] getFiltered(int index){	    if((m_Filtered != null) && (index < m_Filtered.size()))      return (Instances[])m_Filtered.elementAt(index);	    return null;  }        /**     * Get the class distribution predicted by the rule in     * given position     *     * @param index the position index of the rule     * @return the class distributions     */    public double[] getDistributions(int index){		if((m_Distributions != null) && (index < m_Distributions.size()))	    return (double[])m_Distributions.elementAt(index);		return null;    }          /**   * Set the weight of theory in MDL calcualtion   *   * @param weight the weight to be set   */  public void setMDLTheoryWeight(double weight){    MDL_THEORY_WEIGHT = weight;  }   /**   * Compute the number of all possible conditions that could    * appear in a rule of a given data.  For nominal attributes,   * it's the number of values that could appear; for numeric    * attributes, it's the number of values * 2, i.e. <= and >=   * are counted as different possible conditions.   *   * @param data the given data   * @return number of all conditions of the data   */  public static double numAllConditions(Instances data){    double total = 0;    Enumeration attEnum = data.enumerateAttributes();	    while(attEnum.hasMoreElements()){      Attribute att= (Attribute)attEnum.nextElement();      if(att.isNominal())	total += (double)att.numValues();      else	total += 2.0 * (double)data.numDistinctValues(att);	    }    return total;  }  /**   * Filter the data according to the ruleset and compute the basic   * stats: coverage/uncoverage, true/false positive/negatives of    * each rule   */  public void countData(){    if((m_Filtered != null) ||       (m_Ruleset == null)  ||       (m_Data == null))      return;	    int size = m_Ruleset.size();    m_Filtered = new FastVector(size);    m_SimpleStats = new FastVector(size);    m_Distributions = new FastVector(size);    Instances data = new Instances(m_Data);	    for(int i=0; i < size; i++){      double[] stats = new double[6];  // 6 statistics parameters      double[] classCounts = new double[m_Data.classAttribute().numValues()];      Instances[] filtered = computeSimpleStats(i, data, stats, classCounts);      m_Filtered.addElement(filtered);      m_SimpleStats.addElement(stats);      m_Distributions.addElement(classCounts);      data = filtered[1];  // Data not covered    }	  }        /**     * Count data from the position index in the ruleset     * assuming that given data are not covered by the rules     * in position 0...(index-1), and the statistics of these     * rules are provided.<br>     * This procedure is typically useful when a temporary      * object of RuleStats is constructed in order to efficiently     * calculate the relative DL of rule in position index,      * thus all other stuff is not needed.     *     * @param index the given position     * @param uncovered the data not covered by rules before index     * @param prevRuleStats the provided stats of previous rules     */    public void countData(int index, Instances uncovered, 			  double[][] prevRuleStats){	if((m_Filtered != null) ||	   (m_Ruleset == null))	    return;		int size = m_Ruleset.size();	m_Filtered = new FastVector(size);	m_SimpleStats = new FastVector(size);	Instances[] data = new Instances[2];	data[1] = uncovered;		for(int i=0; i < index; i++){	    m_SimpleStats.addElement(prevRuleStats[i]);	    if(i+1 == index)		m_Filtered.addElement(data);	    else		m_Filtered.addElement(new Object()); // Stuff sth.	}		for(int j=index; j < size; j++){	    double[] stats = new double[6];  // 6 statistics parameters	    Instances[] filtered = computeSimpleStats(j, data[1], stats, null);	    m_Filtered.addElement(filtered);	    m_SimpleStats.addElement(stats);	    data = filtered;  // Data not covered	}	    }      /**   * Find all the instances in the dataset covered/not covered by    * the rule in given index, and the correponding simple statistics   * and predicted class distributions are stored in the given double array,   * which can be obtained by getSimpleStats() and getDistributions().<br>   *    * @param index the given index, assuming correct   * @param insts the dataset to be covered by the rule   * @param stats the given double array to hold stats, side-effected   * @param dist the given array to hold class distributions, side-effected   *             if null, the distribution is not necessary    * @return the instances covered and not covered by the rule   */  private Instances[] computeSimpleStats(int index, Instances insts, 					 double[] stats, double[] dist){    Rule rule = (Rule)m_Ruleset.elementAt(index);	    Instances[] data = new Instances[2];    data[0] = new Instances(insts, insts.numInstances());    data[1] = new Instances(insts, insts.numInstances());    for(int i=0; i<insts.numInstances(); i++){      Instance datum = insts.instance(i);      double weight = datum.weight();      if(rule.covers(datum)){	data[0].add(datum);        // Covered by this rule	stats[0] += weight;        // Coverage	if((int)datum.classValue() == (int)rule.getConsequent())	  stats[2] += weight;    // True positives	else	  stats[4] += weight;    // False positives	if(dist != null)	    dist[(int)datum.classValue()] += weight;      }      else{	data[1].add(datum);        // Not covered by this rule	stats[1] += weight; 	if((int)datum.classValue() != (int)rule.getConsequent())	  stats[3] += weight;    // True negatives	else	  stats[5] += weight;    // False negatives	          }    }        return data;  }          /**    * Add a rule to the ruleset and update the stats   *   * @param the rule to be added   */  public void addAndUpdate(Rule lastRule){    if(m_Ruleset == null)      m_Ruleset = new FastVector();    m_Ruleset.addElement(lastRule);      Instances data = (m_Filtered == null) ?      m_Data : ((Instances[])m_Filtered.lastElement())[1];    double[] stats = new double[6];    double[] classCounts = new double[m_Data.classAttribute().numValues()];    Instances[] filtered = 	computeSimpleStats(m_Ruleset.size()-1, data, stats, classCounts);        if(m_Filtered == null)	m_Filtered = new FastVector();	    m_Filtered.addElement(filtered);    if(m_SimpleStats == null)      m_SimpleStats = new FastVector();	    m_SimpleStats.addElement(stats);    if(m_Distributions == null)	m_Distributions = new FastVector();    m_Distributions.addElement(classCounts);  }          /**   * Subset description length: <br>   * S(t,k,p) = -k*log2(p)-(n-k)log2(1-p)   *   * Details see Quilan: "MDL and categorical theories (Continued)",ML95   *   * @param t the number of elements in a known set   * @param k the number of elements in a subset   * @param p the expected proportion of subset known by recipient   */  public static double subsetDL(double t, double k, double p){    double rt = Utils.gr(p, 0.0) ? (- k*Utils.log2(p)) : 0.0;    rt -= (t-k)*Utils.log2(1-p);    return rt;  }          /**    * The description length of the theory for a given rule.  Computed as:<br>   *                 0.5* [||k||+ S(t, k, k/t)]<br>   * where k is the number of antecedents of the rule; t is the total   * possible antecedents that could appear in a rule; ||K|| is the    * universal prior for k , log2*(k) and S(t,k,p) = -k*log2(p)-(n-k)log2(1-p)   * is the subset encoding length.<p>   *   * Details see Quilan: "MDL and categorical theories (Continued)",ML95   *   * @param index the index of the given rule (assuming correct)   * @exception if index out of range or object not initialized yet   * @return the theory DL, weighted if weight != 1.0   */  public double theoryDL(int index){	    double k = ((Rule)m_Ruleset.elementAt(index)).size();	    if(k == 0)      return 0.0;	    double tdl = Utils.log2(k);               	        if(k > 1)                           // Approximation      tdl += 2.0 * Utils.log2(tdl);   // of log2 star	    tdl += subsetDL(m_Total, k, k/m_Total);    //System.out.println("!!!theory: "+MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl);    return MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl;  }       /**    * The description length of data given the parameters of the data   * based on the ruleset. <p>   * Details see Quinlan: "MDL and categorical theories (Continued)",ML95<p>   *   * @param expFPOverErr expected FP/(FP+FN)   * @param cover coverage   * @param uncover uncoverage   * @param fp False Positive   * @param fn False Negative   */  public static double dataDL(double expFPOverErr, double cover, 			      double uncover, double fp, double fn){    double totalBits = Utils.log2(cover+uncover+1.0); // how many data?
rulestats.java - 源码说明

本页面展示了「wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器」中的 rulestats.java 源码文件，采用 Java 编程语言编写，共 898 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与university相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?