📄 rulestats.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    RuleStats.java
 *    Copyright (C) 2001 Xin Xu
 */

package weka.classifiers.rules;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;

/**
 * This class implements the statistics functions used in the 
 * propositional rule learner, from the simpler ones like count of
 * true/false positive/negatives, filter data based on the ruleset, etc.
 * to the more sophisticated ones such as MDL calculation and rule
 * variants generation for each rule in the ruleset. <p>
 *
 * Obviously the statistics functions listed above need the specific
 * data and the specific ruleset, which are given in order to instantiate
 * an object of this class. <p>
 *  
 * @author Xin Xu (xx5@cs.waikato.ac.nz)
 * @version $Revision$
 */
public class RuleStats implements Serializable {

  /** The data on which the stats calculation is based */
  private Instances m_Data;

  /** The specific ruleset in question */
  private FastVector m_Ruleset;

  /** The simple stats of each rule */
  private FastVector m_SimpleStats;

  /** The set of instances filtered by the ruleset */
  private FastVector m_Filtered;
    
  /** The total number of possible conditions that could
   *  appear in a rule */
  private double m_Total;
 
  /** The redundancy factor in theory description length */	
  private static double REDUNDANCY_FACTOR = 0.5;
    
  /** The theory weight in the MDL calculation */
  private double MDL_THEORY_WEIGHT = 1.0;

    /** The class distributions predicted by each rule */
    private FastVector m_Distributions;

  /** Default constructor */
  public RuleStats(){
    m_Data = null;
    m_Ruleset = null;
    m_SimpleStats = null;
    m_Filtered = null;
    m_Distributions = null;
    m_Total = -1;
  }

    
  /** 
   * Constructor that provides ruleset and data 
   *
   * @param data the data
   * @param rules the ruleset
   */
  public RuleStats(Instances data, FastVector rules){
    this();
    m_Data = data;
    m_Ruleset = rules;	
  }


  /**
   * Set the number of all conditions that could appear 
   * in a rule in this RuleStats object, if the number set
   * is smaller than 0 (typically -1), then it calcualtes
   * based on the data store 
   *
   * @param total the set number
   */
  public void setNumAllConds(double total){
    if(total < 0)
      m_Total = numAllConditions(m_Data);    
    else
      m_Total = total;
  }
    
  /** 
   * Set the data of the stats, overwriting the old one if any 
   *
   * @param data the data to be set
   */
  public void setData(Instances data){
    m_Data = data;
  }
    
  /** 
   * Get the data of the stats 
   *
   * @return the data 
   */
  public Instances getData(){
    return m_Data;
  }

    
  /** 
   * Set the ruleset of the stats, overwriting the old one if any 
   *      
   * @param rules the set of rules to be set
   */
  public void setRuleset(FastVector rules){
    m_Ruleset = rules;
  }

    
  /** 
   * Get the ruleset of the stats 
   *      
   * @return the set of rules 
   */
  public FastVector getRuleset(){
    return m_Ruleset;
  }

  /** 
   * Get the size of the ruleset in the stats 
   *      
   * @return the size of ruleset 
   */
  public int getRulesetSize(){
    return m_Ruleset.size();
  }
  
  /** 
   * Get the simple stats of one rule, including 6 parameters:
   * 0: coverage; 1:uncoverage; 2: true positive; 3: true negatives;
   * 4: false positives; 5: false negatives
   *
   * @param index the index of the rule
   * @return the stats
   */
  public double[] getSimpleStats(int index){
    if((m_SimpleStats != null) && (index < m_SimpleStats.size()))
      return (double[])m_SimpleStats.elementAt(index);
	
    return null;
  }    


  /**
   * Get the data after filtering the given rule
   *
   * @param index the index of the rule
   * @return the data covered and uncovered by the rule
   */
  public Instances[] getFiltered(int index){
	
    if((m_Filtered != null) && (index < m_Filtered.size()))
      return (Instances[])m_Filtered.elementAt(index);
	
    return null;
  }    

    /**
     * Get the class distribution predicted by the rule in
     * given position
     *
     * @param index the position index of the rule
     * @return the class distributions
     */
    public double[] getDistributions(int index){
	
	if((m_Distributions != null) && (index < m_Distributions.size()))
	    return (double[])m_Distributions.elementAt(index);
	
	return null;
    }    
    
  /**
   * Set the weight of theory in MDL calcualtion
   *
   * @param weight the weight to be set
   */
  public void setMDLTheoryWeight(double weight){
    MDL_THEORY_WEIGHT = weight;
  } 

  /**
   * Compute the number of all possible conditions that could 
   * appear in a rule of a given data.  For nominal attributes,
   * it's the number of values that could appear; for numeric 
   * attributes, it's the number of values * 2, i.e. <= and >=
   * are counted as different possible conditions.
   *
   * @param data the given data
   * @return number of all conditions of the data
   */
  public static double numAllConditions(Instances data){
    double total = 0;
    Enumeration attEnum = data.emerateAttributes();	
    while(attEnum.hasMoreElements()){
      Attribute att= (Attribute)attEnum.nextElement();
      if(att.isNominal())
	total += (double)att.numValues();
      else
	total += 2.0 * (double)data.numDistinctValues(att);	
    }
    return total;
  }


  /**
   * Filter the data according to the ruleset and compute the basic
   * stats: coverage/uncoverage, true/false positive/negatives of 
   * each rule
   */
  public void countData(){
    if((m_Filtered != null) ||
       (m_Ruleset == null)  ||
       (m_Data == null))
      return;
	
    int size = m_Ruleset.size();
    m_Filtered = new FastVector(size);
    m_SimpleStats = new FastVector(size);
    m_Distributions = new FastVector(size);
    Instances data = new Instances(m_Data);
	
    for(int i=0; i < size; i++){
      double[] stats = new double[6];  // 6 statistics parameters
      double[] classCounts = new double[m_Data.classAttribute().numValues()];
      Instances[] filtered = computeSimpleStats(i, data, stats, classCounts);
      m_Filtered.addElement(filtered);
      m_SimpleStats.addElement(stats);
      m_Distributions.addElement(classCounts);
      data = filtered[1];  // Data not covered
    }	
  }
    
    /**
     * Count data from the position index in the ruleset
     * assuming that given data are not covered by the rules
     * in position 0...(index-1), and the statistics of these
     * rules are provided.<br>
     * This procedure is typically useful when a temporary 
     * object of RuleStats is constructed in order to efficiently
     * calculate the relative DL of rule in position index, 
     * thus all other stuff is not needed.
     *
     * @param index the given position
     * @param uncovered the data not covered by rules before index
     * @param prevRuleStats the provided stats of previous rules
     */
    public void countData(int index, Instances uncovered, 
			  double[][] prevRuleStats){
	if((m_Filtered != null) ||
	   (m_Ruleset == null))
	    return;
	
	int size = m_Ruleset.size();
	m_Filtered = new FastVector(size);
	m_SimpleStats = new FastVector(size);
	Instances[] data = new Instances[2];
	data[1] = uncovered;
	
	for(int i=0; i < index; i++){
	    m_SimpleStats.addElement(prevRuleStats[i]);
	    if(i+1 == index)
		m_Filtered.addElement(data);
	    else
		m_Filtered.addElement(new Object()); // Stuff sth.
	}
	
	for(int j=index; j < size; j++){
	    double[] stats = new double[6];  // 6 statistics parameters
	    Instances[] filtered = computeSimpleStats(j, data[1], stats, null);
	    m_Filtered.addElement(filtered);
	    m_SimpleStats.addElement(stats);
	    data = filtered;  // Data not covered
	}	
    }
    
  /**
   * Find all the instances in the dataset covered/not covered by 
   * the rule in given index, and the correponding simple statistics
   * and predicted class distributions are stored in the given double array,
   * which can be obtained by getSimpleStats() and getDistributions().<br>
   * 
   * @param index the given index, assuming correct
   * @param insts the dataset to be covered by the rule
   * @param stats the given double array to hold stats, side-effected
   * @param dist the given array to hold class distributions, side-effected
   *             if null, the distribution is not necessary 
   * @return the instances covered and not covered by the rule
   */
  private Instances[] computeSimpleStats(int index, Instances insts, 
					 double[] stats, double[] dist){
    Rule rule = (Rule)m_Ruleset.elementAt(index);
	
    Instances[] data = new Instances[2];
    data[0] = new Instances(insts, insts.numInstances());
    data[1] = new Instances(insts, insts.numInstances());

    for(int i=0; i<insts.numInstances(); i++){
      Instance datum = insts.instance(i);
      double weight = datum.weight();
      if(rule.covers(datum)){
	data[0].add(datum);        // Covered by this rule
	stats[0] += weight;        // Coverage
	if((int)datum.classValue() == (int)rule.getConsequent())
	  stats[2] += weight;    // True positives
	else
	  stats[4] += weight;    // False positives
	if(dist != null)
	    dist[(int)datum.classValue()] += weight;
      }
      else{
	data[1].add(datum);        // Not covered by this rule
	stats[1] += weight; 
	if((int)datum.classValue() != (int)rule.getConsequent())
	  stats[3] += weight;    // True negatives
	else
	  stats[5] += weight;    // False negatives	    
      }
    }
    
    return data;
  }
    
    
  /** 
   * Add a rule to the ruleset and update the stats
   *
   * @param the rule to be added
   */
  public void addAndUpdate(Rule lastRule){
    if(m_Ruleset == null)
      m_Ruleset = new FastVector();
    m_Ruleset.addElement(lastRule);
  
    Instances data = (m_Filtered == null) ?
      m_Data : ((Instances[])m_Filtered.lastElement())[1];
    double[] stats = new double[6];
    double[] classCounts = new double[m_Data.classAttribute().numValues()];
    Instances[] filtered = 
	computeSimpleStats(m_Ruleset.size()-1, data, stats, classCounts);
    
    if(m_Filtered == null)
	m_Filtered = new FastVector();	
    m_Filtered.addElement(filtered);

    if(m_SimpleStats == null)
      m_SimpleStats = new FastVector();	
    m_SimpleStats.addElement(stats);

    if(m_Distributions == null)
	m_Distributions = new FastVector();
    m_Distributions.addElement(classCounts);
  }
    
    
  /**
   * Subset description length: <br>
   * S(t,k,p) = -k*log2(p)-(n-k)log2(1-p)
   *
   * Details see Quilan: "MDL and categorical theories (Continued)",ML95
   *
   * @param t the number of elements in a known set
   * @param k the number of elements in a subset
   * @param p the expected proportion of subset known by recipient
   */
  public static double subsetDL(double t, double k, double p){
    double rt = Utils.gr(p, 0.0) ? (- k*Utils.log2(p)) : 0.0;
    rt -= (t-k)*Utils.log2(1-p);
    return rt;
  }
    
    
  /** 
   * The description length of the theory for a given rule.  Computed as:<br>
   *                 0.5* [||k||+ S(t, k, k/t)]<br>
   * where k is the number of antecedents of the rule; t is the total
   * possible antecedents that could appear in a rule; ||K|| is the 
   * universal prior for k , log2*(k) and S(t,k,p) = -k*log2(p)-(n-k)log2(1-p)
   * is the subset encoding length.<p>
   *
   * Details see Quilan: "MDL and categorical theories (Continued)",ML95
   *
   * @param index the index of the given rule (assuming correct)
   * @exception if index out of range or object not initialized yet
   * @return the theory DL, weighted if weight != 1.0
   */
  public double theoryDL(int index){
	
    double k = ((Rule)m_Ruleset.elementAt(index)).size();
	
    if(k == 0)
      return 0.0;
	
    double tdl = Utils.log2(k);               	    
    if(k > 1)                           // Approximation
      tdl += 2.0 * Utils.log2(tdl);   // of log2 star	
    tdl += subsetDL(m_Total, k, k/m_Total);
    //System.out.println("!!!theory: "+MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl);
    return MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl;
  }
     

  /** 
   * The description length of data given the parameters of the data
   * based on the ruleset. <p>
   * Details see Quinlan: "MDL and categorical theories (Continued)",ML95<p>
   *
   * @param expFPOverErr expected FP/(FP+FN)
   * @param cover coverage
   * @param uncover uncoverage
   * @param fp False Positive
   * @param fn False Negative
   */
  public static double dataDL(double expFPOverErr, double cover, 
			      double uncover, double fp, double fn){
    double totalBits = Utils.log2(cover+uncover+1.0); // how many data?
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -