⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 infogainattributeeval.java

📁 一个数据挖掘系统的源码
💻 JAVA
字号:

/**
 *
 *   AgentAcademy - an open source Data Mining framework for
 *   training intelligent agents
 *
 *   Copyright (C)   2001-2003 AA Consortium.
 *
 *   This library is open source software; you can redistribute it
 *   and/or modify it under the terms of the GNU Lesser General
 *   Public License as published by the Free Software Foundation;
 *   either version 2.0 of the License, or (at your option) any later
 *   version.
 *
 *   This library is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU Lesser General Public
 *   License along with this library; if not, write to the Free
 *   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *   MA  02111-1307 USA
 *
 */

package  org.agentacademy.modules.dataminer.attributeSelection;

import java.util.Enumeration;
import java.util.Vector;

import org.agentacademy.modules.dataminer.core.ContingencyTables;
import org.agentacademy.modules.dataminer.core.Instance;
import org.agentacademy.modules.dataminer.core.Instances;
import org.agentacademy.modules.dataminer.core.Option;
import org.agentacademy.modules.dataminer.core.OptionHandler;
import org.agentacademy.modules.dataminer.core.Utils;
import org.agentacademy.modules.dataminer.filters.DiscretizeFilter;
import org.agentacademy.modules.dataminer.filters.Filter;
import org.agentacademy.modules.dataminer.filters.NumericToBinaryFilter;
import org.apache.log4j.Logger;

/**
 * Class for Evaluating attributes individually by measuring information gain
 * with respect to the class.
 *
 * Valid options are:<p>
 *
 * -M <br>
 * Treat missing values as a seperate value. <br>
 *
 * -B <br>
 * Just binarize numeric attributes instead of properly discretizing them. <br>
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 1.3 $
 */
public class InfoGainAttributeEval
  extends AttributeEvaluator
  implements OptionHandler {

 public static Logger                log = Logger.getLogger(InfoGainAttributeEval.class);
/** Treat missing values as a seperate value */
  private boolean m_missing_merge;

  /** Just binarize numeric attributes */
  private boolean m_Binarize;

  /** The info gain for each attribute */
  private double[] m_InfoGains;

  /**
   * Returns a string describing this attribute evaluator
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "InfoGainAttributeEval :\n\nEvaluates the worth of an attribute "
      +"by measuring the information gain with respect to the class.\n\n"
      +"InfoGain(Class,Attribute) = H(Class) - H(Class | Attribute).\n";
  }

  /**
   * Constructor
   */
  public InfoGainAttributeEval () {
    resetOptions();
  }

  /**
   * Returns an enumeration describing the available options.
   * @return an enumeration of all the available options.
   **/
  public Enumeration listOptions () {
    Vector newVector = new Vector(2);
    newVector.addElement(new Option("\ttreat missing values as a seperate "
				    + "value.", "M", 0, "-M"));
    newVector.addElement(new Option("\tjust binarize numeric attributes instead\n "
				    +"\tof properly discretizing them.", "B", 0,
				    "-B"));
    return  newVector.elements();
  }


  /**
   * Parses a given list of options. <p>
   *
   * Valid options are:<p>
   *
   * -M <br>
   * Treat missing values as a seperate value. <br>
   *
   * -B <br>
   * Just binarize numeric attributes instead of properly discretizing them. <br>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   *
   **/
  public void setOptions (String[] options)
    throws Exception {

    resetOptions();
    setMissingMerge(!(Utils.getFlag('M', options)));
    setBinarizeNumericAttributes(Utils.getFlag('B', options));
  }


  /**
   * Gets the current settings of WrapperSubsetEval.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions () {
    String[] options = new String[2];
    int current = 0;

    if (!getMissingMerge()) {
      options[current++] = "-M";
    }
    if (getBinarizeNumericAttributes()) {
      options[current++] = "-B";
    }

    while (current < options.length) {
      options[current++] = "";
    }

    return  options;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String binarizeNumericAttributesTipText() {
    return "Just binarize numeric attributes instead of properly discretizing them.";
  }

  /**
   * Binarize numeric attributes.
   *
   * @param b true=binarize numeric attributes
   */
  public void setBinarizeNumericAttributes (boolean b) {
    m_Binarize = b;
  }


  /**
   * get whether numeric attributes are just being binarized.
   *
   * @return true if missing values are being distributed.
   */
  public boolean getBinarizeNumericAttributes () {
    return  m_Binarize;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String missingMergeTipText() {
    return "Distribute counts for missing values. Counts are distributed "
      +"across other values in proportion to their frequency. Otherwise, "
      +"missing is treated as a separate value.";
  }

  /**
   * distribute the counts for missing values across observed values
   *
   * @param b true=distribute missing values.
   */
  public void setMissingMerge (boolean b) {
    m_missing_merge = b;
  }


  /**
   * get whether missing values are being distributed or not
   *
   * @return true if missing values are being distributed.
   */
  public boolean getMissingMerge () {
    return  m_missing_merge;
  }


  /**
   * Initializes an information gain attribute evaluator.
   * Discretizes all attributes that are numeric.
   *
   * @param data set of instances serving as training data
   * @exception Exception if the evaluator has not been
   * generated successfully
   */
  public void buildEvaluator (Instances data)
    throws Exception {

    if (data.checkForStringAttributes()) {
      throw  new Exception("Can't handle string attributes!");
    }

    int classIndex = data.classIndex();
    if (data.attribute(classIndex).isNumeric()) {
      throw  new Exception("Class must be nominal!");
    }
    int numInstances = data.numInstances();

    if (!m_Binarize) {
      DiscretizeFilter disTransform = new DiscretizeFilter();
      disTransform.setUseBetterEncoding(true);
      disTransform.setInputFormat(data);
      data = Filter.useFilter(data, disTransform);
    } else {
      NumericToBinaryFilter binTransform = new NumericToBinaryFilter();
      binTransform.setInputFormat(data);
      data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve space and initialize counters
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
      if (k != classIndex) {
	int numValues = data.attribute(k).numValues();
	counts[k] = new double[numValues + 1][numClasses + 1];
      }
    }

    // Initialize counters
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
      Instance inst = data.instance(k);
      if (inst.classIsMissing()) {
	temp[numClasses] += inst.weight();
      } else {
	temp[(int)inst.classValue()] += inst.weight();
      }
    }
    for (int k = 0; k < counts.length; k++) {
      if (k != classIndex) {
	for (int i = 0; i < temp.length; i++) {
	  counts[k][0][i] = temp[i];
	}
      }
    }

    // Get counts
    for (int k = 0; k < numInstances; k++) {
      Instance inst = data.instance(k);
      for (int i = 0; i < inst.numValues(); i++) {
	if (inst.index(i) != classIndex) {
	  if (inst.isMissingSparse(i) || inst.classIsMissing()) {
	    if (!inst.isMissingSparse(i)) {
	      counts[inst.index(i)][(int)inst.valueSparse(i)][numClasses] +=
		inst.weight();
	      counts[inst.index(i)][0][numClasses] -= inst.weight();
	    } else if (!inst.classIsMissing()) {
	      counts[inst.index(i)][data.attribute(inst.index(i)).numValues()]
		[(int)inst.classValue()] += inst.weight();
	      counts[inst.index(i)][0][(int)inst.classValue()] -=
		inst.weight();
	    } else {
	      counts[inst.index(i)][data.attribute(inst.index(i)).numValues()]
		[numClasses] += inst.weight();
	      counts[inst.index(i)][0][numClasses] -= inst.weight();
	    }
	  } else {
	    counts[inst.index(i)][(int)inst.valueSparse(i)]
	      [(int)inst.classValue()] += inst.weight();
	    counts[inst.index(i)][0][(int)inst.classValue()] -= inst.weight();
	  }
	}
      }
    }

    // distribute missing counts if required
    if (m_missing_merge) {

      for (int k = 0; k < data.numAttributes(); k++) {
	if (k != classIndex) {
	  int numValues = data.attribute(k).numValues();

	  // Compute marginals
	  double[] rowSums = new double[numValues];
	  double[] columnSums = new double[numClasses];
	  double sum = 0;
	  for (int i = 0; i < numValues; i++) {
	    for (int j = 0; j < numClasses; j++) {
	      rowSums[i] += counts[k][i][j];
	      columnSums[j] += counts[k][i][j];
	    }
	    sum += rowSums[i];
	  }

	  if (Utils.gr(sum, 0)) {
	    double[][] additions = new double[numValues][numClasses];

	    // Compute what needs to be added to each row
	    for (int i = 0; i < numValues; i++) {
	      for (int j = 0; j  < numClasses; j++) {
		additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
	      }
	    }

	    // Compute what needs to be added to each column
	    for (int i = 0; i < numClasses; i++) {
	      for (int j = 0; j  < numValues; j++) {
		additions[j][i] += (columnSums[i] / sum) *
		  counts[k][j][numClasses];
	      }
	    }

	    // Compute what needs to be added to each cell
	    for (int i = 0; i < numClasses; i++) {
	      for (int j = 0; j  < numValues; j++) {
		additions[j][i] += (counts[k][j][i] / sum) *
		  counts[k][numValues][numClasses];
	      }
	    }

	    // Make new contingency table
	    double[][] newTable = new double[numValues][numClasses];
	    for (int i = 0; i < numValues; i++) {
	      for (int j = 0; j < numClasses; j++) {
		newTable[i][j] = counts[k][i][j] + additions[i][j];
	      }
	    }
	    counts[k] = newTable;
	  }
	}
      }
    }

    // Compute info gains
    m_InfoGains = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
      if (i != classIndex) {
	m_InfoGains[i] =
	  (ContingencyTables.entropyOverColumns(counts[i])
	   - ContingencyTables.entropyConditionedOnRows(counts[i]));
      }
    }
  }

  /**
   * Reset options to their default values
   */
  protected void resetOptions () {
    m_InfoGains = null;
    m_missing_merge = true;
    m_Binarize = false;
  }


  /**
   * evaluates an individual attribute by measuring the amount
   * of information gained about the class given the attribute.
   *
   * @param attribute the index of the attribute to be evaluated
   * @exception Exception if the attribute could not be evaluated
   */
  public double evaluateAttribute (int attribute)
    throws Exception {

    return m_InfoGains[attribute];
  }

  /**
   * Describe the attribute evaluator
   * @return a description of the attribute evaluator as a string
   */
  public String toString () {
    StringBuffer text = new StringBuffer();

    if (m_InfoGains == null) {
      text.append("Information Gain attribute evaluator has not been built");
    }
    else {
      text.append("\tInformation Gain Ranking Filter");
      if (!m_missing_merge) {
	text.append("\n\tMissing values treated as seperate");
      }
      if (m_Binarize) {
	text.append("\n\tNumeric attributes are just binarized");
      }
    }

    text.append("\n");
    return  text.toString();
  }


  // ============
  // Test method.
  // ============
  /**
   * Main method for testing this class.
   *
   * @param argv the options
   */
  public static void main (String[] args) {
    try {
      System.out.println(AttributeSelection.
			 SelectAttributes(new InfoGainAttributeEval(), args));
    }
    catch (Exception e) {
      log.error(e.getStackTrace().toString());
      log.error(e.getMessage());
    }
  }
}


⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -