⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cfssubseteval.java

📁 一个数据挖掘系统的源码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:

/**
 *
 *   AgentAcademy - an open source Data Mining framework for
 *   training intelligent agents
 *
 *   Copyright (C)   2001-2003 AA Consortium.
 *
 *   This library is open source software; you can redistribute it
 *   and/or modify it under the terms of the GNU Lesser General
 *   Public License as published by the Free Software Foundation;
 *   either version 2.0 of the License, or (at your option) any later
 *   version.
 *
 *   This library is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU Lesser General Public
 *   License along with this library; if not, write to the Free
 *   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *   MA  02111-1307 USA
 *
 */

package  org.agentacademy.modules.dataminer.attributeSelection;

import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;

import org.agentacademy.modules.dataminer.core.ContingencyTables;
import org.agentacademy.modules.dataminer.core.Instance;
import org.agentacademy.modules.dataminer.core.Instances;
import org.agentacademy.modules.dataminer.core.Matrix;
import org.agentacademy.modules.dataminer.core.Option;
import org.agentacademy.modules.dataminer.core.OptionHandler;
import org.agentacademy.modules.dataminer.core.Utils;
import org.agentacademy.modules.dataminer.filters.DiscretizeFilter;
import org.agentacademy.modules.dataminer.filters.Filter;
import org.apache.log4j.Logger;

/**
 * CFS attribute subset evaluator.
 * For more information see: <p>
 *
 * Hall, M. A. (1998). Correlation-based Feature Subset Selection for Machine
 * Learning. Thesis submitted in partial fulfilment of the requirements of the
 * degree of Doctor of Philosophy at the University of Waikato. <p>
 *
 * Valid options are:
 *
 * -M <br>
 * Treat missing values as a separate value. <p>
 *
 * -L <br>
 * Include locally predictive attributes. <p>
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 1.3 $
 */
public class CfsSubsetEval
  extends SubsetEvaluator
  implements OptionHandler
{

 public static Logger                log = Logger.getLogger(CfsSubsetEval.class);
  /** The training instances (replaced by their discretised form in buildEvaluator when the class is nominal) */
  private Instances m_trainInstances;
  /** Discretises numeric attributes when the class is nominal */
  private DiscretizeFilter m_disTransform;
  /** The class index */
  private int m_classIndex;
  /** Is the class numeric */
  private boolean m_isNumeric;
  /** Number of attributes in the training data */
  private int m_numAttribs;
  /** Number of instances in the training data */
  private int m_numInstances;
  /** Treat missing values as separate values (option -M) */
  private boolean m_missingSeperate;
  /** Include locally predictive attributes (option -L) */
  private boolean m_locallyPredictive;
  /** Cached matrix of pairwise attribute correlations; -999 marks an entry not yet computed */
  private Matrix m_corr_matrix;
  /** Standard deviations of attributes (when using Pearson's correlation) */
  private double[] m_std_devs;
  /** Threshold for admitting locally predictive features */
  private double m_c_Threshold;

  /**
   * Returns a string describing this attribute evaluator.
   *
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    final String description =
      "CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes "
      + "by considering the individual predictive ability of each feature "
      + "along with the degree of redundancy between them.\n\n"
      + "Subsets of features that are highly correlated with the class "
      + "while having low intercorrelation are preferred.\n";
    return description;
  }

  /**
   * Constructs a new evaluator with all options at their default values.
   */
  public CfsSubsetEval () {
    resetOptions(); // establish the default option settings
  }


  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   *
   **/
  public Enumeration listOptions () {
    // Exactly two options are exposed: -M and -L (capacity matches).
    Vector newVector = new Vector(2);
    newVector.addElement(new Option("\tTreat missing values as a seperate"
				    + "\n\tvalue.", "M", 0, "-M"));
    newVector.addElement(new Option("\tInclude locally predictive attributes"
				    + ".", "L", 0, "-L"));
    return  newVector.elements();
  }


  /**
   * Parses and sets a given list of options. <p>
   *
   * Valid options are:
   *
   * -M <br>
   * Treat missing values as a separate value. <p>
   *
   * -L <br>
   * Include locally predictive attributes. <p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   *
   **/
  public void setOptions (String[] options)
    throws Exception
  {
    // Reset to defaults first so that flags absent from the array revert.
    resetOptions();
    setMissingSeperate(Utils.getFlag('M', options));
    setLocallyPredictive(Utils.getFlag('L', options));
  }

  /**
   * Returns the tip text for the locallyPredictive property.
   *
   * @return tip text suitable for displaying in the
   * explorer/experimenter gui
   */
  public String locallyPredictiveTipText() {
    final String tip =
      "Identify locally predictive attributes. Iteratively adds "
      + "attributes with the highest correlation with the class as long "
      + "as there is not already an attribute in the subset that has a "
      + "higher correlation with the attribute in question";
    return tip;
  }

  /**
   * Sets whether locally predictive attributes are to be included.
   *
   * @param b true to include locally predictive attributes
   */
  public void setLocallyPredictive (boolean b) {
    this.m_locallyPredictive = b;
  }


  /**
   * Reports whether locally predictive attributes are being included.
   *
   * @return true if locally predictive attributes are to be used
   */
  public boolean getLocallyPredictive () {
    return this.m_locallyPredictive;
  }

  /**
   * Returns the tip text for the missingSeperate property.
   *
   * @return tip text suitable for displaying in the
   * explorer/experimenter gui
   */
  public String missingSeperateTipText() {
    final String tip =
      "Treat missing as a separate value. Otherwise, counts for missing "
      + "values are distributed across other values in proportion to their "
      + "frequency.";
    return tip;
  }

  /**
   * Sets whether missing is treated as a separate value.
   *
   * @param b true to treat missing as a separate value
   */
  public void setMissingSeperate (boolean b) {
    this.m_missingSeperate = b;
  }


  /**
   * Reports whether missing is treated as a separate value.
   *
   * @return true if missing is to be treated as a separate value
   */
  public boolean getMissingSeperate () {
    return this.m_missingSeperate;
  }


  /**
   * Gets the current settings of CfsSubsetEval.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions () {
    String[] opts = new String[2];
    int count = 0;

    if (getMissingSeperate()) {
      opts[count++] = "-M";
    }
    if (getLocallyPredictive()) {
      opts[count++] = "-L";
    }

    // Pad any unused slots with empty strings so the array has a fixed length.
    for (int i = count; i < opts.length; i++) {
      opts[i] = "";
    }

    return opts;
  }


  /**
   * Generates the attribute evaluator. Has to initialize all fields of the
   * evaluator that are not being set via options.
   *
   * CFS also discretises attributes (if necessary) and initializes
   * the correlation matrix.
   *
   * @param data set of instances serving as training data
   * @exception Exception if the evaluator has not been
   * generated successfully
   */
  public void buildEvaluator (Instances data)
    throws Exception
  {
    if (data.checkForStringAttributes()) {
      throw new Exception("Can't handle string attributes!");
    }

    m_trainInstances = data;
    m_trainInstances.deleteWithMissingClass();
    m_classIndex = m_trainInstances.classIndex();
    m_numAttribs = m_trainInstances.numAttributes();
    m_numInstances = m_trainInstances.numInstances();
    m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric();

    // Nominal class: discretise the attributes before computing correlations.
    if (!m_isNumeric) {
      m_disTransform = new DiscretizeFilter();
      m_disTransform.setUseBetterEncoding(true);
      m_disTransform.setInputFormat(m_trainInstances);
      m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
    }

    m_std_devs = new double[m_numAttribs];
    m_corr_matrix = new Matrix(m_numAttribs, m_numAttribs);

    // Single pass over the (square) matrix: ones on the diagonal,
    // the -999 "not yet computed" sentinel everywhere else.
    for (int i = 0; i < m_numAttribs; i++) {
      m_corr_matrix.setElement(i, i, 1.0);
      m_std_devs[i] = 1.0;
      for (int j = i + 1; j < m_numAttribs; j++) {
        m_corr_matrix.setElement(i, j, -999);
        m_corr_matrix.setElement(j, i, -999);
      }
    }
  }


  /**
   * Evaluates a subset of attributes with the CFS merit:
   * the sum of attribute-class correlations divided by the square root of
   * the sum of pairwise attribute-attribute correlations.
   *
   * @param subset a bitset representing the attribute subset to be
   * evaluated
   * @return the merit of the subset
   * @exception Exception if the subset could not be evaluated
   */
  public double evaluateSubset (BitSet subset)
    throws Exception
  {
    double num = 0.0;
    double denom = 0.0;

    // Numerator: correlation of each member attribute with the class.
    // Correlations are computed lazily; -999 in the matrix means
    // "not yet computed" and the result is cached symmetrically.
    for (int i = 0; i < m_numAttribs; i++) {
      if (i == m_classIndex || !subset.get(i)) {
        continue;
      }
      double corr = m_corr_matrix.getElement(i, m_classIndex);
      if (corr == -999) {
        corr = correlate(i, m_classIndex);
        m_corr_matrix.setElement(i, m_classIndex, corr);
        m_corr_matrix.setElement(m_classIndex, i, corr);
      }
      num += (m_std_devs[i] * corr);
    }

    // Denominator: each member contributes its own variance term plus
    // twice its correlation with every later member of the subset.
    for (int i = 0; i < m_numAttribs; i++) {
      if (i == m_classIndex || !subset.get(i)) {
        continue;
      }
      denom += (1.0 * m_std_devs[i] * m_std_devs[i]);

      for (int j = i + 1; j < m_numAttribs; j++) {
        if (!subset.get(j)) {
          continue;
        }
        double corr = m_corr_matrix.getElement(i, j);
        if (corr == -999) {
          corr = correlate(i, j);
          m_corr_matrix.setElement(i, j, corr);
          m_corr_matrix.setElement(j, i, corr);
        }
        denom += (2.0 * m_std_devs[i] * m_std_devs[j] * corr);
      }
    }

    // Merit is reported as a magnitude.
    if (denom < 0.0) {
      denom *= -1.0;
    }
    if (denom == 0.0) {
      return (0.0);
    }

    double merit = (num / Math.sqrt(denom));
    if (merit < 0.0) {
      merit *= -1.0;
    }
    return merit;
  }


  /**
   * Computes the correlation between two attributes, dispatching on
   * whether the class is numeric and on the types of the two attributes.
   *
   * @param att1 index of the first attribute
   * @param att2 index of the second attribute
   * @return the correlation between att1 and att2
   */
  private double correlate (int att1, int att2) {
    // Nominal class: attributes were discretised in buildEvaluator,
    // so symmetrical uncertainty applies throughout.
    if (!m_isNumeric) {
      return symmUncertCorr(att1, att2);
    }

    boolean firstIsNumeric = m_trainInstances.attribute(att1).isNumeric();
    boolean secondIsNumeric = m_trainInstances.attribute(att2).isNumeric();

    if (firstIsNumeric && secondIsNumeric) {
      return num_num(att1, att2);
    }
    if (secondIsNumeric) {
      // att1 nominal, att2 numeric
      return num_nom2(att1, att2);
    }
    if (firstIsNumeric) {
      // att1 numeric, att2 nominal: swap the argument order
      return num_nom2(att2, att1);
    }
    // both nominal
    return nom_nom(att1, att2);
  }


  private double symmUncertCorr (int att1, int att2) {
    int i, j, k, ii, jj;
    int nnj, nni, ni, nj;
    double sum = 0.0;
    double sumi[], sumj[];
    double counts[][];
    Instance inst;
    double corr_measure;
    boolean flag = false;
    double temp = 0.0;

    if (att1 == m_classIndex || att2 == m_classIndex) {
      flag = true;
    }

    ni = m_trainInstances.attribute(att1).numValues() + 1;
    nj = m_trainInstances.attribute(att2).numValues() + 1;
    counts = new double[ni][nj];
    sumi = new double[ni];
    sumj = new double[nj];

    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
	sumj[j] = 0.0;
	counts[i][j] = 0.0;
      }
    }

    // Fill the contingency table
    for (i = 0; i < m_numInstances; i++) {
      inst = m_trainInstances.instance(i);

      if (inst.isMissing(att1)) {
	ii = ni - 1;
      }
      else {
	ii = (int)inst.value(att1);
      }

      if (inst.isMissing(att2)) {
	jj = nj - 1;
      }
      else {
	jj = (int)inst.value(att2);
      }

      counts[ii][jj]++;
    }

    // get the row totals
    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
	sumi[i] += counts[i][j];
	sum += counts[i][j];
      }
    }

    // get the column totals
    for (j = 0; j < nj; j++) {
      sumj[j] = 0.0;

      for (i = 0; i < ni; i++) {
	sumj[j] += counts[i][j];
      }
    }

    // distribute missing counts
    if (!m_missingSeperate &&
	(sumi[ni-1] < m_numInstances) &&
	(sumj[nj-1] < m_numInstances)) {
      double[] i_copy = new double[sumi.length];
      double[] j_copy = new double[sumj.length];
      double[][] counts_copy = new double[sumi.length][sumj.length];

      for (i = 0; i < ni; i++) {
	System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
      }

      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
      double total_missing =
	(sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);

      // do the missing i's
      if (sumi[ni - 1] > 0.0) {
	for (j = 0; j < nj - 1; j++) {
	  if (counts[ni - 1][j] > 0.0) {
	    for (i = 0; i < ni - 1; i++) {
	      temp = ((i_copy[i]/(sum - i_copy[ni - 1]))*counts[ni - 1][j]);
	      counts[i][j] += temp;
	      sumi[i] += temp;
	    }

	    counts[ni - 1][j] = 0.0;
	  }
	}
      }

      sumi[ni - 1] = 0.0;

      // do the missing j's
      if (sumj[nj - 1] > 0.0) {
	for (i = 0; i < ni - 1; i++) {
	  if (counts[i][nj - 1] > 0.0) {
	    for (j = 0; j < nj - 1; j++) {
	      temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
	      counts[i][j] += temp;
	      sumj[j] += temp;
	    }

	    counts[i][nj - 1] = 0.0;
	  }
	}
      }

      sumj[nj - 1] = 0.0;

      // do the both missing
      if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
	for (i = 0; i < ni - 1; i++) {
	  for (j = 0; j < nj - 1; j++) {
	    temp = (counts_copy[i][j]/(sum - total_missing)) *
	      counts_copy[ni - 1][nj - 1];

	    counts[i][j] += temp;
	    sumi[i] += temp;
	    sumj[j] += temp;
	  }
	}

	counts[ni - 1][nj - 1] = 0.0;
      }
    }

    // corr_measure = Correlate.symm_uncert(counts,sumi,sumj,sum,ni,nj,flag);
    corr_measure = ContingencyTables.symmetricalUncertainty(counts);

    // corr_measure = ContingencyTables.gainRatio(counts);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -