CfsSubsetEval.java
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * CfsSubsetEval.java
 * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
 */

package weka.attributeSelection;

import weka.core.Capabilities;
import weka.core.ContingencyTables;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;

import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * CfsSubsetEval :<br/>
 * <br/>
 * Evaluates the worth of a subset of attributes by considering the individual
 * predictive ability of each feature along with the degree of redundancy
 * between them.<br/>
 * <br/>
 * Subsets of features that are highly correlated with the class while having
 * low intercorrelation are preferred.<br/>
 * <br/>
 * For more information see:<br/>
 * <br/>
 * M. A. Hall (1998). Correlation-based Feature Subset Selection for Machine
 * Learning. Hamilton, New Zealand.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;phdthesis{Hall1998,
 *    address = {Hamilton, New Zealand},
 *    author = {M. A. Hall},
 *    school = {University of Waikato},
 *    title = {Correlation-based Feature Subset Selection for Machine Learning},
 *    year = {1998}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -M
 *  Treat missing values as a separate value.</pre>
 *
 * <pre> -L
 *  Don't include locally predictive attributes.</pre>
 <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 1.26 $
 * @see Discretize
 */
public class CfsSubsetEval
  extends SubsetEvaluator
  implements OptionHandler, TechnicalInformationHandler {
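  // Note (editorial sketch, not in the original source): the merit computed
  // by evaluateSubset() below corresponds to Hall's (1998) CFS heuristic.
  // For a subset of k features with mean feature-class correlation r_cf and
  // mean feature-feature intercorrelation r_ff,
  //
  //   Merit = (k * r_cf) / sqrt(k + k * (k - 1) * r_ff)
  //
  // The "num" and "denom" accumulators in evaluateSubset() build the
  // unaveraged numerator and the squared denominator of this ratio.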
  /** for serialization */
  static final long serialVersionUID = 747878400813276317L;

  /** The training instances */
  private Instances m_trainInstances;

  /** Discretise attributes when class is nominal */
  private Discretize m_disTransform;

  /** The class index */
  private int m_classIndex;

  /** Is the class numeric */
  private boolean m_isNumeric;

  /** Number of attributes in the training data */
  private int m_numAttribs;

  /** Number of instances in the training data */
  private int m_numInstances;

  /** Treat missing values as separate values */
  private boolean m_missingSeperate;

  /** Include locally predictive attributes */
  private boolean m_locallyPredictive;

  /** Holds the matrix of attribute correlations */
  //  private Matrix m_corr_matrix;
  private float[][] m_corr_matrix;

  /** Standard deviations of attributes (when using Pearson's correlation) */
  private double[] m_std_devs;

  /** Threshold for admitting locally predictive features */
  private double m_c_Threshold;

  /**
   * Returns a string describing this attribute evaluator
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes "
      + "by considering the individual predictive ability of each feature "
      + "along with the degree of redundancy between them.\n\n"
      + "Subsets of features that are highly correlated with the class "
      + "while having low intercorrelation are preferred.\n\n"
      + "For more information see:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.PHDTHESIS);
    result.setValue(Field.AUTHOR, "M. A. Hall");
    result.setValue(Field.YEAR, "1998");
    result.setValue(Field.TITLE, "Correlation-based Feature Subset Selection for Machine Learning");
    result.setValue(Field.SCHOOL, "University of Waikato");
    result.setValue(Field.ADDRESS, "Hamilton, New Zealand");

    return result;
  }

  /**
   * Constructor
   */
  public CfsSubsetEval() {
    resetOptions();
  }
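  // Usage sketch (assumed, not part of the original source): the evaluator is
  // normally driven by a search class from weka.attributeSelection, but can
  // also be called directly, e.g.
  //
  //   CfsSubsetEval eval = new CfsSubsetEval();
  //   eval.setMissingSeperate(false);    // distribute missing-value counts
  //   eval.setLocallyPredictive(true);   // keep locally predictive attributes
  //   eval.buildEvaluator(train);        // build correlation structures
  //   double merit = eval.evaluateSubset(subsetBits);
  //
  // where "train" (a class-labelled Instances) and "subsetBits" (a BitSet of
  // attribute indices) are hypothetical variables supplied by the caller.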
  /**
   * Returns an enumeration describing the available options.
   * @return an enumeration of all the available options.
   **/
  public Enumeration listOptions() {
    Vector newVector = new Vector(3);
    newVector.addElement(new Option("\tTreat missing values as a separate "
                                    + "value.", "M", 0, "-M"));
    newVector.addElement(new Option("\tDon't include locally predictive attributes"
                                    + ".", "L", 0, "-L"));
    return newVector.elements();
  }

  /**
   * Parses and sets a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -M
   *  Treat missing values as a separate value.</pre>
   *
   * <pre> -L
   *  Don't include locally predictive attributes.</pre>
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   **/
  public void setOptions(String[] options) throws Exception {
    resetOptions();
    setMissingSeperate(Utils.getFlag('M', options));
    setLocallyPredictive(!Utils.getFlag('L', options));
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String locallyPredictiveTipText() {
    return "Identify locally predictive attributes. Iteratively adds "
      + "attributes with the highest correlation with the class as long "
      + "as there is not already an attribute in the subset that has a "
      + "higher correlation with the attribute in question.";
  }

  /**
   * Include locally predictive attributes
   *
   * @param b true or false
   */
  public void setLocallyPredictive(boolean b) {
    m_locallyPredictive = b;
  }

  /**
   * Return true if including locally predictive attributes
   *
   * @return true if locally predictive attributes are to be used
   */
  public boolean getLocallyPredictive() {
    return m_locallyPredictive;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String missingSeperateTipText() {
    return "Treat missing as a separate value. Otherwise, counts for missing "
      + "values are distributed across other values in proportion to their "
      + "frequency.";
  }

  /**
   * Treat missing as a separate value
   *
   * @param b true or false
   */
  public void setMissingSeperate(boolean b) {
    m_missingSeperate = b;
  }

  /**
   * Return true if missing is treated as a separate value
   *
   * @return true if missing is to be treated as a separate value
   */
  public boolean getMissingSeperate() {
    return m_missingSeperate;
  }

  /**
   * Gets the current settings of CfsSubsetEval
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] options = new String[2];
    int current = 0;

    if (getMissingSeperate()) {
      options[current++] = "-M";
    }

    if (!getLocallyPredictive()) {
      options[current++] = "-L";
    }

    while (current < options.length) {
      options[current++] = "";
    }

    return options;
  }

  /**
   * Returns the capabilities of this evaluator.
   *
   * @return the capabilities of this evaluator
   * @see Capabilities
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.NUMERIC_CLASS);
    result.enable(Capability.DATE_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);

    return result;
  }
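  // Implementation note on the structures set up in buildEvaluator() below:
  // m_corr_matrix is stored as a lower-triangular matrix (row i holds i + 1
  // floats), initialised to 1.0f on the diagonal and to the sentinel -999
  // elsewhere. Correlations are computed lazily: evaluateSubset() only calls
  // correlate() the first time a pair appears in a candidate subset. For
  // example, with three attributes the matrix starts as
  //
  //   row 0: [ 1.0 ]
  //   row 1: [ -999, 1.0 ]
  //   row 2: [ -999, -999, 1.0 ]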
  /**
   * Generates an attribute evaluator. Has to initialize all fields of the
   * evaluator that are not being set via options.
   *
   * CFS also discretises attributes (if necessary) and initializes
   * the correlation matrix.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the evaluator has not been
   * generated successfully
   */
  public void buildEvaluator(Instances data) throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    m_trainInstances = new Instances(data);
    m_trainInstances.deleteWithMissingClass();
    m_classIndex = m_trainInstances.classIndex();
    m_numAttribs = m_trainInstances.numAttributes();
    m_numInstances = m_trainInstances.numInstances();
    m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric();

    if (!m_isNumeric) {
      m_disTransform = new Discretize();
      m_disTransform.setUseBetterEncoding(true);
      m_disTransform.setInputFormat(m_trainInstances);
      m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
    }

    m_std_devs = new double[m_numAttribs];
    m_corr_matrix = new float[m_numAttribs][];
    for (int i = 0; i < m_numAttribs; i++) {
      m_corr_matrix[i] = new float[i + 1];
    }

    for (int i = 0; i < m_corr_matrix.length; i++) {
      m_corr_matrix[i][i] = 1.0f;
      m_std_devs[i] = 1.0;
    }

    for (int i = 0; i < m_numAttribs; i++) {
      for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
        m_corr_matrix[i][j] = -999;
      }
    }
  }

  /**
   * evaluates a subset of attributes
   *
   * @param subset a bitset representing the attribute subset to be
   * evaluated
   * @return the merit
   * @throws Exception if the subset could not be evaluated
   */
  public double evaluateSubset(BitSet subset) throws Exception {
    double num = 0.0;
    double denom = 0.0;
    float corr;
    int larger, smaller;

    // do numerator
    for (int i = 0; i < m_numAttribs; i++) {
      if (i != m_classIndex) {
        if (subset.get(i)) {
          // index into the lower triangle: the row is the larger index
          if (i > m_classIndex) {
            larger = i;
            smaller = m_classIndex;
          } else {
            smaller = i;
            larger = m_classIndex;
          }
          /* int larger = (i > m_classIndex ? i : m_classIndex);
             int smaller = (i > m_classIndex ? m_classIndex : i); */
          if (m_corr_matrix[larger][smaller] == -999) {
            corr = correlate(i, m_classIndex);
            m_corr_matrix[larger][smaller] = corr;
            num += (m_std_devs[i] * corr);
          } else {
            num += (m_std_devs[i] * m_corr_matrix[larger][smaller]);
          }
        }
      }
    }

    // do denominator
    for (int i = 0; i < m_numAttribs; i++) {
      if (i != m_classIndex) {
        if (subset.get(i)) {
          denom += (1.0 * m_std_devs[i] * m_std_devs[i]);

          for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
            if (subset.get(j)) {
              if (m_corr_matrix[i][j] == -999) {
                corr = correlate(i, j);
                m_corr_matrix[i][j] = corr;
                denom += (2.0 * m_std_devs[i] * m_std_devs[j] * corr);
              } else {
                denom += (2.0 * m_std_devs[i] * m_std_devs[j] * m_corr_matrix[i][j]);
              }
            }
          }
        }
      }
    }

    if (denom < 0.0) {
      denom *= -1.0;
    }

    if (denom == 0.0) {
      return (0.0);
    }

    double merit = (num / Math.sqrt(denom));

    if (merit < 0.0) {
      merit *= -1.0;
    }

    return merit;
  }

  /**
   * Computes the correlation between two attributes, dispatching on the
   * class type and the types of the two attributes.
   *
   * @param att1 index of the first attribute
   * @param att2 index of the second attribute
   * @return the correlation between the two attributes
   */
  private float correlate(int att1, int att2) {
    if (!m_isNumeric) {
      return (float) symmUncertCorr(att1, att2);
    }

    boolean att1_is_num = (m_trainInstances.attribute(att1).isNumeric());
    boolean att2_is_num = (m_trainInstances.attribute(att2).isNumeric());

    if (att1_is_num && att2_is_num) {
      return (float) num_num(att1, att2);
    } else if (att2_is_num) {
      return (float) num_nom2(att1, att2);
    } else if (att1_is_num) {
      return (float) num_nom2(att2, att1);
    }

    return (float) nom_nom(att1, att2);
  }
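  // Background note: when the class is nominal, all attributes have been
  // discretised by buildEvaluator(), and the correlation used is symmetric
  // uncertainty, a normalised mutual information computed from the
  // contingency table built below (the weka.core.ContingencyTables import
  // suggests the entropy terms come from that class):
  //
  //   SU(X, Y) = 2 * [H(X) + H(Y) - H(X, Y)] / [H(X) + H(Y)]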
  /**
   * Computes the symmetrical uncertainty between two nominal attributes,
   * either treating missing as a separate value or distributing its counts
   * across the other values.
   *
   * @param att1 index of the first attribute
   * @param att2 index of the second attribute
   * @return the symmetrical uncertainty between the two attributes
   */
  private double symmUncertCorr(int att1, int att2) {
    int i, j, k, ii, jj;
    int ni, nj;
    double sum = 0.0;
    double sumi[], sumj[];
    double counts[][];
    Instance inst;
    double corr_measure;
    boolean flag = false;
    double temp = 0.0;

    if (att1 == m_classIndex || att2 == m_classIndex) {
      flag = true;
    }

    // one extra row/column holds the counts for missing values
    ni = m_trainInstances.attribute(att1).numValues() + 1;
    nj = m_trainInstances.attribute(att2).numValues() + 1;
    counts = new double[ni][nj];
    sumi = new double[ni];
    sumj = new double[nj];

    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
        sumj[j] = 0.0;
        counts[i][j] = 0.0;
      }
    }

    // Fill the contingency table
    for (i = 0; i < m_numInstances; i++) {
      inst = m_trainInstances.instance(i);

      if (inst.isMissing(att1)) {
        ii = ni - 1;
      } else {
        ii = (int) inst.value(att1);
      }

      if (inst.isMissing(att2)) {
        jj = nj - 1;
      } else {
        jj = (int) inst.value(att2);
      }

      counts[ii][jj]++;
    }

    // get the row totals
    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
        sumi[i] += counts[i][j];
        sum += counts[i][j];
      }
    }

    // get the column totals
    for (j = 0; j < nj; j++) {
      sumj[j] = 0.0;

      for (i = 0; i < ni; i++) {
        sumj[j] += counts[i][j];
      }
    }

    // distribute missing counts
    if (!m_missingSeperate
        && (sumi[ni - 1] < m_numInstances)
        && (sumj[nj - 1] < m_numInstances)) {
      double[] i_copy = new double[sumi.length];
      double[] j_copy = new double[sumj.length];
      double[][] counts_copy = new double[sumi.length][sumj.length];

      for (i = 0; i < ni; i++) {
        System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
      }

      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
      double total_missing = (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);

      // do the missing i's
      if (sumi[ni - 1] > 0.0) {
        for (j = 0; j < nj - 1; j++) {
          if (counts[ni - 1][j] > 0.0) {
            for (i = 0; i < ni - 1; i++) {
              temp = ((i_copy[i] / (sum - i_copy[ni - 1])) * counts[ni - 1][j]);
              counts[i][j] += temp;
              sumi[i] += temp;
            }

            counts[ni - 1][j] = 0.0;
          }
        }
      }

      sumi[ni - 1] = 0.0;