CfsSubsetEval.java
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * CfsSubsetEval.java
 * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
 */

package weka.attributeSelection;

import weka.core.Capabilities;
import weka.core.ContingencyTables;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;

import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * CfsSubsetEval :<br/>
 * <br/>
 * Evaluates the worth of a subset of attributes by considering the individual
 * predictive ability of each feature along with the degree of redundancy
 * between them.<br/>
 * <br/>
 * Subsets of features that are highly correlated with the class while having
 * low intercorrelation are preferred.<br/>
 * <br/>
 * For more information see:<br/>
 * <br/>
 * M. A. Hall (1998). Correlation-based Feature Subset Selection for Machine
 * Learning. Hamilton, New Zealand.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;phdthesis{Hall1998,
 *    address = {Hamilton, New Zealand},
 *    author = {M. A. Hall},
 *    school = {University of Waikato},
 *    title = {Correlation-based Feature Subset Selection for Machine Learning},
 *    year = {1998}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -M
 *  Treat missing values as a separate value.</pre>
 *
 * <pre> -L
 *  Don't include locally predictive attributes.</pre>
 <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 1.26 $
 * @see Discretize
 */
public class CfsSubsetEval
  extends SubsetEvaluator
  implements OptionHandler, TechnicalInformationHandler {
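  // Note (editorial sketch, not in the original source): the merit computed
  // by evaluateSubset() below corresponds to Hall's (1998) CFS heuristic.
  // For a subset of k features with mean feature-class correlation r_cf and
  // mean feature-feature intercorrelation r_ff,
  //
  //   Merit = (k * r_cf) / sqrt(k + k * (k - 1) * r_ff)
  //
  // The "num" and "denom" accumulators in evaluateSubset() build the
  // unaveraged numerator and the squared denominator of this ratio.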
  /** for serialization */
  static final long serialVersionUID = 747878400813276317L;

  /** The training instances */
  private Instances m_trainInstances;

  /** Discretise attributes when class is nominal */
  private Discretize m_disTransform;

  /** The class index */
  private int m_classIndex;

  /** Is the class numeric */
  private boolean m_isNumeric;

  /** Number of attributes in the training data */
  private int m_numAttribs;

  /** Number of instances in the training data */
  private int m_numInstances;

  /** Treat missing values as separate values */
  private boolean m_missingSeperate;

  /** Include locally predictive attributes */
  private boolean m_locallyPredictive;

  /** Holds the matrix of attribute correlations */
  //  private Matrix m_corr_matrix;
  private float[][] m_corr_matrix;

  /** Standard deviations of attributes (when using Pearson's correlation) */
  private double[] m_std_devs;

  /** Threshold for admitting locally predictive features */
  private double m_c_Threshold;

  /**
   * Returns a string describing this attribute evaluator
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes "
      + "by considering the individual predictive ability of each feature "
      + "along with the degree of redundancy between them.\n\n"
      + "Subsets of features that are highly correlated with the class "
      + "while having low intercorrelation are preferred.\n\n"
      + "For more information see:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.PHDTHESIS);
    result.setValue(Field.AUTHOR, "M. A. Hall");
    result.setValue(Field.YEAR, "1998");
    result.setValue(Field.TITLE, "Correlation-based Feature Subset Selection for Machine Learning");
    result.setValue(Field.SCHOOL, "University of Waikato");
    result.setValue(Field.ADDRESS, "Hamilton, New Zealand");

    return result;
  }

  /**
   * Constructor
   */
  public CfsSubsetEval() {
    resetOptions();
  }
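  // Usage sketch (assumed, not part of the original source): the evaluator is
  // normally driven by a search class from weka.attributeSelection, but can
  // also be called directly, e.g.
  //
  //   CfsSubsetEval eval = new CfsSubsetEval();
  //   eval.setMissingSeperate(false);    // distribute missing-value counts
  //   eval.setLocallyPredictive(true);   // keep locally predictive attributes
  //   eval.buildEvaluator(train);        // build correlation structures
  //   double merit = eval.evaluateSubset(subsetBits);
  //
  // where "train" (a class-labelled Instances) and "subsetBits" (a BitSet of
  // attribute indices) are hypothetical variables supplied by the caller.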
  /**
   * Returns an enumeration describing the available options.
   * @return an enumeration of all the available options.
   **/
  public Enumeration listOptions() {
    Vector newVector = new Vector(3);
    newVector.addElement(new Option("\tTreat missing values as a separate "
                                    + "value.", "M", 0, "-M"));
    newVector.addElement(new Option("\tDon't include locally predictive attributes"
                                    + ".", "L", 0, "-L"));
    return newVector.elements();
  }

  /**
   * Parses and sets a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -M
   *  Treat missing values as a separate value.</pre>
   *
   * <pre> -L
   *  Don't include locally predictive attributes.</pre>
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   **/
  public void setOptions(String[] options) throws Exception {
    resetOptions();
    setMissingSeperate(Utils.getFlag('M', options));
    setLocallyPredictive(!Utils.getFlag('L', options));
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String locallyPredictiveTipText() {
    return "Identify locally predictive attributes. Iteratively adds "
      + "attributes with the highest correlation with the class as long "
      + "as there is not already an attribute in the subset that has a "
      + "higher correlation with the attribute in question.";
  }

  /**
   * Include locally predictive attributes
   *
   * @param b true or false
   */
  public void setLocallyPredictive(boolean b) {
    m_locallyPredictive = b;
  }

  /**
   * Return true if including locally predictive attributes
   *
   * @return true if locally predictive attributes are to be used
   */
  public boolean getLocallyPredictive() {
    return m_locallyPredictive;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String missingSeperateTipText() {
    return "Treat missing as a separate value. Otherwise, counts for missing "
      + "values are distributed across other values in proportion to their "
      + "frequency.";
  }

  /**
   * Treat missing as a separate value
   *
   * @param b true or false
   */
  public void setMissingSeperate(boolean b) {
    m_missingSeperate = b;
  }

  /**
   * Return true if missing is treated as a separate value
   *
   * @return true if missing is to be treated as a separate value
   */
  public boolean getMissingSeperate() {
    return m_missingSeperate;
  }

  /**
   * Gets the current settings of CfsSubsetEval
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] options = new String[2];
    int current = 0;

    if (getMissingSeperate()) {
      options[current++] = "-M";
    }

    if (!getLocallyPredictive()) {
      options[current++] = "-L";
    }

    while (current < options.length) {
      options[current++] = "";
    }

    return options;
  }

  /**
   * Returns the capabilities of this evaluator.
   *
   * @return the capabilities of this evaluator
   * @see Capabilities
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.NUMERIC_CLASS);
    result.enable(Capability.DATE_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);

    return result;
  }
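  // Implementation note on the structures set up in buildEvaluator() below:
  // m_corr_matrix is stored as a lower-triangular matrix (row i holds i + 1
  // floats), initialised to 1.0f on the diagonal and to the sentinel -999
  // elsewhere. Correlations are computed lazily: evaluateSubset() only calls
  // correlate() the first time a pair appears in a candidate subset. For
  // example, with three attributes the matrix starts as
  //
  //   row 0: [ 1.0 ]
  //   row 1: [ -999, 1.0 ]
  //   row 2: [ -999, -999, 1.0 ]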
  /**
   * Generates an attribute evaluator. Has to initialize all fields of the
   * evaluator that are not being set via options.
   *
   * CFS also discretises attributes (if necessary) and initializes
   * the correlation matrix.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the evaluator has not been
   * generated successfully
   */
  public void buildEvaluator(Instances data) throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    m_trainInstances = new Instances(data);
    m_trainInstances.deleteWithMissingClass();
    m_classIndex = m_trainInstances.classIndex();
    m_numAttribs = m_trainInstances.numAttributes();
    m_numInstances = m_trainInstances.numInstances();
    m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric();

    if (!m_isNumeric) {
      m_disTransform = new Discretize();
      m_disTransform.setUseBetterEncoding(true);
      m_disTransform.setInputFormat(m_trainInstances);
      m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
    }

    m_std_devs = new double[m_numAttribs];
    m_corr_matrix = new float[m_numAttribs][];
    for (int i = 0; i < m_numAttribs; i++) {
      m_corr_matrix[i] = new float[i + 1];
    }

    for (int i = 0; i < m_corr_matrix.length; i++) {
      m_corr_matrix[i][i] = 1.0f;
      m_std_devs[i] = 1.0;
    }

    for (int i = 0; i < m_numAttribs; i++) {
      for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
        m_corr_matrix[i][j] = -999;
      }
    }
  }

  /**
   * evaluates a subset of attributes
   *
   * @param subset a bitset representing the attribute subset to be
   * evaluated
   * @return the merit
   * @throws Exception if the subset could not be evaluated
   */
  public double evaluateSubset(BitSet subset) throws Exception {
    double num = 0.0;
    double denom = 0.0;
    float corr;
    int larger, smaller;

    // do numerator
    for (int i = 0; i < m_numAttribs; i++) {
      if (i != m_classIndex) {
        if (subset.get(i)) {
          // index into the lower triangle: the row is the larger index
          if (i > m_classIndex) {
            larger = i;
            smaller = m_classIndex;
          } else {
            smaller = i;
            larger = m_classIndex;
          }
          /* int larger = (i > m_classIndex ? i : m_classIndex);
             int smaller = (i > m_classIndex ? m_classIndex : i); */
          if (m_corr_matrix[larger][smaller] == -999) {
            corr = correlate(i, m_classIndex);
            m_corr_matrix[larger][smaller] = corr;
            num += (m_std_devs[i] * corr);
          } else {
            num += (m_std_devs[i] * m_corr_matrix[larger][smaller]);
          }
        }
      }
    }

    // do denominator
    for (int i = 0; i < m_numAttribs; i++) {
      if (i != m_classIndex) {
        if (subset.get(i)) {
          denom += (1.0 * m_std_devs[i] * m_std_devs[i]);

          for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
            if (subset.get(j)) {
              if (m_corr_matrix[i][j] == -999) {
                corr = correlate(i, j);
                m_corr_matrix[i][j] = corr;
                denom += (2.0 * m_std_devs[i] * m_std_devs[j] * corr);
              } else {
                denom += (2.0 * m_std_devs[i] * m_std_devs[j] * m_corr_matrix[i][j]);
              }
            }
          }
        }
      }
    }

    if (denom < 0.0) {
      denom *= -1.0;
    }

    if (denom == 0.0) {
      return (0.0);
    }

    double merit = (num / Math.sqrt(denom));

    if (merit < 0.0) {
      merit *= -1.0;
    }

    return merit;
  }

  /**
   * Computes the correlation between two attributes, dispatching on the
   * class type and the types of the two attributes.
   *
   * @param att1 index of the first attribute
   * @param att2 index of the second attribute
   * @return the correlation between the two attributes
   */
  private float correlate(int att1, int att2) {
    if (!m_isNumeric) {
      return (float) symmUncertCorr(att1, att2);
    }

    boolean att1_is_num = (m_trainInstances.attribute(att1).isNumeric());
    boolean att2_is_num = (m_trainInstances.attribute(att2).isNumeric());

    if (att1_is_num && att2_is_num) {
      return (float) num_num(att1, att2);
    } else if (att2_is_num) {
      return (float) num_nom2(att1, att2);
    } else if (att1_is_num) {
      return (float) num_nom2(att2, att1);
    }

    return (float) nom_nom(att1, att2);
  }
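  // Background note: when the class is nominal, all attributes have been
  // discretised by buildEvaluator(), and the correlation used is symmetric
  // uncertainty, a normalised mutual information computed from the
  // contingency table built below (the weka.core.ContingencyTables import
  // suggests the entropy terms come from that class):
  //
  //   SU(X, Y) = 2 * [H(X) + H(Y) - H(X, Y)] / [H(X) + H(Y)]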
  /**
   * Computes the symmetrical uncertainty between two nominal attributes,
   * either treating missing as a separate value or distributing its counts
   * across the other values.
   *
   * @param att1 index of the first attribute
   * @param att2 index of the second attribute
   * @return the symmetrical uncertainty between the two attributes
   */
  private double symmUncertCorr(int att1, int att2) {
    int i, j, k, ii, jj;
    int ni, nj;
    double sum = 0.0;
    double sumi[], sumj[];
    double counts[][];
    Instance inst;
    double corr_measure;
    boolean flag = false;
    double temp = 0.0;

    if (att1 == m_classIndex || att2 == m_classIndex) {
      flag = true;
    }

    // one extra row/column holds the counts for missing values
    ni = m_trainInstances.attribute(att1).numValues() + 1;
    nj = m_trainInstances.attribute(att2).numValues() + 1;
    counts = new double[ni][nj];
    sumi = new double[ni];
    sumj = new double[nj];

    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
        sumj[j] = 0.0;
        counts[i][j] = 0.0;
      }
    }

    // Fill the contingency table
    for (i = 0; i < m_numInstances; i++) {
      inst = m_trainInstances.instance(i);

      if (inst.isMissing(att1)) {
        ii = ni - 1;
      } else {
        ii = (int) inst.value(att1);
      }

      if (inst.isMissing(att2)) {
        jj = nj - 1;
      } else {
        jj = (int) inst.value(att2);
      }

      counts[ii][jj]++;
    }

    // get the row totals
    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
        sumi[i] += counts[i][j];
        sum += counts[i][j];
      }
    }

    // get the column totals
    for (j = 0; j < nj; j++) {
      sumj[j] = 0.0;

      for (i = 0; i < ni; i++) {
        sumj[j] += counts[i][j];
      }
    }

    // distribute missing counts
    if (!m_missingSeperate
        && (sumi[ni - 1] < m_numInstances)
        && (sumj[nj - 1] < m_numInstances)) {
      double[] i_copy = new double[sumi.length];
      double[] j_copy = new double[sumj.length];
      double[][] counts_copy = new double[sumi.length][sumj.length];

      for (i = 0; i < ni; i++) {
        System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
      }

      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
      double total_missing = (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);

      // do the missing i's
      if (sumi[ni - 1] > 0.0) {
        for (j = 0; j < nj - 1; j++) {
          if (counts[ni - 1][j] > 0.0) {
            for (i = 0; i < ni - 1; i++) {
              temp = ((i_copy[i] / (sum - i_copy[ni - 1])) * counts[ni - 1][j]);
              counts[i][j] += temp;
              sumi[i] += temp;
            }

            counts[ni - 1][j] = 0.0;
          }
        }
      }

      sumi[ni - 1] = 0.0;