📄 cfssubseteval.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * CfsSubsetEval.java * Copyright (C) 1999 Mark Hall * */package weka.attributeSelection;import java.io.*;import java.util.*;import weka.core.*;import weka.classifiers.*;import weka.filters.supervised.attribute.Discretize;import weka.filters.Filter;/** * CFS attribute subset evaluator. * For more information see: <p> * * Hall, M. A. (1998). Correlation-based Feature Subset Selection for Machine * Learning. Thesis submitted in partial fulfilment of the requirements of the * degree of Doctor of Philosophy at the University of Waikato. <p> * * Valid options are: * * -M <br> * Treat missing values as a seperate value. <p> * * -L <br> * Include locally predictive attributes. 
<p>
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 1.1.1.1 $
 */
public class CfsSubsetEval
  extends SubsetEvaluator
  implements OptionHandler {

  /** The training instances */
  private Instances m_trainInstances;

  /** Discretise attributes when class in nominal */
  private Discretize m_disTransform;

  /** The class index */
  private int m_classIndex;

  /** Is the class numeric */
  private boolean m_isNumeric;

  /** Number of attributes in the training data */
  private int m_numAttribs;

  /** Number of instances in the training data */
  private int m_numInstances;

  /** Treat missing values as separate values (the -M option) */
  private boolean m_missingSeperate;

  /** Include locally predictive attributes (the -L option) */
  private boolean m_locallyPredictive;

  /**
   * Holds the matrix of attribute correlations. Entries are computed
   * lazily: buildEvaluator() fills the off-diagonal cells with the
   * sentinel -999 ("not yet computed") and evaluateSubset() replaces
   * them with real correlations on first use.
   */
  private Matrix m_corr_matrix;

  /** Standard deviations of attributes (when using pearsons correlation) */
  private double[] m_std_devs;

  /** Threshold for admitting locally predictive features */
  private double m_c_Threshold;

  /**
   * Returns a string describing this attribute evaluator
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes "
      +"by considering the individual predictive ability of each feature "
      +"along with the degree of redundancy between them.\n\n"
      +"Subsets of features that are highly correlated with the class "
      +"while having low intercorrelation are preferred.\n";
  }

  /**
   * Constructor. Restores the default option settings.
   */
  public CfsSubsetEval () {
    // NOTE(review): resetOptions() is defined elsewhere in this class
    // (not visible in this fragment).
    resetOptions();
  }

  /**
   * Returns an enumeration describing the available options.
   * @return an enumeration of all the available options.
* **/ public Enumeration listOptions () { Vector newVector = new Vector(3); newVector.addElement(new Option("\tTreat missing values as a seperate" + "\n\tvalue.", "M", 0, "-M")); newVector.addElement(new Option("\tInclude locally predictive attributes" + ".", "L", 0, "-L")); return newVector.elements(); } /** * Parses and sets a given list of options. <p> * * Valid options are: * * -M <br> * Treat missing values as a seperate value. <p> * * -L <br> * Include locally predictive attributes. <p> * * @param options the list of options as an array of strings * @exception Exception if an option is not supported * **/ public void setOptions (String[] options) throws Exception { String optionString; resetOptions(); setMissingSeperate(Utils.getFlag('M', options)); setLocallyPredictive(Utils.getFlag('L', options)); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String locallyPredictiveTipText() { return "Identify locally predictive attributes. Iteratively adds " +"attributes with the highest correlation with the class as long " +"as there is not already an attribute in the subset that has a " +"higher correlation with the attribute in question"; } /** * Include locally predictive attributes * * @param b true or false */ public void setLocallyPredictive (boolean b) { m_locallyPredictive = b; } /** * Return true if including locally predictive attributes * * @return true if locally predictive attributes are to be used */ public boolean getLocallyPredictive () { return m_locallyPredictive; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String missingSeperateTipText() { return "Treat missing as a separate value. 
Otherwise, counts for missing " +"values are distributed across other values in proportion to their " +"frequency."; } /** * Treat missing as a seperate value * * @param b true or false */ public void setMissingSeperate (boolean b) { m_missingSeperate = b; } /** * Return true is missing is treated as a seperate value * * @return true if missing is to be treated as a seperate value */ public boolean getMissingSeperate () { return m_missingSeperate; } /** * Gets the current settings of CfsSubsetEval * * @return an array of strings suitable for passing to setOptions() */ public String[] getOptions () { String[] options = new String[2]; int current = 0; if (getMissingSeperate()) { options[current++] = "-M"; } if (getLocallyPredictive()) { options[current++] = "-L"; } while (current < options.length) { options[current++] = ""; } return options; } /** * Generates a attribute evaluator. Has to initialize all fields of the * evaluator that are not being set via options. * * CFS also discretises attributes (if necessary) and initializes * the correlation matrix. 
 *
   * @param data set of instances serving as training data
   * @exception Exception if the evaluator has not been
   * generated successfully
   */
  public void buildEvaluator (Instances data) throws Exception {
    if (data.checkForStringAttributes()) {
      throw new UnsupportedAttributeTypeException("Can't handle string attributes!");
    }

    m_trainInstances = data;
    m_trainInstances.deleteWithMissingClass();
    m_classIndex = m_trainInstances.classIndex();
    m_numAttribs = m_trainInstances.numAttributes();
    m_numInstances = m_trainInstances.numInstances();
    m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric();

    // For a nominal class all attributes are discretised so that
    // nominal correlation measures can be used throughout.
    if (!m_isNumeric) {
      m_disTransform = new Discretize();
      m_disTransform.setUseBetterEncoding(true);
      m_disTransform.setInputFormat(m_trainInstances);
      m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
    }

    m_std_devs = new double[m_numAttribs];
    m_corr_matrix = new Matrix(m_numAttribs, m_numAttribs);

    // Perfect self-correlation on the diagonal; unit std dev placeholders.
    for (int i = 0; i < m_corr_matrix.numRows(); i++) {
      m_corr_matrix.setElement(i, i, 1.0);
      m_std_devs[i] = 1.0;
    }

    // -999 is the "not yet computed" sentinel; evaluateSubset() fills
    // these cells lazily the first time a pair is needed.
    for (int i = 0; i < m_numAttribs; i++) {
      for (int j = i + 1; j < m_numAttribs; j++) {
        m_corr_matrix.setElement(i, j, -999);
        m_corr_matrix.setElement(j, i, -999);
      }
    }
  }

  /**
   * evaluates a subset of attributes
   *
   * @param subset a bitset representing the attribute subset to be
   * evaluated
   * @exception Exception if the subset could not be evaluated
   */
  public double evaluateSubset (BitSet subset) throws Exception {
    double num = 0.0;
    double denom = 0.0;
    double corr;

    // do numerator: sum of feature-class correlations over the subset,
    // computing (and caching) each correlation on first use.
    for (int i = 0; i < m_numAttribs; i++) {
      if (i != m_classIndex) {
        if (subset.get(i)) {
          if (m_corr_matrix.getElement(i, m_classIndex) == -999) {
            corr = correlate(i, m_classIndex);
            m_corr_matrix.setElement(i, m_classIndex, corr);
            m_corr_matrix.setElement(m_classIndex, i, corr);
            num += (m_std_devs[i] * corr);
          }
          else {
            num += (m_std_devs[i] * m_corr_matrix.getElement(i, m_classIndex));
          }
        }
      }
    }

    // do denominator: self terms plus twice the pairwise
    // feature-feature correlations (symmetric matrix).
    for (int i = 0; i < m_numAttribs; i++) {
      if (i != m_classIndex) {
        if (subset.get(i)) {
          denom += (1.0 * m_std_devs[i] * m_std_devs[i]);

          for (int j = i + 1; j < m_numAttribs; j++) {
            if (subset.get(j)) {
              if (m_corr_matrix.getElement(i, j) == -999) {
                corr = correlate(i, j);
                m_corr_matrix.setElement(i, j, corr);
                m_corr_matrix.setElement(j, i, corr);
                denom += (2.0 * m_std_devs[i] * m_std_devs[j] * corr);
              }
              else {
                denom += (2.0 * m_std_devs[i] * m_std_devs[j] * m_corr_matrix.getElement(i, j));
              }
            }
          }
        }
      }
    }

    // CFS merit = num / sqrt(denom); sign is discarded in both parts.
    if (denom < 0.0) {
      denom *= -1.0;
    }

    if (denom == 0.0) {
      return (0.0);
    }

    double merit = (num/Math.sqrt(denom));

    if (merit < 0.0) {
      merit *= -1.0;
    }

    return merit;
  }

  /**
   * Computes the correlation between two attributes, dispatching on
   * their types. With a nominal class everything has been discretised
   * (see buildEvaluator), so symmetrical uncertainty is always used;
   * otherwise the numeric/nominal helpers are chosen per pair.
   *
   * NOTE(review): num_num, num_nom2 and nom_nom are defined elsewhere
   * in this class (not visible in this fragment).
   *
   * @param att1 index of the first attribute
   * @param att2 index of the second attribute
   * @return the correlation between the two attributes
   */
  private double correlate (int att1, int att2) {
    if (!m_isNumeric) {
      return symmUncertCorr(att1, att2);
    }

    boolean att1_is_num = (m_trainInstances.attribute(att1).isNumeric());
    boolean att2_is_num = (m_trainInstances.attribute(att2).isNumeric());

    if (att1_is_num && att2_is_num) {
      return num_num(att1, att2);
    }
    else {
      if (att2_is_num) {
        return num_nom2(att1, att2);
      }
      else {
        if (att1_is_num) {
          return num_nom2(att2, att1);
        }
      }
    }

    return nom_nom(att1, att2);
  }

  /**
   * Symmetrical-uncertainty correlation between two nominal attributes.
   * Builds a contingency table with one extra row/column for missing
   * values; unless missing is treated as a separate value, missing
   * counts are redistributed over the known values in proportion to
   * their observed frequencies.
   *
   * @param att1 index of the first attribute
   * @param att2 index of the second attribute
   */
  private double symmUncertCorr (int att1, int att2) {
    int i, j, k, ii, jj;      // k appears unused in this fragment
    int nnj, nni, ni, nj;     // nnj/nni appear unused in this fragment
    double sum = 0.0;
    double sumi[], sumj[];
    double counts[][];
    Instance inst;
    double corr_measure;
    boolean flag = false;     // true when one attribute is the class
    double temp = 0.0;

    if (att1 == m_classIndex || att2 == m_classIndex) {
      flag = true;
    }

    // One extra slot (last index) per dimension holds missing values.
    ni = m_trainInstances.attribute(att1).numValues() + 1;
    nj = m_trainInstances.attribute(att2).numValues() + 1;
    counts = new double[ni][nj];
    sumi = new double[ni];
    sumj = new double[nj];

    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
        sumj[j] = 0.0;
        counts[i][j] = 0.0;
      }
    }

    // Fill the contingency table
    for (i = 0; i < m_numInstances; i++) {
      inst = m_trainInstances.instance(i);

      if (inst.isMissing(att1)) {
        ii = ni - 1;
      }
      else {
        ii = (int)inst.value(att1);
      }

      if (inst.isMissing(att2)) {
        jj = nj - 1;
      }
      else {
        jj = (int)inst.value(att2);
      }

      counts[ii][jj]++;
    }

    // get the row totals
    for (i = 0; i < ni; i++) {
      sumi[i] = 0.0;

      for (j = 0; j < nj; j++) {
        sumi[i] += counts[i][j];
        sum += counts[i][j];
      }
    }

    // get the column totals
    for (j = 0; j < nj; j++) {
      sumj[j] = 0.0;

      for (i = 0; i < ni; i++) {
        sumj[j] += counts[i][j];
      }
    }

    // distribute missing counts (skipped when -M is set, or when every
    // instance is missing on either attribute)
    if (!m_missingSeperate &&
        (sumi[ni-1] < m_numInstances) &&
        (sumj[nj-1] < m_numInstances)) {
      // Snapshot the original totals/table: the redistribution below
      // must be based on pre-redistribution frequencies.
      double[] i_copy = new double[sumi.length];
      double[] j_copy = new double[sumj.length];
      double[][] counts_copy = new double[sumi.length][sumj.length];

      for (i = 0; i < ni; i++) {
        System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
      }

      System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
      System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
      double total_missing = (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);

      // do the missing i's: spread the missing-att1 row over the known
      // rows in proportion to the rows' observed frequencies.
      if (sumi[ni - 1] > 0.0) {
        for (j = 0; j < nj - 1; j++) {
          if (counts[ni - 1][j] > 0.0) {
            for (i = 0; i < ni - 1; i++) {
              temp = ((i_copy[i]/(sum - i_copy[ni - 1]))*counts[ni - 1][j]);
              counts[i][j] += temp;
              sumi[i] += temp;
            }

            counts[ni - 1][j] = 0.0;
          }
        }
      }

      sumi[ni - 1] = 0.0;

      // do the missing j's: same redistribution for the missing-att2
      // column.
      if (sumj[nj - 1] > 0.0) {
        for (i = 0; i < ni - 1; i++) {
          if (counts[i][nj - 1] > 0.0) {
            for (j = 0; j < nj - 1; j++) {
              temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
              counts[i][j] += temp;
              sumj[j] += temp;
            }

            counts[i][nj - 1] = 0.0;
          }
        }
      }

      sumj[nj - 1] = 0.0;

      // do the both missing: instances missing on both attributes are
      // spread over all known cells in proportion to the original
      // (snapshotted) joint counts.
      if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
        for (i = 0; i < ni - 1; i++) {
          for (j = 0; j < nj - 1; j++) {
            temp = (counts_copy[i][j]/(sum - total_missing)) * counts_copy[ni - 1][nj - 1];
            counts[i][j] += temp;
            sumi[i] += temp;
            sumj[j] += temp;
          }
        }

        counts[ni - 1][nj - 1] = 0.0;
      }
    }

    // corr_measure = Correlate.symm_uncert(counts,sumi,sumj,sum,ni,nj,flag);
    corr_measure = ContingencyTables.symmetricalUncertainty(counts);
    // corr_measure = ContingencyTables.gainRatio(counts);

    // Zero uncertainty: worthless (0) against the class, but treated as
    // perfectly redundant (1) between two ordinary attributes.
    if (Utils.eq(corr_measure, 0.0)) {
      if (flag == true) {
        return (0.0);
      }
      else {
        return (1.0);
      }
    }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -