// CfsSubsetEval.java (page-scrape banner removed)
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* CfsSubsetEval.java
* Copyright (C) 1999 Mark Hall
*
*/
package weka.attributeSelection;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;
import weka.core.ContingencyTables;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.UnsupportedAttributeTypeException;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;
/**
* CFS attribute subset evaluator.
* For more information see: <p>
*
* Hall, M. A. (1998). Correlation-based Feature Subset Selection for Machine
* Learning. Thesis submitted in partial fulfilment of the requirements of the
* degree of Doctor of Philosophy at the University of Waikato. <p>
*
* Valid options are:
*
* -M <br>
* Treat missing values as a seperate value. <p>
*
* -L <br>
* Include locally predictive attributes. <p>
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @version $Revision$
*/
public class CfsSubsetEval
extends SubsetEvaluator
implements OptionHandler
{
/** The training instances (replaced by a discretized copy when the class is nominal) */
private Instances m_trainInstances;
/** Discretisation filter applied in buildEvaluator() when the class is nominal */
private Discretize m_disTransform;
/** The class index */
private int m_classIndex;
/** True if the class attribute is numeric */
private boolean m_isNumeric;
/** Number of attributes in the training data */
private int m_numAttribs;
/** Number of instances in the training data */
private int m_numInstances;
/** Treat missing values as a separate value */
private boolean m_missingSeperate;
/** Include locally predictive attributes */
private boolean m_locallyPredictive;
/** Lower-triangular cache of attribute correlations; -999 marks "not yet computed" */
// private Matrix m_corr_matrix;
private float [][] m_corr_matrix;
/** Standard deviations of attributes (when using Pearson's correlation) */
private double[] m_std_devs;
/** Threshold for admitting locally predictive features */
private double m_c_Threshold;
/**
 * Returns a string describing this attribute evaluator.
 *
 * @return a description of the evaluator suitable for
 * displaying in the explorer/experimenter gui
 */
public String globalInfo() {
  StringBuffer description = new StringBuffer();
  description.append("CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes ");
  description.append("by considering the individual predictive ability of each feature ");
  description.append("along with the degree of redundancy between them.\n\n");
  description.append("Subsets of features that are highly correlated with the class ");
  description.append("while having low intercorrelation are preferred.\n");
  return description.toString();
}
/**
 * Constructor. Establishes the default option settings via
 * resetOptions().
 */
public CfsSubsetEval () {
resetOptions();
}
/**
 * Returns an enumeration describing the available options.
 *
 * @return an enumeration of all the available options.
 */
public Enumeration listOptions () {
  // Only two options are exposed (-M and -L); the original sized the
  // vector at 3 for no reason.
  Vector newVector = new Vector(2);
  newVector.addElement(new Option("\tTreat missing values as a seperate"
                                  + "\n\tvalue.", "M", 0, "-M"));
  newVector.addElement(new Option("\tInclude locally predictive attributes"
                                  + ".", "L", 0, "-L"));
  return newVector.elements();
}
/**
 * Parses and sets a given list of options. <p>
 *
 * Valid options are:
 *
 * -M <br>
 * Treat missing values as a seperate value. <p>
 *
 * -L <br>
 * Include locally predictive attributes. <p>
 *
 * @param options the list of options as an array of strings
 * @exception Exception if an option is not supported
 */
public void setOptions (String[] options)
  throws Exception {
  // Restore defaults first so any flag absent from the array reverts.
  // (The original declared an unused local "String optionString" here.)
  resetOptions();
  setMissingSeperate(Utils.getFlag('M', options));
  setLocallyPredictive(Utils.getFlag('L', options));
}
/**
 * Returns the tip text for this property.
 *
 * @return tip text for this property suitable for
 * displaying in the explorer/experimenter gui
 */
public String locallyPredictiveTipText() {
  StringBuffer tip = new StringBuffer();
  tip.append("Identify locally predictive attributes. Iteratively adds ");
  tip.append("attributes with the highest correlation with the class as long ");
  tip.append("as there is not already an attribute in the subset that has a ");
  tip.append("higher correlation with the attribute in question");
  return tip.toString();
}
/**
 * Include locally predictive attributes.
 *
 * @param b true to include locally predictive attributes
 */
public void setLocallyPredictive (boolean b) {
m_locallyPredictive = b;
}
/**
 * Return true if including locally predictive attributes.
 *
 * @return true if locally predictive attributes are to be used
 */
public boolean getLocallyPredictive () {
return m_locallyPredictive;
}
/**
 * Returns the tip text for this property.
 *
 * @return tip text for this property suitable for
 * displaying in the explorer/experimenter gui
 */
public String missingSeperateTipText() {
  StringBuffer tip = new StringBuffer();
  tip.append("Treat missing as a separate value. Otherwise, counts for missing ");
  tip.append("values are distributed across other values in proportion to their ");
  tip.append("frequency.");
  return tip.toString();
}
/**
 * Treat missing as a seperate value.
 *
 * @param b true to treat missing values as a separate value
 */
public void setMissingSeperate (boolean b) {
m_missingSeperate = b;
}
/**
 * Return true if missing is treated as a seperate value.
 *
 * @return true if missing is to be treated as a seperate value
 */
public boolean getMissingSeperate () {
return m_missingSeperate;
}
/**
 * Gets the current settings of CfsSubsetEval.
 *
 * @return an array of strings suitable for passing to setOptions()
 */
public String[] getOptions () {
  String[] options = new String[2];
  int current = 0;
  if (getMissingSeperate()) {
    options[current++] = "-M";
  }
  if (getLocallyPredictive()) {
    options[current++] = "-L";
  }
  // Pad unused slots with empty strings so the array length is constant.
  for (int i = current; i < options.length; i++) {
    options[i] = "";
  }
  return options;
}
/**
 * Generates an attribute evaluator. Has to initialize all fields of the
 * evaluator that are not being set via options.
 *
 * CFS also discretises attributes (if necessary) and initializes
 * the correlation matrix.
 *
 * @param data set of instances serving as training data
 * @exception Exception if the evaluator has not been
 * generated successfully
 */
public void buildEvaluator (Instances data)
  throws Exception {
  if (data.checkForStringAttributes()) {
    throw new UnsupportedAttributeTypeException("Can't handle string attributes!");
  }
  m_trainInstances = data;
  m_trainInstances.deleteWithMissingClass();
  m_classIndex = m_trainInstances.classIndex();
  m_numAttribs = m_trainInstances.numAttributes();
  m_numInstances = m_trainInstances.numInstances();
  m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric();
  // When the class is nominal, numeric attributes are discretized so a
  // contingency-table based correlation measure can be used.
  if (!m_isNumeric) {
    m_disTransform = new Discretize();
    m_disTransform.setUseBetterEncoding(true);
    m_disTransform.setInputFormat(m_trainInstances);
    m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
  }
  // Allocate the lower-triangular correlation cache (row i holds columns
  // 0..i). Off-diagonal entries start at the sentinel -999, meaning
  // "not yet computed"; diagonal entries are trivially 1.
  m_std_devs = new double[m_numAttribs];
  m_corr_matrix = new float[m_numAttribs][];
  for (int i = 0; i < m_numAttribs; i++) {
    m_corr_matrix[i] = new float[i + 1];
    for (int j = 0; j < i; j++) {
      m_corr_matrix[i][j] = -999;
    }
    m_corr_matrix[i][i] = 1.0f;
    m_std_devs[i] = 1.0;
  }
}
/**
 * evaluates a subset of attributes
 *
 * @param subset a bitset representing the attribute subset to be
 * evaluated
 * @return the (absolute) merit of the subset: the summed feature-class
 * correlations divided by the square root of the summed feature-feature
 * terms; 0 when the denominator is zero
 * @exception Exception if the subset could not be evaluated
 */
public double evaluateSubset (BitSet subset)
throws Exception {
double num = 0.0;
double denom = 0.0;
float corr;
int larger, smaller;
// do numerator: sum the correlation between each selected attribute and
// the class, weighted by the attribute's standard deviation. Correlations
// are cached in the lower-triangular m_corr_matrix; -999 is the
// "not yet computed" sentinel set in buildEvaluator.
for (int i = 0; i < m_numAttribs; i++) {
if (i != m_classIndex) {
if (subset.get(i)) {
// the cache is lower-triangular, so always index as [larger][smaller]
if (i > m_classIndex) {
larger = i; smaller = m_classIndex;
} else {
smaller = i; larger = m_classIndex;
}
/* int larger = (i > m_classIndex ? i : m_classIndex);
int smaller = (i > m_classIndex ? m_classIndex : i); */
if (m_corr_matrix[larger][smaller] == -999) {
// compute, cache, then use
corr = correlate(i, m_classIndex);
m_corr_matrix[larger][smaller] = corr;
num += (m_std_devs[i] * corr);
}
else {
num += (m_std_devs[i] * m_corr_matrix[larger][smaller]);
}
}
}
}
// do denominator: a variance term per selected attribute, plus each
// selected attribute pair's intercorrelation counted twice.
for (int i = 0; i < m_numAttribs; i++) {
if (i != m_classIndex) {
if (subset.get(i)) {
denom += (1.0 * m_std_devs[i] * m_std_devs[i]);
// row i of the triangular cache holds columns 0..i-1, so each
// unordered pair (i, j) is visited exactly once
for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
if (subset.get(j)) {
if (m_corr_matrix[i][j] == -999) {
corr = correlate(i, j);
m_corr_matrix[i][j] = corr;
denom += (2.0 * m_std_devs[i] * m_std_devs[j] * corr);
}
else {
denom += (2.0 * m_std_devs[i] * m_std_devs[j] * m_corr_matrix[i][j]);
}
}
}
}
}
}
// take the magnitude of the denominator before the square root
if (denom < 0.0) {
denom *= -1.0;
}
if (denom == 0.0) {
return (0.0);
}
double merit = (num/Math.sqrt(denom));
// merit is reported as a magnitude
if (merit < 0.0) {
merit *= -1.0;
}
return merit;
}
/**
 * Computes the correlation between two attributes, dispatching on the
 * class type and the types of the two attributes. When the class is
 * nominal, all attributes have been discretized in buildEvaluator and
 * symmetrical uncertainty is used.
 *
 * @param att1 index of the first attribute
 * @param att2 index of the second attribute
 * @return the correlation between the two attributes
 */
private float correlate (int att1, int att2) {
  if (!m_isNumeric) {
    return (float) symmUncertCorr(att1, att2);
  }
  boolean firstIsNumeric = m_trainInstances.attribute(att1).isNumeric();
  boolean secondIsNumeric = m_trainInstances.attribute(att2).isNumeric();
  if (firstIsNumeric && secondIsNumeric) {
    return (float) num_num(att1, att2);
  }
  if (secondIsNumeric) {
    // att1 nominal, att2 numeric: num_nom2 takes the nominal attribute first
    return (float) num_nom2(att1, att2);
  }
  if (firstIsNumeric) {
    // att1 numeric, att2 nominal: swap so the nominal attribute comes first
    return (float) num_nom2(att2, att1);
  }
  // both nominal
  return (float) nom_nom(att1, att2);
}
private double symmUncertCorr (int att1, int att2) {
int i, j, k, ii, jj;
int nnj, nni, ni, nj;
double sum = 0.0;
double sumi[], sumj[];
double counts[][];
Instance inst;
double corr_measure;
boolean flag = false;
double temp = 0.0;
if (att1 == m_classIndex || att2 == m_classIndex) {
flag = true;
}
ni = m_trainInstances.attribute(att1).numValues() + 1;
nj = m_trainInstances.attribute(att2).numValues() + 1;
counts = new double[ni][nj];
sumi = new double[ni];
sumj = new double[nj];
for (i = 0; i < ni; i++) {
sumi[i] = 0.0;
for (j = 0; j < nj; j++) {
sumj[j] = 0.0;
counts[i][j] = 0.0;
}
}
// Fill the contingency table
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
ii = ni - 1;
}
else {
ii = (int)inst.value(att1);
}
if (inst.isMissing(att2)) {
jj = nj - 1;
}
else {
jj = (int)inst.value(att2);
}
counts[ii][jj]++;
}
// get the row totals
for (i = 0; i < ni; i++) {
sumi[i] = 0.0;
for (j = 0; j < nj; j++) {
sumi[i] += counts[i][j];
sum += counts[i][j];
}
}
// get the column totals
for (j = 0; j < nj; j++) {
sumj[j] = 0.0;
for (i = 0; i < ni; i++) {
sumj[j] += counts[i][j];
}
}
// distribute missing counts
if (!m_missingSeperate &&
(sumi[ni-1] < m_numInstances) &&
(sumj[nj-1] < m_numInstances)) {
double[] i_copy = new double[sumi.length];
double[] j_copy = new double[sumj.length];
double[][] counts_copy = new double[sumi.length][sumj.length];
for (i = 0; i < ni; i++) {
System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
}
System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
double total_missing =
(sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);
// do the missing i's
if (sumi[ni - 1] > 0.0) {
for (j = 0; j < nj - 1; j++) {
if (counts[ni - 1][j] > 0.0) {
for (i = 0; i < ni - 1; i++) {
temp = ((i_copy[i]/(sum - i_copy[ni - 1]))*counts[ni - 1][j]);
counts[i][j] += temp;
sumi[i] += temp;
}
counts[ni - 1][j] = 0.0;
}
}
}
sumi[ni - 1] = 0.0;
// do the missing j's
if (sumj[nj - 1] > 0.0) {
for (i = 0; i < ni - 1; i++) {
if (counts[i][nj - 1] > 0.0) {
for (j = 0; j < nj - 1; j++) {
temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
counts[i][j] += temp;
sumj[j] += temp;
}
counts[i][nj - 1] = 0.0;
}
}
}
sumj[nj - 1] = 0.0;
// do the both missing
if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
for (i = 0; i < ni - 1; i++) {
for (j = 0; j < nj - 1; j++) {
temp = (counts_copy[i][j]/(sum - total_missing)) *
counts_copy[ni - 1][nj - 1];
counts[i][j] += temp;
sumi[i] += temp;
sumj[j] += temp;
}
}
counts[ni - 1][nj - 1] = 0.0;
}
}
corr_measure = ContingencyTables.symmetricalUncertainty(counts);
if (Utils.eq(corr_measure, 0.0)) {
if (flag == true) {
return (0.0);
}
else {
return (1.0);
}
// NOTE(review): file is truncated here — the remainder of symmUncertCorr()
// and the rest of the class are missing. (Non-code web-viewer UI text removed.)