📄 paceregression.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * PaceRegression.java * Copyright (C) 2002 Yong Wang */package weka.classifiers.functions.pace;import weka.classifiers.Classifier;import weka.classifiers.Evaluation;import java.io.*;import java.util.*;import weka.core.*;/** * Class for building pace regression linear models and using them for * prediction. <p> * * Under regularity conditions, pace regression is provably optimal when * the number of coefficients tends to infinity. It consists of a group of * estimators that are either overall optimal or optimal under certain * conditions. <p> * * The current work of the pace regression theory, and therefore also this * implementation, do not handle: <p> * * - missing values <br> * - non-binary nominal attributes <br> * - the case that n - k is small where n is number of instances and k is * number of coefficients (the threshold used in this implementation is 20) * <p> * * Valid options are:<p> * * -D <br> * Produce debugging output. 
<p> * -E estimator <br> * The estimator can be one of the following: <br> * <ul> * <li>eb -- Empirical Bayes estimator for normal mixture (default) <br> * <li>nested -- Optimal nested model selector for normal mixture <br> * <li>subset -- Optimal subset selector for normal mixture <br> * <li>pace2 -- PACE2 for Chi-square mixture <br> * <li>pace4 -- PACE4 for Chi-square mixture<br> * <li>pace6 -- PACE6 for Chi-square mixture <br> * <li>ols -- Ordinary least squares estimator <br> * <li>aic -- AIC estimator <br> * <li>bic -- BIC estimator <br> * <li>ric -- RIC estimator <br> * <li>olsc -- Ordinary least squares subset selector with a threshold <br> * </ul> * -S &lt;threshold value&gt; <br> * Threshold for the olsc estimator<p> * * <p> * REFERENCES <p> * * Wang, Y. (2000). "A new approach to fitting linear models in high * dimensional spaces." PhD Thesis. Department of Computer Science, * University of Waikato, New Zealand. <p> * * Wang, Y. and Witten, I. H. (2002). "Modeling for optimal probability * prediction." Proceedings of ICML'2002. Sydney. <p> * * @author Yong Wang (yongwang@cs.waikato.ac.nz) * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ */public class PaceRegression extends Classifier implements OptionHandler, WeightedInstancesHandler { /** The model used */ Instances m_Model = null; /** Array for storing coefficients of linear regression. 
*/ private double[] m_Coefficients; /** The index of the class attribute */ private int m_ClassIndex; /** True if debug output will be printed */ private boolean m_Debug; private static final int olsEstimator = 0; private static final int ebEstimator = 1; private static final int nestedEstimator = 2; private static final int subsetEstimator = 3; private static final int pace2Estimator = 4; private static final int pace4Estimator = 5; private static final int pace6Estimator = 6; private static final int olscEstimator = 7; private static final int aicEstimator = 8; private static final int bicEstimator = 9; private static final int ricEstimator = 10; public static final Tag [] TAGS_ESTIMATOR = { new Tag(olsEstimator, "Ordinary least squares"), new Tag(ebEstimator, "Empirical Bayes"), new Tag(nestedEstimator, "Nested model selector"), new Tag(subsetEstimator, "Subset selector"), new Tag(pace2Estimator, "PACE2"), new Tag(pace4Estimator, "PACE4"), new Tag(pace6Estimator, "PACE6"), new Tag(olscEstimator, "Ordinary least squares selection"), new Tag(aicEstimator, "AIC"), new Tag(bicEstimator, "BIC"), new Tag(ricEstimator, "RIC") }; private int paceEstimator = ebEstimator; private double olscThreshold = 2; // AIC /** * Builds a pace regression model for the given data. 
* * @param data the training data to be used for generating the * linear regression function * @exception Exception if the classifier could not be built successfully */ public void buildClassifier(Instances data) throws Exception { // Checks on data model and instances try { if (!data.classAttribute().isNumeric()) { throw new UnsupportedClassTypeException("Class attribute has to be numeric"+ " for pace regression!"); } } catch (UnassignedClassException e) { System.err.println(data); System.err.println(data.classIndex()); } if (data.numInstances() == 0) { throw new Exception("No instances in training file!"); } if (data.checkForStringAttributes()) { throw new UnsupportedAttributeTypeException("Can't handle string attributes!"); } if (checkForNonBinary(data)) { throw new UnsupportedAttributeTypeException("Can only deal with numeric and binary attributes!"); } // check for missing data and throw exception if some are found if (checkForMissing(data)) { throw new NoSupportForMissingValuesException("Can't handle missing values!"); } // n - k should be >= 20 if (data.numInstances() - data.numAttributes() < 20) { throw new IllegalArgumentException("Not enough instances. 
Ratio of number of instances (n) to number of " + "attributes (k) is too small (n - k < 20)."); } /* * initialize the following */ m_Model = new Instances(data, 0); m_ClassIndex = data.classIndex(); double[][] transformedDataMatrix = getTransformedDataMatrix(data, m_ClassIndex); double[] classValueVector = data.attributeToDoubleArray(m_ClassIndex); m_Coefficients = null; /* * Perform pace regression */ m_Coefficients = pace(transformedDataMatrix, classValueVector); } /** * pace regression * * @param matrix_X matrix with observations * @param vector_Y vektor with class values * @return vector with coefficients * @exception Exception if pace regression cannot be done successfully */ private double [] pace(double[][] matrix_X, double [] vector_Y) { PaceMatrix X = new PaceMatrix( matrix_X ); PaceMatrix Y = new PaceMatrix( vector_Y, vector_Y.length ); IntVector pvt = IntVector.seq(0, X.getColumnDimension()-1); int n = X.getRowDimension(); int kr = X.getColumnDimension(); X.lsqrSelection( Y, pvt, 1 ); X.positiveDiagonal( Y, pvt ); int k = pvt.size(); PaceMatrix sol = (PaceMatrix) Y.clone(); X.rsolve( sol, pvt, pvt.size() ); DoubleVector betaHat = sol.getColumn(0).unpivoting(pvt, kr); DoubleVector r = Y.getColumn( pvt.size(), n-1, 0); double sde = Math.sqrt(r.sum2() / r.size()); DoubleVector aHat = Y.getColumn( 0, pvt.size()-1, 0).times( 1./sde ); DoubleVector aTilde = null; switch( paceEstimator) { case ebEstimator: case nestedEstimator: case subsetEstimator: NormalMixture d = new NormalMixture(); d.fit( aHat, MixtureDistribution.NNMMethod ); if( paceEstimator == ebEstimator ) aTilde = d.empiricalBayesEstimate( aHat ); else if( paceEstimator == ebEstimator ) aTilde = d.subsetEstimate( aHat ); else aTilde = d.nestedEstimate( aHat ); break; case pace2Estimator: case pace4Estimator: case pace6Estimator: DoubleVector AHat = aHat.square(); ChisqMixture dc = new ChisqMixture(); dc.fit( AHat, MixtureDistribution.NNMMethod ); DoubleVector ATilde; if( paceEstimator == 
pace6Estimator ) ATilde = dc.pace6( AHat ); else if( paceEstimator == pace2Estimator ) ATilde = dc.pace2( AHat ); else ATilde = dc.pace4( AHat ); aTilde = ATilde.sqrt().times( aHat.sign() ); break; case olsEstimator: aTilde = aHat.copy(); break; case aicEstimator: case bicEstimator: case ricEstimator: case olscEstimator: if(paceEstimator == aicEstimator) olscThreshold = 2; else if(paceEstimator == bicEstimator) olscThreshold = Math.log( n ); else if(paceEstimator == ricEstimator) olscThreshold = 2*Math.log( kr ); aTilde = aHat.copy(); for( int i = 0; i < aTilde.size(); i++ ) if( Math.abs(aTilde.get(i)) < Math.sqrt(olscThreshold) ) aTilde.set(i, 0); } PaceMatrix YTilde = new PaceMatrix((new PaceMatrix(aTilde)).times( sde )); X.rsolve( YTilde, pvt, pvt.size() ); DoubleVector betaTilde = YTilde.getColumn(0).unpivoting( pvt, kr ); return betaTilde.getArrayCopy(); } /** * Checks if instances have a missing value. * @param data the data set * @return true if missing value is present in data set */ public boolean checkForMissing(Instances data) { for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); for (int j = 0; j < data.numAttributes(); j++) { if (inst.isMissing(j)) { return true; } } } return false; } /** * Checks if an instance has a missing value. * @param instance the instance * @return true if missing value is present */ public boolean checkForMissing(Instance instance, Instances model) { for (int j = 0; j < instance.numAttributes(); j++) { if (j != model.classIndex()) { if (instance.isMissing(j)) { return true; } } } return false; } /** * Checks if any of the nominal attributes is non-binary. * @param data the data set * @return true if non binary attribute is present */ public boolean checkForNonBinary(Instances data) { for (int i = 0; i < data.numAttributes(); i++) { if (data.attribute(i).isNominal()) { if (data.attribute(i).numValues() != 2) return true; } } return false; } /** * Transforms dataset into a two-dimensional array. 
* * @param data dataset * @param classIndex index of the class attribute */ private double [][] getTransformedDataMatrix(Instances data, int classIndex) { int numInstances = data.numInstances(); int numAttributes = data.numAttributes(); int middle = classIndex; if (middle < 0) { middle = numAttributes; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -