📄 interquartilerange.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * InterquartileRange.java * Copyright (C) 2006 University of Waikato, Hamilton, New Zealand */package weka.filters.unsupervised.attribute;import weka.core.Attribute;import weka.core.Capabilities;import weka.core.FastVector;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.Range;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.filters.SimpleBatchFilter;import java.util.Enumeration;import java.util.Vector;/** <!-- globalinfo-start --> * A filter for detecting outliers and extreme values based on interquartile ranges. The filter skips the class attribute.<br/> * <br/> * Outliers:<br/> * Q3 + OF*IQR < x <= Q3 + EVF*IQR<br/> * or<br/> * Q1 - EVF*IQR <= x < Q1 - OF*IQR<br/> * <br/> * Extreme values:<br/> * x > Q3 + EVF*IQR<br/> * or<br/> * x < Q1 - EVF*IQR<br/> * <br/> * Key:<br/> * Q1 = 25% quartile<br/> * Q3 = 75% quartile<br/> * IQR = Interquartile Range, difference between Q1 and Q3<br/> * OF = Outlier Factor<br/> * EVF = Extreme Value Factor * <p/> <!-- globalinfo-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -D * Turns on output of debugging information.</pre> * * <pre> -R <col1,col2-col4,...> * Specifies list of columns to base outlier/extreme value detection * on. 
If an instance is considered in at least one of those
 *  attributes an outlier/extreme value, it is tagged accordingly.
 *  'first' and 'last' are valid indexes.
 *  (default: first-last)</pre>
 *
 * <pre> -O <num>
 *  The factor for outlier detection.
 *  (default: 3)</pre>
 *
 * <pre> -E <num>
 *  The factor for extreme values detection.
 *  (default: 2*Outlier Factor)</pre>
 *
 * <pre> -E-as-O
 *  Tags extreme values also as outliers.
 *  (default: off)</pre>
 *
 * <pre> -P
 *  Generates Outlier/ExtremeValue pair for each numeric attribute in
 *  the range, not just a single indicator pair for all the attributes.
 *  (default: off)</pre>
 *
 * <pre> -M
 *  Generates an additional attribute 'Offset' per Outlier/ExtremeValue
 *  pair that contains the multiplier that the value is off the median.
 *     value = median + 'multiplier' * IQR
 *  Note: implicitly sets '-P'. (default: off)</pre>
 *
 <!-- options-end -->
 *
 * Thanks to Dale for a few brainstorming sessions.
 *
 * @author Dale Fletcher (dale at cs dot waikato dot ac dot nz)
 * @author fracpete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 1.2 $
 */
public class InterquartileRange extends SimpleBatchFilter {

  /** for serialization */
  private static final long serialVersionUID = -227879653639723030L;

  /** indicator for non-numeric attributes */
  public final static int NON_NUMERIC = -1;

  /** the attribute range to work on; initially covers all attributes ("first-last") */
  protected Range m_Attributes = new Range("first-last");

  /** the generated indices (only for performance reasons) */
  protected int[] m_AttributeIndices = null;

  /** the factor for detecting outliers (OF), initially 3 */
  protected double m_OutlierFactor = 3;

  /**
   * the factor for detecting extreme values (EVF). The initializer is
   * evaluated once when the object is created, so the initial value is
   * 2*3 = 6; it does not follow later changes of m_OutlierFactor.
   */
  protected double m_ExtremeValuesFactor = 2*m_OutlierFactor;

  /** whether extreme values are also tagged as outliers */
  protected boolean m_ExtremeValuesAsOutliers = false;

  /** the upper extreme value threshold (= Q3 + EVF*IQR) */
  protected double[] m_UpperExtremeValue = null;

  /** the upper outlier threshold (= Q3 + OF*IQR) */
protected double[] m_UpperOutlier = null;

  /** lower threshold for outliers (= Q1 - OF*IQR) */
  protected double[] m_LowerOutlier = null;

  /** interquartile range values */
  protected double[] m_IQR = null;

  /** median values */
  protected double[] m_Median = null;

  /** lower threshold for extreme values (= Q1 - EVF*IQR) */
  protected double[] m_LowerExtremeValue = null;

  /**
   * if set, an Outlier/ExtremeValue attribute pair is generated for each
   * attribute rather than one general pair
   */
  protected boolean m_DetectionPerAttribute = false;

  /** position of the outlier attribute */
  protected int[] m_OutlierAttributePosition = null;

  /**
   * if set, an additional attribute called "Offset" is added that holds the
   * 'multiplier' by which the outlier/extreme value is away from the median,
   * i.e., value = median + 'multiplier' * IQR <br/>
   * automatically enables m_DetectionPerAttribute!
   */
  protected boolean m_OutputOffsetMultiplier = false;

  /**
   * Provides the general description of this filter.
   *
   * @return the description text, suitable for display in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    // The sections are constant variables, so the concatenated result is
    // still a compile-time constant, exactly like a single long expression.
    final String summary =
        "A filter for detecting outliers and extreme values based on "
      + "interquartile ranges. The filter skips the class attribute.\n\n";

    final String outliers =
        "Outliers:\n"
      + " Q3 + OF*IQR < x <= Q3 + EVF*IQR\n"
      + " or\n"
      + " Q1 - EVF*IQR <= x < Q1 - OF*IQR\n"
      + "\n";

    final String extremeValues =
        "Extreme values:\n"
      + " x > Q3 + EVF*IQR\n"
      + " or\n"
      + " x < Q1 - EVF*IQR\n"
      + "\n";

    final String key =
        "Key:\n"
      + " Q1 = 25% quartile\n"
      + " Q3 = 75% quartile\n"
      + " IQR = Interquartile Range, difference between Q1 and Q3\n"
      + " OF = Outlier Factor\n"
      + " EVF = Extreme Value Factor";

    return summary + outliers + extremeValues + key;
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
*/
  public Enumeration listOptions() {
    Vector result = new Vector();

    // start with the options inherited from the superclass
    Enumeration enm = super.listOptions();
    while (enm.hasMoreElements()) {
      result.add(enm.nextElement());
    }

    // Every description line starts with a tab so that the help output lines
    // up. The stated default is "first-last", which is what setOptions()
    // applies when -R is absent.
    result.addElement(new Option(
        "\tSpecifies list of columns to base outlier/extreme value detection\n"
        + "\ton. If an instance is considered in at least one of those\n"
        + "\tattributes an outlier/extreme value, it is tagged accordingly.\n"
        + "\t'first' and 'last' are valid indexes.\n"
        + "\t(default: first-last)",
        "R", 1, "-R <col1,col2-col4,...>"));

    result.addElement(new Option(
        "\tThe factor for outlier detection.\n"
        + "\t(default: 3)",
        "O", 1, "-O <num>"));

    result.addElement(new Option(
        "\tThe factor for extreme values detection.\n"
        + "\t(default: 2*Outlier Factor)",
        "E", 1, "-E <num>"));

    result.addElement(new Option(
        "\tTags extreme values also as outliers.\n"
        + "\t(default: off)",
        "E-as-O", 0, "-E-as-O"));

    result.addElement(new Option(
        "\tGenerates Outlier/ExtremeValue pair for each numeric attribute in\n"
        + "\tthe range, not just a single indicator pair for all the attributes.\n"
        + "\t(default: off)",
        "P", 0, "-P"));

    // The note and the default each get their own tab-indented line; before,
    // they were glued together as "...'-P'.\t(default: off)".
    result.addElement(new Option(
        "\tGenerates an additional attribute 'Offset' per Outlier/ExtremeValue\n"
        + "\tpair that contains the multiplier that the value is off the median.\n"
        + "\t value = median + 'multiplier' * IQR\n"
        + "\tNote: implicitly sets '-P'.\n"
        + "\t(default: off)",
        "M", 0, "-M"));

    return result.elements();
  }

  /**
   * Parses a list of options for this object. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -D
   *  Turns on output of debugging information.</pre>
   *
   * <pre> -R <col1,col2-col4,...>
   *  Specifies list of columns to base outlier/extreme value detection
   *  on. If an instance is considered in at least one of those
   *  attributes an outlier/extreme value, it is tagged accordingly.
   *  'first' and 'last' are valid indexes.
   *  (default: first-last)</pre>
   *
   * <pre> -O <num>
   *  The factor for outlier detection.
* (default: 3)</pre> * * <pre> -E <num> * The factor for extreme values detection. * (default: 2*Outlier Factor)</pre> * * <pre> -E-as-O * Tags extreme values also as outliers. * (default: off)</pre> * * <pre> -P * Generates Outlier/ExtremeValue pair for each numeric attribute in * the range, not just a single indicator pair for all the attributes. * (default: off)</pre> * * <pre> -M * Generates an additional attribute 'Offset' per Outlier/ExtremeValue * pair that contains the multiplier that the value is off the median. * value = median + 'multiplier' * IQR * Note: implicitely sets '-P'. (default: off)</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String tmpStr; super.setOptions(options); tmpStr = Utils.getOption("R", options); if (tmpStr.length() != 0) setAttributeIndices(tmpStr); else setAttributeIndices("first-last"); tmpStr = Utils.getOption("O", options); if (tmpStr.length() != 0) setOutlierFactor(Double.parseDouble(tmpStr)); else setOutlierFactor(3); tmpStr = Utils.getOption("E", options); if (tmpStr.length() != 0) setExtremeValuesFactor(Double.parseDouble(tmpStr)); else setExtremeValuesFactor(2*getOutlierFactor()); setExtremeValuesAsOutliers(Utils.getFlag("E-as-O", options)); setDetectionPerAttribute(Utils.getFlag("P", options)); setOutputOffsetMultiplier(Utils.getFlag("M", options)); } /** * Gets the current settings of the filter. 
* * @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { Vector result; String[] options; int i; result = new Vector(); options = super.getOptions(); for (i = 0; i < options.length; i++) result.add(options[i]); result.add("-R"); if (!getAttributeIndices().equals("")) result.add(getAttributeIndices()); else result.add("first-last"); result.add("-O"); result.add("" + getOutlierFactor()); result.add("-E"); result.add("" + getExtremeValuesFactor()); if (getExtremeValuesAsOutliers()) result.add("-E-as-O"); if (getDetectionPerAttribute()) result.add("-P"); if (getOutputOffsetMultiplier()) result.add("-M"); return (String[]) result.toArray(new String[result.size()]); } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String attributeIndicesTipText() { return "Specify range of attributes to act on; " + " this is a comma separated list of attribute indices, with" + " \"first\" and \"last\" valid values; specify an inclusive" + " range with \"-\", eg: \"first-3,5,6-10,last\"."; } /** * Gets the current range selection * * @return a string containing a comma separated list of ranges */ public String getAttributeIndices() { return m_Attributes.getRanges(); } /** * Sets which attributes are to be used for interquartile calculations and * outlier/extreme value detection (only numeric attributes among the * selection will be used). * * @param value a string representing the list of attributes. Since * the string will typically come from a user, attributes * are indexed from 1. <br> eg: first-3,5,6-last * @throws IllegalArgumentException if an invalid range list is supplied */ public void setAttributeIndices(String value) { m_Attributes.setRanges(value); } /** * Sets which attributes are to be used for interquartile calculations and * outlier/extreme value detection (only numeric attributes among the * selection will be used). 
* * @param value an array containing indexes of attributes to work on. * Since the array will typically come from a program, * attributes are indexed from 0. * @throws IllegalArgumentException if an invalid set of ranges is supplied */ public void setAttributeIndicesArray(int[] value) { setAttributeIndices(Range.indicesToRangeList(value)); } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String outlierFactorTipText() { return "The factor for determining the thresholds for outliers."; } /** * Sets the factor for determining the thresholds for outliers. * * @param value the factor. */ public void setOutlierFactor(double value) { if (value >= getExtremeValuesFactor()) System.err.println("OutlierFactor must be smaller than ExtremeValueFactor"); else m_OutlierFactor = value; } /** * Gets the factor for determining the thresholds for outliers. * * @return the factor. */ public double getOutlierFactor() { return m_OutlierFactor; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String extremeValuesFactorTipText() { return "The factor for determining the thresholds for extreme values."; } /** * Sets the factor for determining the thresholds for extreme values. * * @param value the factor. */ public void setExtremeValuesFactor(double value) { if (value <= getOutlierFactor()) System.err.println("ExtremeValuesFactor must be greater than OutlierFactor!"); else m_ExtremeValuesFactor = value; } /** * Gets the factor for determining the thresholds for extreme values. * * @return the factor. 
*/
  public double getExtremeValuesFactor() {
    final double factor = m_ExtremeValuesFactor;
    return factor;
  }

  /**
   * Provides the tip text for the extremeValuesAsOutliers property.
   *
   * @return the tip text, suitable for display in the
   *         explorer/experimenter gui
   */
  public String extremeValuesAsOutliersTipText() {
    final String tip = "Whether to tag extreme values also as outliers.";
    return tip;
  }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -