📄 interquartilerange.java
字号:
/** * Set whether extreme values are also tagged as outliers. * * @param value whether or not to tag extreme values also as outliers. */ public void setExtremeValuesAsOutliers(boolean value) { m_ExtremeValuesAsOutliers = value; } /** * Get whether extreme values are also tagged as outliers. * * @return true if extreme values are also tagged as outliers. */ public boolean getExtremeValuesAsOutliers() { return m_ExtremeValuesAsOutliers; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String detectionPerAttributeTipText() { return "Generates Outlier/ExtremeValue attribute pair for each numeric " + "attribute, not just a single pair for all numeric attributes together."; } /** * Set whether an Outlier/ExtremeValue attribute pair is generated for * each numeric attribute ("true") or just one pair for all numeric * attributes together ("false"). * * @param value whether or not to generate indicator attribute pairs * for each numeric attribute. */ public void setDetectionPerAttribute(boolean value) { m_DetectionPerAttribute = value; if (!m_DetectionPerAttribute) m_OutputOffsetMultiplier = false; } /** * Gets whether an Outlier/ExtremeValue attribute pair is generated for * each numeric attribute ("true") or just one pair for all numeric * attributes together ("false"). * * @return true if indicator attribute pairs are generated for * each numeric attribute. */ public boolean getDetectionPerAttribute() { return m_DetectionPerAttribute; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String outputOffsetMultiplierTipText() { return "Generates an additional attribute 'Offset' that contains the " + "multiplier the value is off the median: " + "value = median + 'multiplier' * IQR"; } /** * Set whether an additional attribute "Offset" is generated per * Outlier/ExtremeValue attribute pair that lists the multiplier the value * is off the median: value = median + 'multiplier' * IQR. * * @param value whether or not to generate the additional attribute. */ public void setOutputOffsetMultiplier(boolean value) { m_OutputOffsetMultiplier = value; if (m_OutputOffsetMultiplier) m_DetectionPerAttribute = true; } /** * Gets whether an additional attribute "Offset" is generated per * Outlier/ExtremeValue attribute pair that lists the multiplier the value * is off the median: value = median + 'multiplier' * IQR. * * @return true if the additional attribute is generated. */ public boolean getOutputOffsetMultiplier() { return m_OutputOffsetMultiplier; } /** * Returns the Capabilities of this filter. * * @return the capabilities of this object * @see Capabilities */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); // attributes result.enableAllAttributes(); result.enable(Capability.MISSING_VALUES); // class result.enableAllClasses(); result.enable(Capability.MISSING_CLASS_VALUES); result.enable(Capability.NO_CLASS); return result; } /** * Determines the output format based on the input format and returns * this. In case the output format cannot be returned immediately, i.e., * hasImmediateOutputFormat() returns false, then this method will called * from batchFinished() after the call of preprocess(Instances), in which, * e.g., statistics for the actual processing step can be gathered. * * @param inputFormat the input format to base the output format on * @return the output format * @throws Exception in case the determination goes wrong * @see #hasImmediateOutputFormat() * @see #batchFinished() */ protected Instances determineOutputFormat(Instances inputFormat) throws Exception { FastVector atts; FastVector values; Instances result; int i; // attributes must be numeric m_Attributes.setUpper(inputFormat.numAttributes() - 1); m_AttributeIndices = m_Attributes.getSelection(); for (i = 0; i < m_AttributeIndices.length; i++) { // ignore class if (m_AttributeIndices[i] == inputFormat.classIndex()) { m_AttributeIndices[i] = NON_NUMERIC; continue; } // not numeric -> ignore it if (!inputFormat.attribute(m_AttributeIndices[i]).isNumeric()) m_AttributeIndices[i] = NON_NUMERIC; } // get old attributes atts = new FastVector(); for (i = 0; i < inputFormat.numAttributes(); i++) atts.addElement(inputFormat.attribute(i)); if (!getDetectionPerAttribute()) { m_OutlierAttributePosition = new int[1]; m_OutlierAttributePosition[0] = atts.size(); // add 2 new attributes values = new FastVector(); values.addElement("no"); values.addElement("yes"); atts.addElement(new Attribute("Outlier", values)); values = new FastVector(); values.addElement("no"); values.addElement("yes"); atts.addElement(new Attribute("ExtremeValue", values)); } else { m_OutlierAttributePosition = new int[m_AttributeIndices.length]; for (i = 0; i < m_AttributeIndices.length; i++) { if (m_AttributeIndices[i] == NON_NUMERIC) continue; m_OutlierAttributePosition[i] = atts.size(); // add new attributes values = new FastVector(); values.addElement("no"); values.addElement("yes"); atts.addElement( new Attribute( inputFormat.attribute( m_AttributeIndices[i]).name() + "_Outlier", values)); values = new FastVector(); values.addElement("no"); values.addElement("yes"); atts.addElement( new Attribute( inputFormat.attribute( m_AttributeIndices[i]).name() + "_ExtremeValue", values)); if (getOutputOffsetMultiplier()) atts.addElement( new Attribute( inputFormat.attribute( m_AttributeIndices[i]).name() + "_Offset")); } } // generate header result = new Instances(inputFormat.relationName(), atts, 0); result.setClassIndex(inputFormat.classIndex()); return result; } /** * computes the thresholds for outliers and extreme values * * @param instances the data to work on */ protected void computeThresholds(Instances instances) { int i; double[] values; int[] sortedIndices; int half; int quarter; double q1; double q2; double q3; m_UpperExtremeValue = new double[m_AttributeIndices.length]; m_UpperOutlier = new double[m_AttributeIndices.length]; m_LowerOutlier = new double[m_AttributeIndices.length]; m_LowerExtremeValue = new double[m_AttributeIndices.length]; m_Median = new double[m_AttributeIndices.length]; m_IQR = new double[m_AttributeIndices.length]; for (i = 0; i < m_AttributeIndices.length; i++) { // non-numeric attribute? if (m_AttributeIndices[i] == NON_NUMERIC) continue; // sort attribute data values = instances.attributeToDoubleArray(m_AttributeIndices[i]); sortedIndices = Utils.sort(values); // determine indices half = sortedIndices.length / 2; quarter = half / 2; if (sortedIndices.length % 2 == 1) { q2 = values[sortedIndices[half]]; } else { q2 = (values[sortedIndices[half]] + values[sortedIndices[half + 1]]) / 2; } if (half % 2 == 1) { q1 = values[sortedIndices[quarter]]; q3 = values[sortedIndices[sortedIndices.length - quarter - 1]]; } else { q1 = (values[sortedIndices[quarter]] + values[sortedIndices[quarter + 1]]) / 2; q3 = (values[sortedIndices[sortedIndices.length - quarter - 1]] + values[sortedIndices[sortedIndices.length - quarter]]) / 2; } // determine thresholds and other values m_Median[i] = q2; m_IQR[i] = q3 - q1; m_UpperExtremeValue[i] = q3 + getExtremeValuesFactor() * m_IQR[i]; m_UpperOutlier[i] = q3 + getOutlierFactor() * m_IQR[i]; m_LowerOutlier[i] = q1 - getOutlierFactor() * m_IQR[i]; m_LowerExtremeValue[i] = q1 - getExtremeValuesFactor() * m_IQR[i]; } } /** * returns whether the instance has an outlier in the specified attribute * or not * * @param inst the instance to test * @param index the attribute index * @return true if the instance is an outlier */ protected boolean isOutlier(Instance inst, int index) { boolean result; double value; value = inst.value(m_AttributeIndices[index]); result = ((m_UpperOutlier[index] < value) && (value <= m_UpperExtremeValue[index])) || ((m_LowerExtremeValue[index] <= value) && (value < m_LowerOutlier[index])); return result; } /** * returns whether the instance is an outlier or not * * @param inst the instance to test * @return true if the instance is an outlier */ protected boolean isOutlier(Instance inst) { boolean result; int i; result = false; for (i = 0; i < m_AttributeIndices.length; i++) { // non-numeric attribute? if (m_AttributeIndices[i] == NON_NUMERIC) continue; result = isOutlier(inst, m_AttributeIndices[i]); if (result) break; } return result; } /** * returns whether the instance has an extreme value in the specified * attribute or not * * @param inst the instance to test * @param index the attribute index * @return true if the instance is an extreme value */ protected boolean isExtremeValue(Instance inst, int index) { boolean result; double value; value = inst.value(m_AttributeIndices[index]); result = (value > m_UpperExtremeValue[index]) || (value < m_LowerExtremeValue[index]); return result; } /** * returns whether the instance is an extreme value or not * * @param inst the instance to test * @return true if the instance is an extreme value */ protected boolean isExtremeValue(Instance inst) { boolean result; int i; result = false; for (i = 0; i < m_AttributeIndices.length; i++) { // non-numeric attribute? if (m_AttributeIndices[i] == NON_NUMERIC) continue; result = isExtremeValue(inst, m_AttributeIndices[i]); if (result) break; } return result; } /** * returns the mulitplier of the IQR the instance is off the median for this * particular attribute. * * @param inst the instance to test * @param index the attribute index * @return the multiplier */ protected double calculateMultiplier(Instance inst, int index) { double result; double value; value = inst.value(m_AttributeIndices[index]); result = (value - m_Median[index]) / m_IQR[index]; return result; } /** * Processes the given data (may change the provided dataset) and returns * the modified version. This method is called in batchFinished(). * This implementation only calls process(Instance) for each instance * in the given dataset. * * @param instances the data to process * @return the modified data * @throws Exception in case the processing goes wrong * @see #batchFinished() */ protected Instances process(Instances instances) throws Exception { Instances result; Instance instOld; Instance instNew; int i; int n; double[] values; int numAttNew; int numAttOld; if (!isFirstBatchDone()) computeThresholds(instances); result = getOutputFormat(); numAttOld = instances.numAttributes(); numAttNew = result.numAttributes(); for (n = 0; n < instances.numInstances(); n++) { instOld = instances.instance(n); values = new double[numAttNew]; System.arraycopy(instOld.toDoubleArray(), 0, values, 0, numAttOld); // generate new instance instNew = new Instance(1.0, values); instNew.setDataset(result); // per attribute? if (!getDetectionPerAttribute()) { // outlier? if (isOutlier(instOld)) instNew.setValue(m_OutlierAttributePosition[0], 1); // extreme value? if (isExtremeValue(instOld)) { instNew.setValue(m_OutlierAttributePosition[0] + 1, 1); // tag extreme values also as outliers? if (getExtremeValuesAsOutliers()) instNew.setValue(m_OutlierAttributePosition[0], 1); } } else { for (i = 0; i < m_AttributeIndices.length; i++) { // non-numeric attribute? if (m_AttributeIndices[i] == NON_NUMERIC) continue; // outlier? if (isOutlier(instOld, m_AttributeIndices[i])) instNew.setValue(m_OutlierAttributePosition[i], 1); // extreme value? if (isExtremeValue(instOld, m_AttributeIndices[i])) { instNew.setValue(m_OutlierAttributePosition[i] + 1, 1); // tag extreme values also as outliers? if (getExtremeValuesAsOutliers()) instNew.setValue(m_OutlierAttributePosition[i], 1); } // add multiplier? if (getOutputOffsetMultiplier()) instNew.setValue( m_OutlierAttributePosition[i] + 2, calculateMultiplier(instOld, m_AttributeIndices[i])); } } // copy possible strings, relational values... copyValues(instNew, false, instOld.dataset(), getOutputFormat()); // add to output result.add(instNew); } return result; } /** * Main method for testing this class. * * @param args should contain arguments to the filter: use -h for help */ public static void main(String[] args) { runFilter(new InterquartileRange(), args); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -