📄 discretizefilter.java
字号:
return "Make resulting attributes binary."; } /** * Gets whether binary attributes should be made for discretized ones. * * @return true if attributes will be binarized */ public boolean getMakeBinary() { return m_MakeBinary; } /** * Sets whether binary attributes should be made for discretized ones. * * @param makeBinary if binary attributes are to be made */ public void setMakeBinary(boolean makeBinary) { m_MakeBinary = makeBinary; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String useMDLTipText() { return "Use class-based discretization. If set to false, does" + " not require a class attribute, and uses a fixed number" + " of bins (according to bins setting)."; } /** * Gets whether MDL will be used as the discretisation method. * * @return true if so, false if fixed bins should be used. */ public boolean getUseMDL() { return m_UseMDL; } /** * Sets whether MDL will be used as the discretisation method. * * @param useMDL true if MDL should be used, false if fixed bins should * be used. */ public void setUseMDL(boolean useMDL) { m_UseMDL = useMDL; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String useKononenkoTipText() { return "Use Kononenko's MDL criterion. If set to false" + " uses the Fayyad & Irani criterion."; } /** * Gets whether Kononenko's MDL criterion is to be used. * * @return true if Kononenko's criterion will be used. */ public boolean getUseKononenko() { return m_UseKononenko; } /** * Sets whether Kononenko's MDL criterion is to be used. * * @param useKon true if Kononenko's one is to be used */ public void setUseKononenko(boolean useKon) { m_UseMDL = true; m_UseKononenko = useKon; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String useBetterEncodingTipText() { return "Uses a different split point encoding. Who says it's better?" + " (Eibe fix this)."; } /** * Gets whether better encoding is to be used for MDL. * * @return true if the better MDL encoding will be used */ public boolean getUseBetterEncoding() { return m_UseBetterEncoding; } /** * Sets whether better encoding is to be used for MDL. * * @param useBetterEncoding true if better encoding to be used. */ public void setUseBetterEncoding(boolean useBetterEncoding) { m_UseMDL = true; m_UseBetterEncoding = useBetterEncoding; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String binsTipText() { return "Number of bins for class-blind discretisation. This" + " setting is ignored if MDL-based discretisation is used."; } /** * Gets the number of bins numeric attributes will be divided into * * @return the number of bins. */ public int getBins() { return m_NumBins; } /** * Sets the number of bins to divide each selected numeric attribute into * * @param numBins the number of bins */ public void setBins(int numBins) { m_UseMDL = false; m_NumBins = numBins; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String invertSelectionTipText() { return "Set attribute selection mode. If false, only selected" + " (numeric) attributes in the range will be discretized; if" + " true, only non-selected attributes will be discretized."; } /** * Gets whether the supplied columns are to be removed or kept * * @return true if the supplied columns will be kept */ public boolean getInvertSelection() { return m_DiscretizeCols.getInvert(); } /** * Sets whether selected columns should be removed or kept. If true the * selected columns are kept and unselected columns are deleted. If false * selected columns are deleted and unselected columns are kept. * * @param invert the new invert setting */ public void setInvertSelection(boolean invert) { m_DiscretizeCols.setInvert(invert); } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String attributeIndicesTipText() { return "Specify range of attributes to act on." + " This is a comma separated list of attribute indices, with" + " \"first\" and \"last\" valid values. Specify an inclusive" + " range with \"-\". E.g: \"first-3,5,6-10,last\"."; } /** * Gets the current range selection * * @return a string containing a comma separated list of ranges */ public String getAttributeIndices() { return m_DiscretizeCols.getRanges(); } /** * Sets which attributes are to be Discretized (only numeric * attributes among the selection will be Discretized). * * @param rangeList a string representing the list of attributes. Since * the string will typically come from a user, attributes are indexed from * 1. <br> * eg: first-3,5,6-last * @exception IllegalArgumentException if an invalid range list is supplied */ public void setAttributeIndices(String rangeList) { m_DiscretizeCols.setRanges(rangeList); } /** * Sets which attributes are to be Discretized (only numeric * attributes among the selection will be Discretized). * * @param attributes an array containing indexes of attributes to Discretize. * Since the array will typically come from a program, attributes are indexed * from 0. * @exception IllegalArgumentException if an invalid set of ranges * is supplied */ public void setAttributeIndicesArray(int [] attributes) { setAttributeIndices(Range.indicesToRangeList(attributes)); } /** * Gets the cut points for an attribute * * @param the index (from 0) of the attribute to get the cut points of * @return an array containing the cutpoints (or null if the * attribute requested isn't being Discretized */ public double [] getCutPoints(int attributeIndex) { if (m_CutPoints == null) { return null; } return m_CutPoints[attributeIndex]; } /** Generate the cutpoints for each attribute */ protected void calculateCutPoints() { Instances copy = null; m_CutPoints = new double [getInputFormat().numAttributes()] []; for(int i = getInputFormat().numAttributes() - 1; i >= 0; i--) { if ((m_DiscretizeCols.isInRange(i)) && (getInputFormat().attribute(i).isNumeric())) { if (m_UseMDL) { // Use copy to preserve order if (copy == null) { copy = new Instances(getInputFormat()); } calculateCutPointsByMDL(i, copy); } else { if (m_FindNumBins) { findNumBins(i); } else { calculateCutPointsByBinning(i); } } } } } /** * Set cutpoints for a single attribute using MDL. * * @param index the index of the attribute to set cutpoints for */ protected void calculateCutPointsByMDL(int index, Instances data) { // Sort instances data.sort(data.attribute(index)); // Find first instances that's missing int firstMissing = data.numInstances(); for (int i = 0; i < data.numInstances(); i++) { if (data.instance(i).isMissing(index)) { firstMissing = i; break; } } m_CutPoints[index] = cutPointsForSubset(data, index, 0, firstMissing); } /** Test using Kononenko's MDL criterion. */ private boolean KononenkosMDL(double[] priorCounts, double[][] bestCounts, double numInstances, int numCutPoints) { double distPrior, instPrior, distAfter = 0, sum, instAfter = 0; double before, after; int numClassesTotal; // Number of classes occuring in the set numClassesTotal = 0; for (int i = 0; i < priorCounts.length; i++) { if (priorCounts[i] > 0) { numClassesTotal++; } } // Encode distribution prior to split distPrior = SpecialFunctions.log2Binomial(numInstances + numClassesTotal - 1, numClassesTotal - 1); // Encode instances prior to split. instPrior = SpecialFunctions.log2Multinomial(numInstances, priorCounts); before = instPrior + distPrior; // Encode distributions and instances after split. for (int i = 0; i < bestCounts.length; i++) { sum = Utils.sum(bestCounts[i]); distAfter += SpecialFunctions.log2Binomial(sum + numClassesTotal - 1, numClassesTotal - 1); instAfter += SpecialFunctions.log2Multinomial(sum, bestCounts[i]); } // Coding cost after split after = Utils.log2(numCutPoints) + distAfter + instAfter; // Check if split is to be accepted return (Utils.gr(before, after)); } /** Test using Fayyad and Irani's MDL criterion. */ private boolean FayyadAndIranisMDL(double[] priorCounts, double[][] bestCounts, double numInstances, int numCutPoints) { double priorEntropy, entropy, gain; double entropyLeft, entropyRight, delta; int numClassesTotal, numClassesRight, numClassesLeft; // Compute entropy before split. priorEntropy = ContingencyTables.entropy(priorCounts); // Compute entropy after split. entropy = ContingencyTables.entropyConditionedOnRows(bestCounts); // Compute information gain. gain = priorEntropy - entropy; // Number of classes occuring in the set numClassesTotal = 0; for (int i = 0; i < priorCounts.length; i++) { if (priorCounts[i] > 0) { numClassesTotal++; } } // Number of classes occuring in the left subset numClassesLeft = 0; for (int i = 0; i < bestCounts[0].length; i++) { if (bestCounts[0][i] > 0) { numClassesLeft++; } } // Number of classes occuring in the right subset numClassesRight = 0; for (int i = 0; i < bestCounts[1].length; i++) { if (bestCounts[1][i] > 0) { numClassesRight++; } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -