// entropy.java
package shared;
import java.lang.*;
import java.util.*;
/** This class handles all of the Entropy based calculations. All logs
* are base 2 (other bases just scale the entropy). The reason for using
* log_bin is simply that the examples in Quinlan's C4.5 book use it, and
* those examples were used for testing. It's also a measure of the
* number of "bits" in information theory, so it's appealing in that sense
* too. The computation is based on: <BR>
* 1. "Boolean Feature Discovery in Empirical Learning" / Pagallo and
* Haussler. <BR>
* 2. "A First Course in Probability, 2nd Edition" / Ross, pages 354-359.
* <BR>
* 3. "C4.5: Programs for Machine Learning" / Ross Quinlan, pages 18-24.
* <BR>
* @author James Louis 5/21/2001 Java implementations.
* @author Cliff Brunk 5/25/96 Added weight to find_best_threshold and
* get_split_score.
* @author Eric Eros 12/05/96 Revised find_best_threshold to use
* SplitAttr, and to use RealAndLabelColumn and SplitAttr.
* @author Chia-Hsin Li Revised find_best_threshold to allow multiple
* InstanceLists.
* @author Brian, Ronny Kohavi 9/04/93 Initial revision
*/
public class Entropy {
/** Base-2 logarithm of e, i.e. 1/ln(2). Multiplying a natural log by
 * this factor converts it to log base 2 (see {@link #log_bin(double)}).
 * Declared {@code final}: this is a mathematical constant and must not
 * be reassigned.
 */
public static final double M_LOG2E = 1.4426950408889634074;
/** Derives a reasonable lower bound for minSplit from the total weight
 * of the instance list at the start of training. The bound grows
 * linearly in the binary log of the total weight.
 * @param totalWeight Combined weight of the instances in this
 * InstanceList partition.
 * @return A lower bound for minSplit; never less than 1.
 */
public static double auto_lbound_min_split(double totalWeight) {
    // Empirically chosen linear coefficients on the log-weight scale.
    final double slope = 1.851;
    final double intercept = 8.447;
    if (totalWeight < 1)
        return 1;
    double bound = slope * log_bin(totalWeight) - intercept;
    return Math.max(1.0, bound);
}
/** Computes the base-2 logarithm of a value.
 * @param value The value whose binary logarithm is requested.
 * @return The log base two of {@code value}.
 */
public static double log_bin(double value) {
    // ln(x) * log2(e) == log2(x); the multiply form avoids a division
    // (equivalent to Math.log(value) / Math.log(2)).
    return Math.log(value) * M_LOG2E;
}
/** Compute the best threshold for a RealAndLabelColumn. Initially, all
 * instances are at the right hand side of splitDist and
 * splitAndLabelDist. To find the best threshold, we shift the instances
 * from right to left and calculate their conditional entropy. Then we
 * pick the one with minimum conditional entropy and return
 * bestThreshold and minCondEntropy as results. When two entropies are
 * equal, we prefer the one that splits instances equally into two bins
 * in the hope of making the tree shallower. Tests show minor effect but
 * it does help pima for different leaf counts.
 * @param realAndLabelColumn The column of real values and their associated labels.
 * @param minSplit Minimum instance weight required on each side of a
 * candidate split (the method bails out early when the total known
 * weight is below 2 * minSplit); must be at least 0.
 * @param split Receives the scores/distribution of the chosen split; reset when no split qualifies.
 * @param bestThreshold Out: the real value that is the best threshold over the supplied real values.
 * @param bestSplitIndex Out: index of the best split in the supplied RealAndLabelColumn.
 * @param numDistinctValues Out: the number of non-equal values.
 * @param smoothInst The index of the instance to be smoothed toward.
 * @param smoothFactor The amount of normalization done towards the specified instance index.
 */
public static void find_best_threshold(RealAndLabelColumn realAndLabelColumn,
                                       double minSplit, SplitScore split,
                                       DoubleRef bestThreshold, IntRef bestSplitIndex,
                                       IntRef numDistinctValues, int smoothInst,
                                       double smoothFactor) {
    if (minSplit < 0)
        Error.fatalErr("find_best_threshold: minSplit(" +minSplit+ ") must be at least 0");
    // Initialize the out-parameters to "no split found".
    bestThreshold.value = Globals.UNDEFINED_REAL;
    bestSplitIndex.value = 0;
    numDistinctValues.value = 0;
    double totalKnownWeight = realAndLabelColumn.known_weight();
    // Too little known weight to put minSplit on each side: no split possible.
    if (realAndLabelColumn.known_weight()<(2 * minSplit)) {
        split.reset();
        return;
    }
    Vector scores = new Vector();
    // Score every candidate threshold; also returns a snapshot of the
    // split-and-label distribution taken before the scoring sweep.
    double[][] sAndLDist = get_score_array(realAndLabelColumn,
                                           split, minSplit,
                                           scores, numDistinctValues,
                                           smoothInst, smoothFactor);
    if (sAndLDist == null)
        return;  // fewer than two labels represented; split already reset
    double bestScore = find_best_score(totalKnownWeight,
                                       scores, minSplit, bestSplitIndex);
    // UNDEFINED_REAL acts as the "no qualifying split" sentinel here;
    // any real score above it means a usable split was found.
    if (bestScore > Globals.UNDEFINED_REAL) {
        // Threshold is the midpoint between the two values straddling
        // the best split index.
        bestThreshold.value =
            ((double)(realAndLabelColumn.index(bestSplitIndex.value - 1).value)+
            (double)(realAndLabelColumn.index(bestSplitIndex.value).value))/ 2;
        // Replay the sweep up to the best index: move each instance's
        // weight from the right node to the left node of the snapshot.
        for(int k = 1 ; k <= bestSplitIndex.value ; k++) {
            AttrLabelPair alp = realAndLabelColumn.index(k - 1);
            int lab = alp.label;
            double weight = alp.weight;
            sAndLDist[lab][Globals.RIGHT_NODE] -= weight;
            sAndLDist[lab][Globals.LEFT_NODE] += weight;
            // Snap tiny floating-point residue to exactly zero.
            if (MLJ.approx_equal(sAndLDist[lab][Globals.RIGHT_NODE],0))
                sAndLDist[lab][Globals.RIGHT_NODE]= 0;
        }
        // Hand the distribution to split; presumably set_split_and_label_dist
        // takes ownership and returns null -- the ASSERT below relies on that.
        sAndLDist = split.set_split_and_label_dist(sAndLDist);
    }
    else {
        split.reset();
        sAndLDist = null;
    }
    MLJ.ASSERT(sAndLDist == null, "Entropy::find_best_threshold: sAndLDist == null");
}
/** Builds the distribution arrays for the given sorted
 * RealAndLabelColumn and scores every candidate threshold. The function
 * delegates the sweep to fill_scores, which shifts weight from the
 * right to the left distribution as it walks the column; whenever the
 * real value changes, the potential score of a split at that value is
 * appended to outScores together with its index and left weight.
 * @param realAndLabelColumn The supplied column of real values and their associated label values.
 * @param split The SplitScore used for scoring; reset (and null returned) when
 * fewer than two labels are represented.
 * @param minSplit Minimum instance weight required on each side of a split
 * for it to count toward numDistinctValues.
 * @param outScores Out: filled with a ThresholdInfo per candidate threshold
 * (smoothed when smoothInst != 0).
 * @param numDistinctValues Out: the number of non-equal values.
 * @param smoothInst The index of the instance to be smoothed towards; 0 disables smoothing.
 * @param smoothFactor The amount of normalization done towards the specified instance index.
 * @return A snapshot of the split-and-label distribution taken before the
 * scoring sweep, or null when no split is possible.
 */
public static double[][] get_score_array(RealAndLabelColumn realAndLabelColumn,
                                         SplitScore split, double minSplit,
                                         Vector outScores,
                                         IntRef numDistinctValues, int smoothInst,
                                         double smoothFactor) {
    int numLabels = realAndLabelColumn.label_count();
    // One row per label value plus one extra row -- presumably for the
    // unknown label (TODO confirm); three columns per row (node slots).
    double[][] splitAndLabelDist = new double[numLabels+1][3];
    Matrix.initialize(0,splitAndLabelDist);
    double[] splitDist = new double[3];
    realAndLabelColumn.init_dist_counts(splitAndLabelDist, splitDist);
    // Marginal distribution over labels.
    double[] labelDist = new double[splitAndLabelDist.length];
    Matrix.sum_rows(labelDist,splitAndLabelDist);
    int numUsedLabels = 0;
    for(int labelNum = 0 ;
        labelNum < splitAndLabelDist.length ;
        labelNum++)
        if (labelDist[labelNum] > 0)
            ++numUsedLabels;
    // With fewer than two represented labels there is nothing to split on.
    if (numUsedLabels < 2) {
        split.reset();
        return null;
    }
    double theEntropy = entropy(labelDist);
    // Snapshot the distribution NOW: fill_scores mutates splitAndLabelDist
    // during its sweep, and the caller needs the pre-sweep state.
    double[][] sAndLDist = new double[splitAndLabelDist.length][];
    for(int i = 0 ; i < splitAndLabelDist.length ; i++)
        sAndLDist[i] = splitAndLabelDist[i].clone();
    numDistinctValues.value = 0;
    fill_scores(realAndLabelColumn, split, minSplit, theEntropy,
                outScores, numDistinctValues, splitAndLabelDist, splitDist);
    // sAndLDist is always non-null here, so only smoothInst gates smoothing.
    // (The original also tested sAndLDist != null, which was dead code.)
    if (smoothInst != 0)
        smooth_scores(outScores, smoothInst, smoothFactor);
    return sAndLDist;
}
/** Smooths each threshold score toward its neighbors: every score is
 * replaced by a weighted average of itself and up to smoothInst scores
 * on each side, with weight smoothFactor^d for a neighbor at distance d
 * (and weight 1 for the score itself), normalized by the sum of weights.
 * Writebacks are delayed through the oldScores buffer so every window
 * is computed from the ORIGINAL scores, not already-smoothed ones.
 * @param scores The Vector of ThresholdInfo whose scores are smoothed in place.
 * @param smoothInst Half-width of the smoothing window (number of
 * neighbors considered on each side).
 * @param smoothFactor Per-step decay of a neighbor's weight.
 */
public static void smooth_scores(Vector scores, int smoothInst, double smoothFactor) {
    // Delay line holding the last smoothInst+1 smoothed values; a value is
    // written back to scores only once it can no longer affect any window.
    double[] oldScores = new double[smoothInst+1];
    MLJArray.init_values(-1,oldScores);
    int numThresholds = scores.size();
    for(int i = 0 ; i < numThresholds ; i++) {
        // Weight 1 for the center score itself.
        double summedScores =((ThresholdInfo)scores.get(i)).score;
        double summedFactor = 1;
        double currFactor = smoothFactor;
        // Accumulate left neighbors: weight smoothFactor^d at distance d.
        for(int left = 1 ; i - left >= 0 && left <= smoothInst ; left++) {
            summedScores +=(currFactor *((ThresholdInfo)scores.get(i - left)).score);
            summedFactor += currFactor;
            currFactor *= smoothFactor;
        }
        currFactor = smoothFactor;
        // Accumulate right neighbors symmetrically.
        for(int right = 1 ; i + right < numThresholds && right <= smoothInst ; right++) {
            summedScores +=(currFactor *((ThresholdInfo)scores.get(i + right)).score);
            summedFactor += currFactor;
            currFactor *= smoothFactor;
        }
        // Index i - smoothInst - 1 is now outside every remaining window,
        // so its (oldest buffered) smoothed value can be written back.
        if (i > smoothInst) {
            ((ThresholdInfo)scores.get(i - smoothInst - 1)).score = oldScores[smoothInst];
        }
        // Shift the delay line up by one to make room at slot 0.
        for(int j = smoothInst - 1 ; j >= 0 ; j--) {
            oldScores[j+1] = oldScores[j];
        }
        if (MLJ.approx_equal(summedFactor, 0.0))
            Error.fatalErr("smooth_scores: divisor (summedFactor) too close to zero");
        oldScores[0] = summedScores/ summedFactor;
    }
    // Flush the values still buffered in the delay line (the last
    // smoothInst+1 thresholds), guarding against short score lists.
    for(int j = 0 ; j < smoothInst+1 ; j++) {
        if (numThresholds - smoothInst - 1 + j >= 0)
            ((ThresholdInfo)scores.get(numThresholds - smoothInst - 1 + j)).score = oldScores[smoothInst - j];
    }
}
/** Fills the Vector of scores with the scores for all the thresholds.
 * Walks the sorted column, shifting each instance's weight from the
 * right to the left distribution, and appends a ThresholdInfo whenever
 * the attribute value changes by more than the stored-real epsilon.
 * Note that numDistinctValues counts only splits with at least minSplit
 * weight on both sides, but a score entry is appended for EVERY distinct
 * boundary -- presumably the caller (find_best_score) filters by weight;
 * verify against that routine.
 * @param realAndLabelColumn The column of real values and their associated labels over which thresholds are
 * created; assumed sorted ascending by value (the deltaAttr assert relies on this).
 * @param split The SplitScore used for scoring this threshold split.
 * @param minSplit The minimum per-side weight for a split to count as distinct.
 * @param theEntropy The entropy of the full label distribution, passed through to split.score.
 * @param outScores The Vector of ThresholdInfo to be filled.
 * @param numDistinctValues Out: the number of distinct, minSplit-eligible split points.
 * @param splitAndLabelDist Distributions over each split and label pair; MUTATED during the sweep.
 * @param splitDist The distribution over splits; MUTATED during the sweep.
 */
public static void fill_scores(RealAndLabelColumn realAndLabelColumn,
                               SplitScore split, double minSplit, double theEntropy,
                               Vector outScores, IntRef numDistinctValues, double[][] splitAndLabelDist,
                               double[] splitDist) {
    double totalWeight = realAndLabelColumn.total_weight();
    int numKnownInstances = realAndLabelColumn.known_count();
    // Last attribute value at which a threshold was considered.
    float lastSeen = realAndLabelColumn.index(0).value;
    int threshIndex = 0;
    // We start at 1 because we're comparing a split just before this
    // element
    for(int k = 1 ; k < numKnownInstances ; k++) {
        AttrLabelPair alp = realAndLabelColumn.index(k - 1);
        float value = alp.value;
        int lab = alp.label;
        double weight = alp.weight;
        float nextValue = realAndLabelColumn.index(k).value;
        // Move instance k-1 from the right side of the split to the left.
        splitDist[Globals.RIGHT_NODE] -= weight;
        splitDist[Globals.LEFT_NODE] += weight;
        // Snap tiny floating-point residue to exactly zero.
        if (MLJ.approx_equal((float)splitDist[Globals.RIGHT_NODE], 0.0))
            splitDist[Globals.RIGHT_NODE] = 0;
        splitAndLabelDist[lab][Globals.RIGHT_NODE] -= weight;
        splitAndLabelDist[lab][Globals.LEFT_NODE] += weight;
        if (MLJ.approx_equal((float)splitAndLabelDist[lab][Globals.RIGHT_NODE], 0.0))
            splitAndLabelDist[lab][Globals.RIGHT_NODE] = 0;
        // Gap since the last considered value; non-negative because the
        // column is sorted.
        double deltaAttr =(double)nextValue -(double)(lastSeen);
        MLJ.ASSERT(deltaAttr >= 0, "Entropy::fill_scores: deltaAttr >= 0 : deltaAttr == " +deltaAttr);
        // Skip thresholds between values too close to distinguish reliably.
        if (deltaAttr < 2 * MLJ.storedRealEpsilon)
            continue;
        lastSeen = nextValue;
        // The adjacent pair itself must also differ; note lastSeen has
        // already advanced when this skips -- intentional, presumably, so
        // the gap test restarts from here (TODO confirm).
        if (nextValue - value < MLJ.storedRealEpsilon)
            continue;
        // Only splits with minSplit weight on both sides count as distinct.
        if (splitDist[Globals.RIGHT_NODE] >= minSplit && splitDist[Globals.LEFT_NODE] >= minSplit)
            numDistinctValues.value = numDistinctValues.value + 1;
        // Record this candidate: its column index, left-side weight, and score.
        ThresholdInfo outScore = new ThresholdInfo();
        outScores.add(threshIndex++,outScore);
        outScore.index = k;
        outScore.weight = splitDist[Globals.LEFT_NODE];
        outScore.score = split.score(splitAndLabelDist, splitDist, null, theEntropy, totalWeight);
    }
}
/** Compute the entropy H(Y) for label Y. Giving us an InstanceList forces counters.
* If you don't give us the total instances, we count (slightly less efficient).
* Entropy without the sum acts a bit differently by allowing nodes with 0
* instances and returning entropy -1. The reason is that the caller shouldn't be
* required to compute the sum just for this.
*
 */