📄 entropy.java

📁 本代码是用java语言编写的基于决策树c4.5算法的数据挖掘程序
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
     * @param labelCount	The count of each label found in the data.
     * @return The entropy for the supplied label counts.
     */
    public static double entropy(double[] labelCount) {
        double sum = MLJArray.sum(labelCount);
        if (MLJ.approx_equal(sum, 0.0))
            return Globals.UNDEFINED_REAL;
        return entropy(labelCount, sum);
    }
    
    /** Compute the entropy H(Y) for label Y. Giving us an InstanceList forces counters.
     * If you don't give us the total instances, we count (slightly less efficient).
     * Entropy without the sum acts a bit differently by allowing nodes with 0
     * instances and returning entropy -1. The reason is that the caller shouldn't be
     * required to compute the sum just for this.
     *
     * @param labelCount	The count of each label found in the data.
     * @param totalInstanceWeight	The total weight for all of the instances.
     * @return The entropy for the supplied label counts.
     */
    public static double entropy(double[] labelCount,double totalInstanceWeight) {
        MLJ.verify_strictly_greater(totalInstanceWeight, 0, "entropy: totalInstanceWeight is too small");
        if (Globals.DBG) {
            double sum = MLJArray.sum(labelCount);
            MLJ.verify_approx_equal(sum, totalInstanceWeight,
            "entropy(,,): sum and totalWeight don\'t "
            + "match");
        }
        double H = 0;
        for(int y = 0 ; y < labelCount.length ; y++)
            if (labelCount[y] != 0) {
                // We know that totalInstanceWeight > 0 by the check
                //   on verify_strictly_greater above.
                double prob_y = labelCount[y]/  totalInstanceWeight;
                H -= prob_y * log_bin(prob_y);
            }
        return H;
    }
    
    /** Compute the entropy H(Y) for label Y. Giving us an InstanceList forces counters.
     * If you don't give us the total instances, we count (slightly less efficient).
     * Entropy without the sum acts a bit differently by allowing nodes with 0
     * instances and returning entropy -1. The reason is that the caller shouldn't be
     * required to compute the sum just for this.
     *
     * @param instList The supplied instances for whihc entropy will be calculated.
     * @return The entropy value.
     */
    public static double entropy(InstanceList instList) {
        return entropy(instList.counters().label_counts(),instList.total_weight());
    }
    
    
    /** Search a score array to find the best score/index.
     * @param totalKnownWeight Total weight of all Instances for which a value is known.
     * @param scores The scores of the available Splits.
     * @param minSplit The minimum value for a split.
     * @param bestSplitIndex The index of the best split.
     * @return The score of the best split.
     */
    public static double find_best_score(double totalKnownWeight,
    Vector scores, double minSplit,
    IntRef bestSplitIndex) {
        double minimalDistanceFromCenter = totalKnownWeight/  2;
        double bestScore = Globals.UNDEFINED_REAL;
        for(int k = 0 ; k < scores.size(); k++) {
            
            if (totalKnownWeight -((ThresholdInfo)scores.get(k)).weight < minSplit)
                break;
            if (((ThresholdInfo)scores.get(k)).weight < minSplit)
                continue;
            double currentScore =((ThresholdInfo)scores.get(k)).score;
            double currentDistance = Math.abs(totalKnownWeight / 2 -((ThresholdInfo)scores.get(k)).weight);
            
            if (currentScore > bestScore + MLJ.realEpsilon ||(MLJ.approx_equal(bestScore, currentScore)&& currentDistance < minimalDistanceFromCenter)) {
                bestSplitIndex.value =((ThresholdInfo)scores.get(k)).index;
                minimalDistanceFromCenter = currentDistance;
                bestScore = currentScore;
                LogOptions.GLOBLOG(6, "index " +bestSplitIndex+ ", entropy " +bestScore+ '\n');
            }
        }
        return bestScore;
    }
    
    /** Build the splitAndLabelDist and splitDist arrays needed for calculating
     * conditional entropy. All of the splitAndLabelDist arrays of the Instance Lists
     * are concatenated. The unaccounted instances allow the list of nodes to be
     * partial, i.e., not to contain all instances. The split will be created so that
     * the unaccounted instances are in an extra split with the same label, so that the
     * entropy will be decreased correctly as if they were in a pure node.
     * @param currentLevel	The list of instances in the current partition
     * for which a split is being determined.
     * @param attrNum The number of the attribute over which a split and label distribution is to be
     * built.
     * @return Distributions over each split and label pair.
     */
    public static double[][] build_split_and_label_dist(InstanceList[] currentLevel, int attrNum) {
        return build_split_and_label_dist(currentLevel,attrNum,0);
    }
    
    /** Build the splitAndLabelDist and splitDist arrays needed for calculating
     * conditional entropy. All of the splitAndLabelDist arrays of the Instance Lists
     * are concatenated. The unaccounted instances allow the list of nodes to be
     * partial, i.e., not to contain all instances. The split will be created so that
     * the unaccounted instances are in an extra split with the same label, so that the
     * entropy will be decreased correctly as if they were in a pure node.
     * @param currentLevel The list of instances in the current partition for which a split is being
     * determined.
     * @param attrNum The number of the attribute over which a split and label distribution is to be
     * built.
     * @param unaccountedWeight	The weight for instances that are not
     * accounted for in this partition.
     * @return Distributions over each split and label pair.
     */
    public static double[][] build_split_and_label_dist(InstanceList[] currentLevel,
    int attrNum,double unaccountedWeight) {
        MLJ.ASSERT(currentLevel[0] != null, "Entropy::build_split_and_label_dist: currentlevel == null.");
        Schema schema = currentLevel[0].get_schema();
        int numInstLists = currentLevel.length;
        int numLabelValues = schema.num_label_values();
        int numAttrValues = schema.num_attr_values(attrNum);
        MLJ.ASSERT(numInstLists > 0, "Entropy::build_split_and_label_dist: numInstLists <= 0.");
        MLJ.ASSERT(numLabelValues > 0, "Entropy::build_split_and_label_dist: numLabelValues <= 0.");
        MLJ.ASSERT(numAttrValues > 0, "Entropy::build_split_and_label_dist: numAttrValues <= 0.");
        MLJ.ASSERT(unaccountedWeight >= 0, "Entropy::build_split_and_label_dist: unaccountedWeight < 0.");
        double[][] splitAndLabelDist = new double[numLabelValues][numInstLists*(numAttrValues+1)+((MLJ.approx_equal(unaccountedWeight, 0.0))?0:1)];
        int countSplitAndLabelDistCol = 0;
        for(int instListCount = 0 ; instListCount < numInstLists ; instListCount++) {
            MLJ.ASSERT(currentLevel[instListCount] != null, "Entropy::build_split_and_label_dist: currentLevel[instListCount] == null.");
            for(int attrValCount = 0 ;
            attrValCount < numAttrValues +1 ; //add 1 to account for unknown being moved from -1 to 0 -JL
            attrValCount++, countSplitAndLabelDistCol++) {
                for(int labelValCount = 0 ; labelValCount < numLabelValues ;
                labelValCount++) {
                    BagCounters bc = currentLevel[instListCount].counters();
                    splitAndLabelDist[labelValCount][countSplitAndLabelDistCol]=
                    bc.value_counts()[attrNum][labelValCount+1][attrValCount];
                    //labelValCount needs 1 added because the unknown label has
                    //has been moved from -1 to 0. The first nominal label is now
                    //at 1, not 0, in the BagCounters class. -JL
                }
            }
        }
        // Assign the unaccounted to the last split with all counts
        //   for one label (so it looks pure).
        if (unaccountedWeight > 0) {
            MLJ.ASSERT(countSplitAndLabelDistCol == splitAndLabelDist[0][splitAndLabelDist.length], "Entropy::build_split_and_label_dist: countSplitAndLabelDistCol != splitAndLabelDist[0][splitAndLabelDist.length]");
            for(int labelValCount = 0 ; labelValCount < numLabelValues ; labelValCount++)
                splitAndLabelDist[labelValCount][countSplitAndLabelDistCol]=(labelValCount != 0)? 0 : unaccountedWeight;
        }
        return splitAndLabelDist;
    }
    
    /** Returns the minSplit which is used in find_best_threshold(), given
     * lowerBoundMinSplit, upperBoundMinSplit, and minSplitPercent. This
     * function is called by inducers.
     * @return The minSplit which is used in find_best_threshold().
     * @param upperBoundMinSplit Upper bound for the minimum split value.
     * @param totalWeight The total weight of all instances in the list of instances for which a split
     * is requested.
     * @param numTotalCategories Number of possible values an instance may be categorized as.
     * @param lowerBoundMinSplit Lower bound for the minimum split value.
     * @param minSplitPercent The percentage of total weight per category that represents the minimum value
     * for a split.
     */
    public static double min_split(double totalWeight,
    int numTotalCategories, double lowerBoundMinSplit,
    double upperBoundMinSplit, double minSplitPercent) {
        if (numTotalCategories <= 0)
            Error.fatalErr("min_split: divisor (numTotalCategories) is zero or neg");
        if ((Globals.DBG)&&(lowerBoundMinSplit <= 0))
            Error.fatalErr("min_split:  lowerBoundMinSplit ("
            + lowerBoundMinSplit + ") must be at least one");
        double minSplit = minSplitPercent * totalWeight/  numTotalCategories;
        if (minSplit > upperBoundMinSplit)
            minSplit = upperBoundMinSplit;
        if (minSplit <= lowerBoundMinSplit)
            minSplit = lowerBoundMinSplit;
        MLJ.ASSERT(minSplit > 0, "Entropy::build_split_and_label_dist: minSplit <= 0");
        return minSplit;
    }
    
    /** Returns the minSplit which is used in find_best_threshold(), given
     * lowerBoundMinSplit, upperBoundMinSplit, and minSplitPercent. This
     * function is called by inducers.
     * @param instList The list of instances over which a split is requested.
     * @param upperBoundMinSplit Upper bound for the minimum split value.
     * @param lowerBoundMinSplit Lower bound for the minimum split value.
     * @param minSplitPercent The percentage of total weight per category that represents the minimum value
     * for a split.
     * @param ignoreNumCat Indicator that the number of values that an instance may be classified as should
     * be ignored for this split computation.
     * @return The minSplit which is used in find_best_threshold().
     */
    public static double min_split(InstanceList instList,
    double lowerBoundMinSplit, double upperBoundMinSplit,
    double minSplitPercent, boolean ignoreNumCat) {
        if (ignoreNumCat)
            return min_split(instList.total_weight(), 1, lowerBoundMinSplit, upperBoundMinSplit, minSplitPercent);
        else return min_split(instList.total_weight(), instList.num_categories(), lowerBoundMinSplit, upperBoundMinSplit, minSplitPercent);
    }
    
    /** Returns the minSplit which is used in find_best_threshold(), given
     * lowerBoundMinSplit, upperBoundMinSplit, and minSplitPercent. This
     * function is called by inducers.
     * @param instList The list of instances over which a split is requested.
     * @param upperBoundMinSplit Upper bound for the minimum split value.
     * @param lowerBoundMinSplit Lower bound for the minimum split value.
     * @param minSplitPercent The percentage of total weight per category that represents the minimum value
     * for a split.
     * @return The minSplit which is used in find_best_threshold().
     */
    public static double min_split(InstanceList instList,
    double lowerBoundMinSplit, double upperBoundMinSplit,
    double minSplitPercent) {
        return min_split(instList,lowerBoundMinSplit,upperBoundMinSplit,minSplitPercent,false);
    }
    
    
    /** Computes conditional entropy of the label given attribute X. From Ross,
     * Conditional entropy is defined as:                                          <BR>
     *             H(Y|X) = sum_x H(Y|X=x)*P(X=x).                            <BR>
     *                    = sum_x (-sum_y p(Y=y|X=x)log p(Y=y|X=x)) * P(X=x)  <BR>
     *             now derive Pagallo & Haussler's formula                    <BR>
     *                    = -sum_{x,y} p(Y=y, X=x) log p(Y=y|X=x)             <BR>
     *             Here we estimate p(Y=y, X=x) by counting, but if we
     *               have priors on the probabilities of the labels, then     <BR>
     *               p(x,y) = p(x|y)*p(y) = count(x,y)/s(y)* prior(y)         <BR>
     *               and p(x) = sum_y prior(y) count(x,y)/s(y).               <BR>
     *
     *             By counting we get the following:                          <BR>
     *             -sum_{x,y} num(Y=y,X=x)/num-rec * log num(Y=y,X=x)/num(X=x)
     *
     * @param splitAndLabelDist Distributions over each split and label pair.
     * @param splitDist The distribution over splits.
     * @param totalWeight The total weight distributed.
     * @return The conditional entropy.
     */
    public static double cond_entropy(double[][] splitAndLabelDist,
    double[] splitDist, double totalWeight) {//AttrAndOrigin class function
        //   DBG(MLC::verify_strictly_greater(totalWeight, 0,
        //				    "cond_entropy:: totalWeight must be "
        //				    "positive");
        
        double sum = MLJArray.sum(splitDist);
        MLJ.verify_approx_equal(totalWeight, sum,
        "cond_entropy:: totalWeight does not match splitDist");
        sum = Matrix.total_sum(splitAndLabelDist);
        MLJ.verify_approx_equal(totalWeight, sum,
        "cond_entropy:: totalWeight does not match splitAndLabelDist");
        //       DBGSLOW(
        //	       Array<Real> sDist(UNKNOWN_CATEGORY_VAL,
        //				splitAndLabelDist.num_cols());
        //	       splitAndLabelDist.sum_cols(sDist);
        //	       ASSERT(MLJ.approx_equal(sDist, splitDist));
        //	      )
        //       );
        DoubleRef H = new DoubleRef(0);
        for (int x = 0;
        x < splitAndLabelDist[0].length; x++) {
            double num_x = splitDist[x];
            if (MLJ.approx_equal(num_x, 0.0))
                continue;
            for (int y = 0;
            y < splitAndLabelDist.length; y++) {
                double num_xy = splitAndLabelDist[y][x];
                if (MLJ.approx_equal(num_xy, 0.0))
                    continue;
                H.value -= num_xy * log_bin(num_xy/num_x);
            }
        }
        H.value /= totalWeight; // We know this won't be division by zero.
        MLJ.clamp_above(H, 0, "cond_entropy: negative entropy not allowed");
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -