// entropy.java
package shared;
import java.lang.*;
import java.util.*;
/** This class handles all of the Entropy based calculations. All logs
* are base 2 (other bases just scale the entropy). The reason for using
* log_bin is simply that the examples in Quinlan's C4.5 book use it, and
* those examples were used for testing. It's also a measure of the
* number of "bits" in information theory, so it's appealing in that sense
* too. The computation is based on: <BR>
* 1. "Boolean Feature Discovery in Empirical Learning" / Pagallo and
* Haussler. <BR>
* 2. "A First Course in Probability, 2nd Edition" / Ross, pages 354-359.
* <BR>
* 3. "C4.5: Programs for Machine Learning" / Ross Quinlan, pages 18-24.
* <BR>
* @author James Louis 5/21/2001 Java implementations.
* @author Cliff Brunk 5/25/96 Added weight to find_best_threshold and
* get_split_score.
* @author Eric Eros 12/05/96 Revised find_best_threshold to use
* SplitAttr, and to use RealAndLabelColumn and SplitAttr.
* @author Chia-Hsin Li Revised find_best_threshold to allow multiple
* InstanceLists.
* @author Brian, Ronny Kohavi 9/04/93 Initial revision
*/
public class Entropy {
/** Base-2 logarithm of e, i.e. 1/ln(2). Multiplying a natural log by
 * this factor converts it to log base 2 (see {@link #log_bin(double)}).
 * Declared {@code final}: this is a mathematical constant and must not
 * be reassigned.
 */
public static final double M_LOG2E = 1.4426950408889634074;
/** Derives a reasonable lower bound for minSplit from the total weight
 * of the instance list at the start of training. The bound grows
 * linearly in the binary log of the total weight.
 * @param totalWeight Combined weight of the instances in this
 * InstanceList partition.
 * @return A lower bound for minSplit; never less than 1.
 */
public static double auto_lbound_min_split(double totalWeight) {
    // Empirically chosen linear coefficients on the log-weight scale.
    final double slope = 1.851;
    final double intercept = 8.447;
    if (totalWeight < 1)
        return 1;
    double bound = slope * log_bin(totalWeight) - intercept;
    return Math.max(1.0, bound);
}
/** Computes the base-2 logarithm of a value.
 * @param value The value whose binary logarithm is requested.
 * @return The log base two of {@code value}.
 */
public static double log_bin(double value) {
    // ln(x) * log2(e) == log2(x); the multiply form avoids a division
    // (equivalent to Math.log(value) / Math.log(2)).
    return Math.log(value) * M_LOG2E;
}
/** Compute the best threshold for a RealAndLabelColumn. Initially, all
 * instances are at the right hand side of splitDist and
 * splitAndLabelDist. To find the best threshold, we shift the instances
 * from right to left and calculate their conditional entropy. Then we
 * pick the one with minimum conditional entropy and return
 * bestThreshold and minCondEntropy as results. When two entropies are
 * equal, we prefer the one that splits instances equally into two bins
 * in the hope of making the tree shallower. Tests show minor effect but
 * it does help pima for different leaf counts.
 * @param realAndLabelColumn The column of real values and their associated labels.
 * @param minSplit Minimum instance weight required on each side of a
 * candidate split (the method bails out early when the total known
 * weight is below 2 * minSplit); must be at least 0.
 * @param split Receives the scores/distribution of the chosen split; reset when no split qualifies.
 * @param bestThreshold Out: the real value that is the best threshold over the supplied real values.
 * @param bestSplitIndex Out: index of the best split in the supplied RealAndLabelColumn.
 * @param numDistinctValues Out: the number of non-equal values.
 * @param smoothInst The index of the instance to be smoothed toward.
 * @param smoothFactor The amount of normalization done towards the specified instance index.
 */
public static void find_best_threshold(RealAndLabelColumn realAndLabelColumn,
                                       double minSplit, SplitScore split,
                                       DoubleRef bestThreshold, IntRef bestSplitIndex,
                                       IntRef numDistinctValues, int smoothInst,
                                       double smoothFactor) {
    if (minSplit < 0)
        Error.fatalErr("find_best_threshold: minSplit(" +minSplit+ ") must be at least 0");
    // Initialize the out-parameters to "no split found".
    bestThreshold.value = Globals.UNDEFINED_REAL;
    bestSplitIndex.value = 0;
    numDistinctValues.value = 0;
    double totalKnownWeight = realAndLabelColumn.known_weight();
    // Too little known weight to put minSplit on each side: no split possible.
    if (realAndLabelColumn.known_weight()<(2 * minSplit)) {
        split.reset();
        return;
    }
    Vector scores = new Vector();
    // Score every candidate threshold; also returns a snapshot of the
    // split-and-label distribution taken before the scoring sweep.
    double[][] sAndLDist = get_score_array(realAndLabelColumn,
                                           split, minSplit,
                                           scores, numDistinctValues,
                                           smoothInst, smoothFactor);
    if (sAndLDist == null)
        return;  // fewer than two labels represented; split already reset
    double bestScore = find_best_score(totalKnownWeight,
                                       scores, minSplit, bestSplitIndex);
    // UNDEFINED_REAL acts as the "no qualifying split" sentinel here;
    // any real score above it means a usable split was found.
    if (bestScore > Globals.UNDEFINED_REAL) {
        // Threshold is the midpoint between the two values straddling
        // the best split index.
        bestThreshold.value =
            ((double)(realAndLabelColumn.index(bestSplitIndex.value - 1).value)+
            (double)(realAndLabelColumn.index(bestSplitIndex.value).value))/ 2;
        // Replay the sweep up to the best index: move each instance's
        // weight from the right node to the left node of the snapshot.
        for(int k = 1 ; k <= bestSplitIndex.value ; k++) {
            AttrLabelPair alp = realAndLabelColumn.index(k - 1);
            int lab = alp.label;
            double weight = alp.weight;
            sAndLDist[lab][Globals.RIGHT_NODE] -= weight;
            sAndLDist[lab][Globals.LEFT_NODE] += weight;
            // Snap tiny floating-point residue to exactly zero.
            if (MLJ.approx_equal(sAndLDist[lab][Globals.RIGHT_NODE],0))
                sAndLDist[lab][Globals.RIGHT_NODE]= 0;
        }
        // Hand the distribution to split; presumably set_split_and_label_dist
        // takes ownership and returns null -- the ASSERT below relies on that.
        sAndLDist = split.set_split_and_label_dist(sAndLDist);
    }
    else {
        split.reset();
        sAndLDist = null;
    }
    MLJ.ASSERT(sAndLDist == null, "Entropy::find_best_threshold: sAndLDist == null");
}
/** Builds the distribution arrays for the given sorted
 * RealAndLabelColumn and scores every candidate threshold. The function
 * delegates the sweep to fill_scores, which shifts weight from the
 * right to the left distribution as it walks the column; whenever the
 * real value changes, the potential score of a split at that value is
 * appended to outScores together with its index and left weight.
 * @param realAndLabelColumn The supplied column of real values and their associated label values.
 * @param split The SplitScore used for scoring; reset (and null returned) when
 * fewer than two labels are represented.
 * @param minSplit Minimum instance weight required on each side of a split
 * for it to count toward numDistinctValues.
 * @param outScores Out: filled with a ThresholdInfo per candidate threshold
 * (smoothed when smoothInst != 0).
 * @param numDistinctValues Out: the number of non-equal values.
 * @param smoothInst The index of the instance to be smoothed towards; 0 disables smoothing.
 * @param smoothFactor The amount of normalization done towards the specified instance index.
 * @return A snapshot of the split-and-label distribution taken before the
 * scoring sweep, or null when no split is possible.
 */
public static double[][] get_score_array(RealAndLabelColumn realAndLabelColumn,
                                         SplitScore split, double minSplit,
                                         Vector outScores,
                                         IntRef numDistinctValues, int smoothInst,
                                         double smoothFactor) {
    int numLabels = realAndLabelColumn.label_count();
    // One row per label value plus one extra row -- presumably for the
    // unknown label (TODO confirm); three columns per row (node slots).
    double[][] splitAndLabelDist = new double[numLabels+1][3];
    Matrix.initialize(0,splitAndLabelDist);
    double[] splitDist = new double[3];
    realAndLabelColumn.init_dist_counts(splitAndLabelDist, splitDist);
    // Marginal distribution over labels.
    double[] labelDist = new double[splitAndLabelDist.length];
    Matrix.sum_rows(labelDist,splitAndLabelDist);
    int numUsedLabels = 0;
    for(int labelNum = 0 ;
        labelNum < splitAndLabelDist.length ;
        labelNum++)
        if (labelDist[labelNum] > 0)
            ++numUsedLabels;
    // With fewer than two represented labels there is nothing to split on.
    if (numUsedLabels < 2) {
        split.reset();
        return null;
    }
    double theEntropy = entropy(labelDist);
    // Snapshot the distribution NOW: fill_scores mutates splitAndLabelDist
    // during its sweep, and the caller needs the pre-sweep state.
    double[][] sAndLDist = new double[splitAndLabelDist.length][];
    for(int i = 0 ; i < splitAndLabelDist.length ; i++)
        sAndLDist[i] = splitAndLabelDist[i].clone();
    numDistinctValues.value = 0;
    fill_scores(realAndLabelColumn, split, minSplit, theEntropy,
                outScores, numDistinctValues, splitAndLabelDist, splitDist);
    // sAndLDist is always non-null here, so only smoothInst gates smoothing.
    // (The original also tested sAndLDist != null, which was dead code.)
    if (smoothInst != 0)
        smooth_scores(outScores, smoothInst, smoothFactor);
    return sAndLDist;
}
/** Smooths each threshold score toward its neighbors: every score is
 * replaced by a weighted average of itself and up to smoothInst scores
 * on each side, with weight smoothFactor^d for a neighbor at distance d
 * (and weight 1 for the score itself), normalized by the sum of weights.
 * Writebacks are delayed through the oldScores buffer so every window
 * is computed from the ORIGINAL scores, not already-smoothed ones.
 * @param scores The Vector of ThresholdInfo whose scores are smoothed in place.
 * @param smoothInst Half-width of the smoothing window (number of
 * neighbors considered on each side).
 * @param smoothFactor Per-step decay of a neighbor's weight.
 */
public static void smooth_scores(Vector scores, int smoothInst, double smoothFactor) {
    // Delay line holding the last smoothInst+1 smoothed values; a value is
    // written back to scores only once it can no longer affect any window.
    double[] oldScores = new double[smoothInst+1];
    MLJArray.init_values(-1,oldScores);
    int numThresholds = scores.size();
    for(int i = 0 ; i < numThresholds ; i++) {
        // Weight 1 for the center score itself.
        double summedScores =((ThresholdInfo)scores.get(i)).score;
        double summedFactor = 1;
        double currFactor = smoothFactor;
        // Accumulate left neighbors: weight smoothFactor^d at distance d.
        for(int left = 1 ; i - left >= 0 && left <= smoothInst ; left++) {
            summedScores +=(currFactor *((ThresholdInfo)scores.get(i - left)).score);
            summedFactor += currFactor;
            currFactor *= smoothFactor;
        }
        currFactor = smoothFactor;
        // Accumulate right neighbors symmetrically.
        for(int right = 1 ; i + right < numThresholds && right <= smoothInst ; right++) {
            summedScores +=(currFactor *((ThresholdInfo)scores.get(i + right)).score);
            summedFactor += currFactor;
            currFactor *= smoothFactor;
        }
        // Index i - smoothInst - 1 is now outside every remaining window,
        // so its (oldest buffered) smoothed value can be written back.
        if (i > smoothInst) {
            ((ThresholdInfo)scores.get(i - smoothInst - 1)).score = oldScores[smoothInst];
        }
        // Shift the delay line up by one to make room at slot 0.
        for(int j = smoothInst - 1 ; j >= 0 ; j--) {
            oldScores[j+1] = oldScores[j];
        }
        if (MLJ.approx_equal(summedFactor, 0.0))
            Error.fatalErr("smooth_scores: divisor (summedFactor) too close to zero");
        oldScores[0] = summedScores/ summedFactor;
    }
    // Flush the values still buffered in the delay line (the last
    // smoothInst+1 thresholds), guarding against short score lists.
    for(int j = 0 ; j < smoothInst+1 ; j++) {
        if (numThresholds - smoothInst - 1 + j >= 0)
            ((ThresholdInfo)scores.get(numThresholds - smoothInst - 1 + j)).score = oldScores[smoothInst - j];
    }
}
/** Fills the Vector of scores with the scores for all the thresholds.
 * Walks the sorted column, shifting each instance's weight from the
 * right to the left distribution, and appends a ThresholdInfo whenever
 * the attribute value changes by more than the stored-real epsilon.
 * Note that numDistinctValues counts only splits with at least minSplit
 * weight on both sides, but a score entry is appended for EVERY distinct
 * boundary -- presumably the caller (find_best_score) filters by weight;
 * verify against that routine.
 * @param realAndLabelColumn The column of real values and their associated labels over which thresholds are
 * created; assumed sorted ascending by value (the deltaAttr assert relies on this).
 * @param split The SplitScore used for scoring this threshold split.
 * @param minSplit The minimum per-side weight for a split to count as distinct.
 * @param theEntropy The entropy of the full label distribution, passed through to split.score.
 * @param outScores The Vector of ThresholdInfo to be filled.
 * @param numDistinctValues Out: the number of distinct, minSplit-eligible split points.
 * @param splitAndLabelDist Distributions over each split and label pair; MUTATED during the sweep.
 * @param splitDist The distribution over splits; MUTATED during the sweep.
 */
public static void fill_scores(RealAndLabelColumn realAndLabelColumn,
                               SplitScore split, double minSplit, double theEntropy,
                               Vector outScores, IntRef numDistinctValues, double[][] splitAndLabelDist,
                               double[] splitDist) {
    double totalWeight = realAndLabelColumn.total_weight();
    int numKnownInstances = realAndLabelColumn.known_count();
    // Last attribute value at which a threshold was considered.
    float lastSeen = realAndLabelColumn.index(0).value;
    int threshIndex = 0;
    // We start at 1 because we're comparing a split just before this
    // element
    for(int k = 1 ; k < numKnownInstances ; k++) {
        AttrLabelPair alp = realAndLabelColumn.index(k - 1);
        float value = alp.value;
        int lab = alp.label;
        double weight = alp.weight;
        float nextValue = realAndLabelColumn.index(k).value;
        // Move instance k-1 from the right side of the split to the left.
        splitDist[Globals.RIGHT_NODE] -= weight;
        splitDist[Globals.LEFT_NODE] += weight;
        // Snap tiny floating-point residue to exactly zero.
        if (MLJ.approx_equal((float)splitDist[Globals.RIGHT_NODE], 0.0))
            splitDist[Globals.RIGHT_NODE] = 0;
        splitAndLabelDist[lab][Globals.RIGHT_NODE] -= weight;
        splitAndLabelDist[lab][Globals.LEFT_NODE] += weight;
        if (MLJ.approx_equal((float)splitAndLabelDist[lab][Globals.RIGHT_NODE], 0.0))
            splitAndLabelDist[lab][Globals.RIGHT_NODE] = 0;
        // Gap since the last considered value; non-negative because the
        // column is sorted.
        double deltaAttr =(double)nextValue -(double)(lastSeen);
        MLJ.ASSERT(deltaAttr >= 0, "Entropy::fill_scores: deltaAttr >= 0 : deltaAttr == " +deltaAttr);
        // Skip thresholds between values too close to distinguish reliably.
        if (deltaAttr < 2 * MLJ.storedRealEpsilon)
            continue;
        lastSeen = nextValue;
        // The adjacent pair itself must also differ; note lastSeen has
        // already advanced when this skips -- intentional, presumably, so
        // the gap test restarts from here (TODO confirm).
        if (nextValue - value < MLJ.storedRealEpsilon)
            continue;
        // Only splits with minSplit weight on both sides count as distinct.
        if (splitDist[Globals.RIGHT_NODE] >= minSplit && splitDist[Globals.LEFT_NODE] >= minSplit)
            numDistinctValues.value = numDistinctValues.value + 1;
        // Record this candidate: its column index, left-side weight, and score.
        ThresholdInfo outScore = new ThresholdInfo();
        outScores.add(threshIndex++,outScore);
        outScore.index = k;
        outScore.weight = splitDist[Globals.LEFT_NODE];
        outScore.score = split.score(splitAndLabelDist, splitDist, null, theEntropy, totalWeight);
    }
}
/** Compute the entropy H(Y) for label Y. Giving us an InstanceList forces counters.
* If you don't give us the total instances, we count (slightly less efficient).
* Entropy without the sum acts a bit differently by allowing nodes with 0
* instances and returning entropy -1. The reason is that the caller shouldn't be
* required to compute the sum just for this.
*
 */