📄 splitattr.java
字号:
Entropy.build_split_and_label_dist(array, attributeNumber);
sAndLDist = super.set_split_and_label_dist(sAndLDist);
return make_nominal_split();
}
/** Performs the processing shared by all nominal splits.
 * The split type is set to nominalSplit and the split is initially flagged as
 * existing. get_mutual_info() is then invoked (normalized only when the score
 * criterion is normalizedMutualInfo, never penalized); per the original author's
 * note, that call is what determines whether the split exists, so the flag is
 * read back afterwards instead of returning a constant.
 * @return The value of existSplit after the mutual information call.
 */
public boolean make_nominal_split() {
    splitType = nominalSplit;
    existSplit = true;
    // Normalization is requested only under the normalizedMutualInfo criterion.
    final boolean useNormalized = (normalizedMutualInfo == get_split_score_criterion());
    // The returned value is deliberately ignored; the call is made for its
    // effect on the split state (original note: "Determines if split exists").
    get_mutual_info(useNormalized, false);
    return existSplit;
}
/** Checks whether it is OK to split on a nominal attribute, i.e. whether at
 * least two of its values each have at least 'minSplit' instances. The need is
 * to split into 2 disjoint sets, both containing at least 'minSplit' instances
 * (so there need to be at least twice 'minSplit' instances overall).
 * The minSplit is expected to be at least 1; this is not enforced here.
 *
 * @param attrNum The number of the attribute to be checked.
 * @param counters Counters of the values for this attribute.
 * @param minSplit The minimum number of instances (weight) a branch must have.
 * @return TRUE if the attribute is ok to split, FALSE if it has fewer than two
 * values or fewer than two values whose count reaches minSplit.
 */
public static boolean ok_to_split(int attrNum, BagCounters counters, double minSplit) {
    // If there aren't two values (or more), we clearly can't split.
    if (counters.attr_num_vals(attrNum) < 2) {
        return false;
    }
    // With minSplit not above 1 there is nothing more to check: we know there
    // are (at least) two values, each of which must have at least one instance.
    if (!MLJ.approx_greater(minSplit, 1.0)) {
        return true;
    }
    double[] ac = counters.attr_counts()[attrNum];
    if (ac == null) {
        // NOTE(review): presumably Error.fatalErr does not return - confirm;
        // otherwise the loop below would dereference null.
        Error.fatalErr("ID3Inducer::ok_to_split: No counters");
    }
    int numAboveMin = 0;
    // Stop as soon as two qualifying values are found. The bound is strictly
    // less than ac.length: index ac.length is past the end of the array and
    // would throw whenever fewer than two values qualify.
    for (int i = 0; numAboveMin < 2 && i < ac.length; i++) {
        if (ac[i] >= minSplit) {
            numAboveMin++;
        }
    }
    return (numAboveMin >= 2);
}
/** Returns the attribute number stored in this split (the attributeNum field).
 * No validation is performed; the original code carried a disabled assertion
 * that the value is not -1.
 * @return The attribute number of this split.
 */
public int get_attr_num() {
    return this.attributeNum;
}
/** Returns the MDL penalty. Only valid when penalizing by MDL is enabled;
 * otherwise a fatal error is reported through Error.fatalErr.
 * @return The value of mdlPenalty.
 */
public double penalty() {
    final boolean penalizing = get_penalize_by_mdl();
    if (!penalizing) {
        Error.fatalErr("SplitAttr::penalty: MDL penalty not set");
    }
    return mdlPenalty;
}
/** Tells whether this split is penalized by MDL.
 * @return The penalizeByMDL flag: TRUE if penalizing by MDL is enabled, FALSE otherwise.
 */
public boolean get_penalize_by_mdl() {
    return this.penalizeByMDL;
}
/** Returns the gain ratio: the (un-normalized) mutual information divided by
 * the split entropy. A fatal error is reported through Error.fatalErr when the
 * split entropy is approximately zero.
 *
 * @param penalize TRUE if penalization should occur, FALSE otherwise; passed on
 * to get_mutual_info().
 * @return The mutual information divided by the split entropy.
 */
public double get_gain_ratio(boolean penalize) {
    // Mutual information is fetched before the split entropy, as in the
    // original statement order.
    final double mutualInfo = get_mutual_info(false, penalize);
    final double splitEntropy = get_split_entropy();
    if (MLJ.approx_equal(splitEntropy, 0.0)) {
        Error.fatalErr("SplitAttr::get_gain_ratio: split entropy (" + splitEntropy
                + ") is too close to zero for division");
    }
    // Neither the numerator being defined nor the ratio being non-negative is
    // checked here (the original assertions for both were disabled).
    return mutualInfo / splitEntropy;
}
/** Returns the threshold of a real-threshold split. Intended to be called only
 * when the split exists and is a real threshold split; only the split type is
 * verified here, and a fatal error naming the actual split type is reported
 * through Error.fatalErr when it is not realThresholdSplit.
 *
 * @return The threshold stored in typeInfo.
 */
public double threshold() {
    final boolean isRealThreshold = (splitType == realThresholdSplit);
    if (!isRealThreshold) {
        // The readable type name is looked up only on the failure path.
        String actualType = name_from_value(splitType, splitTypeEnum);
        Error.fatalErr("SplitAttr::threshold(): split is not realThreshold, it is "
                + actualType);
    }
    return typeInfo.theThreshold;
}
/** Looks up the display name of an enumerated value.
 * @param value The value to translate; used as an index into enumNames.
 * @param enumNames The names, indexed by value.
 * @return enumNames[value] when value is a valid index, the empty string otherwise.
 */
private String name_from_value(int value, String[] enumNames) {
    final boolean inRange = value >= 0 && value < enumNames.length;
    return inRange ? enumNames[value] : "";
}
/** Copies the given SplitAttr into this SplitAttr. The type information
 * (partition, threshold and thresholds) is deep-copied, so the two objects do
 * not share arrays afterwards; all other fields are copied by value.
 * @param original The SplitAttr to be copied.
 */
public void copy(SplitAttr original) {
    attributeNum = original.attributeNum;

    // Read from the ORIGINAL's type information. Cloning the fields of the
    // freshly created TypeInfo (as was done before) copies nothing from the
    // source object.
    TypeInfo source = original.typeInfo;
    TypeInfo duplicate = new TypeInfo();
    if (source != null) {
        // Arrays may legitimately be unset; only clone what is there.
        if (source.partition != null) {
            duplicate.partition = (int[]) source.partition.clone();
        }
        duplicate.theThreshold = source.theThreshold;
        if (source.thresholds != null) {
            duplicate.thresholds = (double[]) source.thresholds.clone();
        }
    }
    // Assigned only after reading the source so that copy(this) stays safe.
    typeInfo = duplicate;

    mdlPenalty = original.mdlPenalty;
    existSplit = original.existSplit;
    splitType = original.splitType;
    penalizeByMDL = original.penalizeByMDL;
    realValuedSplitIndex = original.realValuedSplitIndex;
}
/** Initializes the attribute data and the split-and-label distribution from
 * instance lists that were already split before calling this method: each
 * element of instLists is one child (branch) of the split. The state is reset,
 * the attribute number is stored, and a distribution indexed as
 * [label value][child] is filled from each child's label counts and handed to
 * set_split_and_label_dist().
 *
 * @param instLists The instance lists, one per child of the split. Must contain
 * at least one list; the schema of the first list supplies the number of label
 * values.
 * @param attributeNumber The number of the attribute.
 */
public void initialize(InstanceList[] instLists, int attributeNumber) {
    reset();
    set_attr_num(attributeNumber);
    int numLabels = instLists[0].get_schema().num_label_values();
    // One row per label value, one column per child instance list.
    double[][] splitAndLabelDist = new double[numLabels][instLists.length];
    // The bound is strictly less than instLists.length: index instLists.length
    // is past the end of the array.
    for (int child = 0; child < instLists.length; child++) {
        // Everything in a set has the same attr value, by definition, so the
        // child's label counts are exactly this branch's column.
        // The counters are looked up once per child rather than once per label.
        BagCounters bc = instLists[child].counters();
        for (int labelValCount = 0; labelValCount < numLabels; labelValCount++) {
            splitAndLabelDist[labelValCount][child] = bc.label_count(labelValCount);
        }
    }
    set_split_and_label_dist(splitAndLabelDist);
}
/** Returns the score of this split according to the split score criterion:
 * mutual information for mutualInfo, normalized mutual information for
 * normalizedMutualInfo, gain ratio for gainRatio, the mutual information ratio
 * for mutualInfoRatio and the external score for externalScore. The mutual
 * information and gain ratio variants are penalized according to
 * penalizeByMDL. Any other criterion is reported through Error.fatalErr.
 *
 * @return The score for this split.
 */
public double score() {
    double result;
    switch (get_split_score_criterion()) {
    case mutualInfo:
        result = get_mutual_info(false, penalizeByMDL);
        break;
    case normalizedMutualInfo:
        result = get_mutual_info(true, penalizeByMDL);
        break;
    case gainRatio:
        result = get_gain_ratio(penalizeByMDL);
        break;
    case mutualInfoRatio:
        result = get_mutual_info_ratio();
        break;
    case externalScore:
        result = get_external_score();
        break;
    default:
        Error.fatalErr("SplitAttr::score: split score criterion of " +
                get_split_score_criterion() + " is out of range");
        // Placeholder value; the original notes this point can't be reached.
        result = 0;
        break;
    }
    return result;
}
/** Computes the score from explicitly supplied distributions by delegating,
 * with all arguments unchanged, to the superclass implementation. Intended for
 * cases where scores are computed many times for the same number of instances
 * and entropy, for instance when determining the best threshold for a split.
 * @param sAndLDist The split and label distribution.
 * @param sDist The split distribution.
 * @param lDist The label distribution.
 * @param entropy The entropy value.
 * @param totalWeight The total weight of instances.
 * @return The score returned by the superclass for this split.
 */
public double score(double[][] sAndLDist, double[] sDist,
        double[] lDist, double entropy, double totalWeight) {
    final double splitScore =
            super.score(sAndLDist, sDist, lDist, entropy, totalWeight);
    return splitScore;
}
/** Computes the score from explicitly supplied distributions and entropy when
 * the total weight is not known: delegates to the superclass implementation,
 * passing Globals.UNDEFINED_REAL as the total weight. Intended for cases where
 * scores are computed many times for the same number of instances and entropy,
 * for instance when determining the best threshold for a split.
 * @param sAndLDist The split and label distribution.
 * @param sDist The split distribution.
 * @param lDist The label distribution.
 * @param entropy The entropy value.
 * @return The score returned by the superclass for this split.
 */
public double score(double[][] sAndLDist, double[] sDist,
        double[] lDist, double entropy) {
    // NOTE(review): presumably the superclass derives the total weight itself
    // when it receives UNDEFINED_REAL - confirm against the base class.
    final double splitScore =
            super.score(sAndLDist, sDist, lDist, entropy, Globals.UNDEFINED_REAL);
    return splitScore;
}
/** Computes the score from explicitly supplied distributions when neither the
 * entropy nor the total weight is known: delegates to the superclass
 * implementation, passing Globals.UNDEFINED_REAL for both. Intended for cases
 * where scores are computed many times, for instance when determining the best
 * threshold for a split.
 * @param sAndLDist The split and label distribution.
 * @param sDist The split distribution.
 * @param lDist The label distribution.
 * @return The score returned by the superclass for this split.
 */
public double score(double[][] sAndLDist, double[] sDist, double[] lDist) {
    // NOTE(review): presumably the superclass derives entropy and total weight
    // itself when it receives UNDEFINED_REAL - confirm against the base class.
    final double splitScore = super.score(sAndLDist, sDist, lDist,
            Globals.UNDEFINED_REAL, Globals.UNDEFINED_REAL);
    return splitScore;
}
}//End of class
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -