📄 splitattr.java
字号:
package shared;
import java.lang.*;
import java.util.*;
/** A class for determining, holding, and returning the information associated with
* an attribute split.
*
*/
public class SplitAttr extends SplitScore{
// ENUM: integer codes standing in for the SplitType enumeration.
/** Split type code: no usable split is stored. This is the type assigned by reset().
*/
static final public int noReasonableSplit = 1; // SplitType enum
/** Split type code: split on a single real-valued threshold. This is the type
* assigned by save_real_split().
*/
static final public int realThresholdSplit = 2; // SplitType enum
/** Split type code: split on several real-valued thresholds.
*/
static final public int multiRealThresholdSplit = 3; // SplitType enum
/** Split type code: split on the values of a nominal attribute.
*/
static final public int nominalSplit = 4; // SplitType enum
/** Split type code: split described by a partition.
*/
static final public int partitionSplit = 5; // SplitType enum
//END ENUM
/** Display names of the split type codes, indexed by the code itself. Index 0 is a
* blank placeholder because the codes start at 1.
*/
public static String[] splitTypeEnum ={" ","no reasonable split","real threshold split",
"multi real threshold split","nominal split","partition split"};
private class TypeInfo {
/** The partition for the split.
*/
public int[] partition; // used in partitionSplit
/** The threshold for the split.
*/
public double theThreshold;
/** Other threholds possible for the split.
*/
public double[] thresholds;
// any other info we need
}
/** Number of the attribute being split on. Starts as Globals.UNDEFINED_INT and is
* assigned through set_attr_num(); reset() leaves it untouched.
*/
private int attributeNum;
/** Type-specific split data (threshold, thresholds or partition).
*/
private TypeInfo typeInfo;
/** MDL penalty that get_mutual_info() subtracts for threshold splits when asked to
* penalize. Computed in save_real_split() as log_bin(numDistinct) / total_weight()
* and zeroed by reset().
*/
private double mdlPenalty;
/** TRUE while a split is stored in this object.
*/
private boolean existSplit;
/** Kind of split currently stored; one of the split type codes
* (noReasonableSplit ... partitionSplit).
*/
private int splitType;
/** Flag stored by set_penalize_by_mdl(); FALSE after construction.
*/
private boolean penalizeByMDL;
/** Split index recorded by save_real_split() for a real threshold split;
* Globals.UNDEFINED_INT after reset().
*/
private int realValuedSplitIndex;
/**
 * Creates a SplitAttr that holds no split yet: the type-specific holder is
 * allocated, the attribute number is undefined, MDL penalizing is switched
 * off, and the remaining state is initialised through reset().
 */
public SplitAttr() {
    this.typeInfo = new TypeInfo();
    this.attributeNum = Globals.UNDEFINED_INT;
    this.penalizeByMDL = false;
    reset();
}
/**
 * Reports which kind of split this SplitAttr currently stores.
 *
 * @see #noReasonableSplit
 * @see #realThresholdSplit
 * @see #multiRealThresholdSplit
 * @see #nominalSplit
 * @see #partitionSplit
 * @return The split type code of this attribute split.
 */
public int split_type() {
    return this.splitType;
}
/**
 * Returns the mutual information of this split, optionally penalized and
 * optionally normalized.
 *
 * The MDL penalty is applied only to real threshold and multi real threshold
 * splits. The penalized value is clamped at 0, so the penalty cannot make the
 * score negative. The original authors noted that a C4.5-like alternative
 * would drop the clamp and instead mark the split as non-existent when the
 * result goes negative; that alternative is not implemented.
 *
 * @param normalize TRUE if the mutual info is to be normalized by the number
 *                  of splits, FALSE otherwise.
 * @param penalize  TRUE if the MDL penalty should be subtracted for threshold
 *                  splits, FALSE otherwise.
 * @return The mutual information for this attribute split.
 */
public double get_mutual_info(boolean normalize, boolean penalize) {
    // Start from the unnormalized score; normalization, if requested, is last.
    double mutualInfo = super.get_unnormalized_mutual_info();

    final int type = split_type();
    final boolean thresholdSplit =
        type == realThresholdSplit || type == multiRealThresholdSplit;

    // @@ should we penalize multithresholds more? probably so
    // @@ penalize by (numSplits-1)*mdlPenalty
    if (thresholdSplit && penalize) {
        mutualInfo = Math.max(0.0, mutualInfo - mdlPenalty);
    }

    return normalize ? normalize_by_num_splits(mutualInfo) : mutualInfo;
}
/**
 * Tells whether a split is currently stored in this SplitAttr.
 *
 * @return TRUE if the SplitAttr contains a split, FALSE otherwise.
 */
public boolean exist_split() {
    return this.existSplit;
}
/**
 * Records whether the split should be penalized by minimum description
 * length. This method only stores the flag.
 *
 * @param choice TRUE if penalizing should occur, FALSE otherwise.
 */
public void set_penalize_by_mdl(boolean choice) {
    this.penalizeByMDL = choice;
}
/**
 * Does all processing for a real-valued attribute: clears the current state,
 * sets the attribute number, asks Entropy.find_best_threshold() for the best
 * threshold and hands its results to save_real_split(), which stores the
 * threshold, the split index and the MDL penalty when a split was found.
 *
 * @param column       The column of real values for this attribute and their
 *                     associated label values.
 * @param attrNum      The number of the attribute.
 * @param minSplit     The minimum split value.
 * @param smoothInst   The instance to be smoothed towards.
 * @param smoothFactor The factor by which real values are smoothed.
 * @return TRUE if a split was found and stored, FALSE otherwise.
 */
public boolean make_real_split(RealAndLabelColumn column, int attrNum,
                               double minSplit, int smoothInst,
                               double smoothFactor) {
    reset();
    set_attr_num(attrNum);

    // Holders that receive the results of the threshold search.
    final DoubleRef bestThreshold = new DoubleRef(0.0);
    final IntRef bestIndex = new IntRef(0);
    final IntRef distinctCount = new IntRef(0);
    Entropy.find_best_threshold(column, minSplit, this, bestThreshold, bestIndex,
                                distinctCount, smoothInst, smoothFactor);

    save_real_split(bestThreshold, bestIndex, distinctCount);
    return existSplit;
}
/**
 * Stores the data calculated by find_best_threshold() in this SplitAttr.
 *
 * If the threshold is undefined, the split index is negative, or the number
 * of distinct values is not positive, the object is reset and no split is
 * stored. Otherwise the object becomes a real threshold split holding the
 * threshold, the split index and the MDL penalty.
 *
 * @param thresh      The threshold to be saved.
 * @param splitIndex  The index of the split to be saved.
 * @param numDistinct The number of distinct splits.
 */
public void save_real_split(DoubleRef thresh, IntRef splitIndex, IntRef numDistinct) {
    existSplit = false;

    final boolean nothingFound = thresh.value == Globals.UNDEFINED_REAL
            || splitIndex.value < 0
            || numDistinct.value <= 0;
    if (nothingFound) {
        reset();
        return;
    }

    // Clear the data of the previous split type before switching type.
    free_type_info();
    splitType = realThresholdSplit;

    if (MLJ.approx_equal(total_weight(), 0.0)) {
        Error.fatalErr("SplitAttr::save_real_split: Total weight is near 0. "+
            "Cannot continue, as need to divide by it");
    }

    // The MDL penalty is always computed, so the number is already available
    // whether callers turn penalizing on or off (chosen for correctness, not
    // efficiency).
    mdlPenalty = MLJ.log_bin(numDistinct.value) / total_weight();
    typeInfo.theThreshold = thresh.value;
    realValuedSplitIndex = splitIndex.value;
    existSplit = true;
}
/**
 * Clears the type-specific data held in typeInfo for the current split type.
 *
 * Nothing is stored for noReasonableSplit and nominalSplit. For a real
 * threshold split the threshold is set back to Globals.UNDEFINED_REAL, the
 * same sentinel save_real_split() tests thresholds against. For multi
 * threshold and partition splits the array reference is dropped. Any other
 * split type value aborts via MLJ.Abort().
 */
public void free_type_info() {
    switch (splitType) {
        case noReasonableSplit:
        case nominalSplit:
            break;
        case realThresholdSplit:
            // theThreshold is a double, so mark it undefined with the real
            // sentinel rather than the int one.
            typeInfo.theThreshold = Globals.UNDEFINED_REAL;
            break;
        case multiRealThresholdSplit:
            typeInfo.thresholds = null;
            break;
        case partitionSplit:
            typeInfo.partition = null;
            break;
        default:
            MLJ.Abort();
    }
}
/**
 * Puts this object back into the "no split stored" state. The inherited
 * state is reset first; then the split flag, split type, MDL penalty and
 * real-valued split index are cleared. The attribute number is kept.
 */
public void reset() {
    // DBG(ASSERT(attributeNum >= 0 || attributeNum == Globals.UNDEFINED_INT));
    super.reset();
    this.existSplit = false;
    this.splitType = noReasonableSplit;
    this.realValuedSplitIndex = Globals.UNDEFINED_INT;
    this.mdlPenalty = 0;
}
/**
 * Sets the attribute number for this split.
 *
 * A negative number other than Globals.UNDEFINED_INT is reported through
 * Error.fatalErr(). The error message names this method so the failure can
 * be traced to the right place.
 *
 * @param num The number of the new attribute; must be non-negative or
 *            Globals.UNDEFINED_INT.
 */
public void set_attr_num(int num) {
    if (num < 0 && num != Globals.UNDEFINED_INT) {
        Error.fatalErr("SplitAttr::set_attr_num: attempt to set attribute number to "+
            num+", which is neither non-negative nor UNDEFINED_INT ("+Globals.UNDEFINED_INT+")");
    }
    attributeNum = num;
}
/** Helper function to do all processing for nominals. Nominal splits always exist.
*
* @param instList The InstanceList over which to make a nominal split.
* @param attributeNumber The number of the attribute to be split.
* @return TRUE if a nominal split exists, FALSE if not.
*/
public boolean make_nominal_split(InstanceList instList,
int attributeNumber) {
reset();
set_attr_num(attributeNumber);
Schema schema = instList.get_schema();
if (!schema.attr_info(attributeNumber).can_cast_to_nominal())
Error.fatalErr("SplitAttr::make_nominal_split: attribute "+attributeNumber+
" can not be cast to nominal");
InstanceList ptr = instList;
InstanceList[] array = new InstanceList[1];
array[0] = ptr;
double[][] sAndLDist =
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -