📄 id3inducer.java
字号:
if (mutualInfoAboveMean || !foundScoreAboveMean) {
// MLJ.ASSERT(meanMutualInfo != Globals.UNDEFINED_REAL);
logOptions.LOG(3, ", and that "+split.score()+" > "+meanMutualInfo+'\n');
double score = split.score();
logOptions.LOG(3,"Testing attribute "+attrNum+" ("+schema.attr_name(attrNum)+"): "+'\n');
// logOptions.LOG(3,split);
// logOptions.LOG(3,"Comparing score ("+score+") against max score + eps ("+
// (maxScore+MLJ.realEpsilon())+")"+'\n');
if (score > maxScore + MLJ.realEpsilon ||
(!foundScoreAboveMean && mutualInfoAboveMean)) {
logOptions.LOG(2,"Chose attribute "+attrNum+'\n');
maxScore = score;
bestSplit[0] = split;
logOptions.LOG(3,"max score becomes "+maxScore+'\n');
}
if (mutualInfoAboveMean)
foundScoreAboveMean = true;
} else{
logOptions.LOG(3,"...not above mean"+'\n');
}
}
}
}
}
/** Reports whether every attribute of the schema returned by TS.get_schema()
 * is multi-valued. An attribute is multi-valued if an instance can have more
 * than one value for it at one time.
 * Attributes are examined in index order and the scan ends at the first one
 * that is not multi-valued: an attribute that can be cast to real counts as
 * not multi-valued, and an attribute that can be cast to nominal is judged by
 * multi_val_attribute(). For an attribute that is neither, MLJ.Abort() is
 * called. A schema without attributes yields true.
 * @return False as soon as a real attribute or a nominal attribute that is
 * not multi-valued is encountered. Otherwise true.
 */
public boolean all_attributes_multi_val()
{
    Schema schema = TS.get_schema();
    for (int idx = 0; idx < schema.num_attr(); idx++) {
        // Real attributes are never multi-valued.
        if (schema.attr_info(idx).can_cast_to_real())
            return false;
        if (schema.attr_info(idx).can_cast_to_nominal()) {
            if (!multi_val_attribute(idx))
                return false;
        } else {
            // Neither real nor nominal.
            MLJ.Abort();
        }
    }
    return true;
}
/** Computes the split information for one attribute when no columns of real
 * values are supplied.
 * Forwards to the three-argument split_info() with null as realColumns.
 * @param attrNum The index number of the attribute column.
 * @param split The SplitAttr handed on to the three-argument overload.
 */
public void split_info(int attrNum, SplitAttr split)
{
    this.split_info(attrNum, split, null);
}
/** Compute the split information for a given attribute.
 * A nominal attribute is scored with the inducer's configured split score
 * criterion, provided SplitAttr.ok_to_split() accepts it. For a real attribute
 * the column is sorted and the split is made with the default split score
 * criterion and without the MDL penalty; afterwards the configured criterion
 * and MDL setting are put back on the split. An attribute that is neither
 * nominal nor real leads to MLJ.Abort().
 * @param attrNum The index number of this attribute column. Must lie in the
 * range 0 to schema.num_attr()-1; anything else is reported as a fatal error.
 * @param split The SplitAttr that is configured and filled in for this
 * attribute.
 * @param realColumns The columns of values specified for each attribute over
 * all instances in a data set. When the attribute is real, both the array and
 * its entry at attrNum must be non-null, otherwise a fatal error is reported.
 * May be null for nominal attributes.
 */
public void split_info(int attrNum, SplitAttr split,
                       RealAndLabelColumn[] realColumns)
{
    Schema schema = TS.get_schema();
    // Valid attribute indices are 0 .. num_attr()-1, so num_attr() itself has
    // to be rejected as well (it would be out of range for attr_name() below).
    if (attrNum < 0 || attrNum >= schema.num_attr())
        Error.fatalErr("ID3Inducer::split_info: attrNum " + attrNum +
            " not in range 0 to " + (schema.num_attr() - 1));
    logOptions.LOG(2,"Testing attribute "+attrNum+" ("+schema.attr_name(attrNum)+"): ");
    // split needs to know if the Minimum Description Length Adjustment
    // for continuous attributes needs to be applied.
    split.set_penalize_by_mdl(get_cont_mdl_adjust());
    double minSplit = Entropy.min_split(instance_list(),
                                        get_lower_bound_min_split_weight(),
                                        get_upper_bound_min_split_weight(),
                                        get_min_split_weight_percent());
    // Nominal attributes use the same minimum unless only the lower bound
    // is to be applied to them.
    double nominalMinSplit = minSplit;
    if (get_nominal_lbound_only()) {
        nominalMinSplit = get_lower_bound_min_split_weight();
        if (nominalMinSplit < 1)
            Error.fatalErr("split_info: lowerBoundMinSplit (" +
                nominalMinSplit+") must be at least one");
    }
    logOptions.LOG(4,"Min split: "+(int)minSplit+", nominal min split: "+(int)nominalMinSplit+'\n');
    // If the attribute is nominal and has more than one value, then use
    // mutual_info or gain_ratio to determine the information gained by
    // splitting on it.
    if (schema.attr_info(attrNum).can_cast_to_nominal()) {
        split.set_split_score_criterion(get_split_score_criterion());
        if (SplitAttr.ok_to_split(attrNum, TS.counters(), nominalMinSplit)) {
            split.make_nominal_split(instance_list(), attrNum);
            logOptions.LOG(2,"Criterion score for attribute "+attrNum+" is "+split.score()+'\n');
            MLJ.ASSERT(split.score() >= 0, "ID3Inducer::split_info(): split.score() >= 0");
        }
    }
    // Otherwise the attribute should be real and real_mutual_info is used
    // to determine the threshold which gives the maximum information
    // gain when split on.
    else if (schema.attr_info(attrNum).can_cast_to_real()) {
        // Find the best threshold (in find_best_threshold(), invoked through
        // make_real_split()) based on the default split criterion; then
        // set the criterion back.
        // @@ This will be an option for reals.
        if (realColumns == null)
            Error.fatalErr("ID3Inducer::split_info: can't split on real attribute "+
                schema.attr_name(attrNum)+" -- realColumns is null");
        RealAndLabelColumn column = realColumns[attrNum];
        if (column == null)
            Error.fatalErr("ID3Inducer::split_info: can't split on real attribute "+
                schema.attr_name(attrNum)+" -- the given column is null");
        column.sort();
        split.set_split_score_criterion(SplitScore.defaultSplitScoreCriterion);
        split.set_penalize_by_mdl(false);
        split.make_real_split(column, attrNum, minSplit,
                              get_smooth_inst(), get_smooth_factor());
        // Put the configured criterion and MDL setting back on the split.
        split.set_split_score_criterion(get_split_score_criterion());
        split.set_penalize_by_mdl(get_cont_mdl_adjust());
        if (split.exist_split()) {
            logOptions.LOG(2,"Threshold: "+split.threshold()+", criterion score: "+
                split.score()+", entropy: "+split.get_entropy()+'\n');
            MLJ.ASSERT(split.score() >= 0, "ID3Inducer::split_info(): split.score() >= 0");
        }
    } else
        MLJ.Abort();
    if (split.split_type() == SplitAttr.noReasonableSplit)
        logOptions.LOG(2,"No reasonable split"+'\n');
}
// public void set_unknown_edges(){ }
// public void display_struct(){ }
//Additions by JL
// private NO_DEFAULT_OPS(ID3Inducer);
/** Builds the inducer used for a recursive call. TDDTInducer is an abstract
 * class and cannot make the recursive call itself, so the sub-inducer is
 * created here. The new inducer takes over this inducer's options through
 * copy_options() and has its level set to this inducer's level plus one.
 * @param descr The description of the new subinducer.
 * @param aCgraph The CGraph passed to the new inducer's constructor.
 * @return The newly created ID3Inducer.
 */
public TDDTInducer create_subinducer(String descr, CGraph aCgraph)
{
    ID3Inducer child = new ID3Inducer(descr, aCgraph);
    child.copy_options(this);
    // The child sits one level below this inducer.
    child.set_level(this.get_level() + 1);
    return child;
}
/** Builds the categorizer for the attribute chosen by the given split and
 * inserts the names of its categories into catNames. The specified SplitAttr
 * is assumed to be valid.
 * For a nominal attribute, "?" is inserted for the unknown category, followed
 * by the attribute's values, and an AttrCategorizer is returned. For a real
 * attribute with a real threshold split, the edge strings of a new
 * ThresholdCategorizer are inserted and that categorizer is returned. Any
 * other combination is reported through Error.fatalErr().
 * When get_debug() is true, the categorizer description is the attribute name
 * extended with the instance count, entropy, mutual information, gain ratio,
 * the MDL penalty (if the split is penalized by MDL) and the score.
 * @param split The attribute to be split on.
 * @param catNames The list receiving the category names that an instance may
 * be categorized under; names are inserted at their category index.
 * @return The NodeCategorizer that splits on this attribute, or null on the
 * paths that follow a reported fatal error.
 */
public NodeCategorizer split_to_cat(SplitAttr split,
                                    LinkedList catNames)
{
    int attrNum = split.get_attr_num();
    Schema schema = TS.get_schema();

    // Description of the categorizer; only the attribute name unless
    // debugging is on.
    String label = schema.attr_info(attrNum).name();
    if (get_debug()) {
        label += " (#=" + TS.num_instances()
            + "\\nENT=" + split.get_entropy()
            + "\\nMI=" + split.get_mutual_info(false, false)
            + "\\nGAIN=" + split.get_gain_ratio(false);
        if (split.get_penalize_by_mdl())
            label += "\\nMDL penalty=" + split.penalty();
        label += "\\nSCORE=" + split.score() + ")";
    }

    // Nominal attribute: one category per value plus the unknown category.
    if (schema.attr_info(attrNum).can_cast_to_nominal()) {
        NominalAttrInfo nominalInfo = schema.attr_info(attrNum).cast_to_nominal();
        int limit = nominalInfo.num_values() + 1; // +1 for unknown
        catNames.add(Globals.UNKNOWN_CATEGORY_VAL, "?");
        // UNKNOWN_NOMINAL_VAL is now 0 instead of -1, so the values are
        // fetched with indices starting at 1. - JL
        int position = Globals.FIRST_CATEGORY_VAL;
        int valueIdx = 1;
        while (valueIdx < limit) {
            catNames.add(position, nominalInfo.get_value(valueIdx));
            valueIdx++;
            position++;
        }
        return new AttrCategorizer(schema, attrNum, label);
    }

    // Anything that is neither nominal nor real cannot be handled.
    if (!schema.attr_info(attrNum).can_cast_to_real()) {
        Error.fatalErr("ID3Inducer::split_to_cat: unrecognized attribute type"+
            " for attribute "+attrNum);
        return null;
    }

    // A real attribute must come with a threshold split.
    if (split.split_type() != SplitAttr.realThresholdSplit) {
        Error.fatalErr("ID3Inducer::split_to_cat: bad split type for "+
            "continuous attribute "+attrNum);
        return null;
    }

    logOptions.LOG(5, split.threshold()+""+'\n');
    ThresholdCategorizer thresholdCat =
        new ThresholdCategorizer(schema, attrNum, split.threshold(), label);
    String[] edgeNames = thresholdCat.real_edge_strings();
    for (int pos = 0; pos < edgeNames.length; pos++)
        catNames.add(pos, edgeNames[pos]);
    return thresholdCat;
}
/** Creates a copy of this inducer by handing it to the ID3Inducer
 * constructor that takes another ID3Inducer.
 * @return A reference to the new ID3Inducer, typed as Inducer.
 */
public Inducer copy()
{
    return new ID3Inducer(this);
}
/** Returns the class id of this inducer.
 * @deprecated Use Java's instanceof operator instead of this method.
 * @return The ID3_INDUCER constant.
 */
public int class_id()
{
    return ID3_INDUCER;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -