📄 jrip.java
字号:
FastVector ruleset = new FastVector();
double dl = defDL, minDL = defDL;
RuleStats rstats = null;
double[] rst;
// Check whether data have positive examples
boolean defHasPositive = true; // No longer used
boolean hasPositive = defHasPositive;
/********************** Building stage ***********************/
if(m_Debug)
System.err.println("\n*** Building stage ***");
while((!stop) && hasPositive){ // Generate new rules until
// stopping criteria met
RipperRule oneRule;
if(m_UsePruning){
/* Split data into Grow and Prune*/
// We should have stratified the data, but ripper seems
// to have a bug that makes it not to do so. In order
// to simulate it more precisely, we do the same thing.
//newData.randomize(m_Random);
newData = RuleStats.stratify(newData, m_Folds, m_Random);
Instances[] part = RuleStats.partition(newData, m_Folds);
growData=part[0];
pruneData=part[1];
//growData=newData.trainCV(m_Folds, m_Folds-1);
//pruneData=newData.testCV(m_Folds, m_Folds-1);
oneRule = new RipperRule();
oneRule.setConsequent(classIndex); // Must set first
if(m_Debug)
System.err.println("\nGrowing a rule ...");
oneRule.grow(growData); // Build the rule
if(m_Debug)
System.err.println("One rule found before pruning:"+
oneRule.toString(m_Class));
if(m_Debug)
System.err.println("\nPruning the rule ...");
oneRule.prune(pruneData, false); // Prune the rule
if(m_Debug)
System.err.println("One rule found after pruning:"+
oneRule.toString(m_Class));
}
else{
oneRule = new RipperRule();
oneRule.setConsequent(classIndex); // Must set first
if(m_Debug)
System.err.println("\nNo pruning: growing a rule ...");
oneRule.grow(newData); // Build the rule
if(m_Debug)
System.err.println("No pruning: one rule found:\n"+
oneRule.toString(m_Class));
}
// Compute the DL of this ruleset
if(rstats == null){ // First rule
rstats = new RuleStats();
rstats.setNumAllConds(m_Total);
rstats.setData(newData);
}
rstats.addAndUpdate(oneRule);
int last = rstats.getRuleset().size()-1; // Index of last rule
dl += rstats.relativeDL(last, expFPRate, m_CheckErr);
if(Double.isNaN(dl) || Double.isInfinite(dl))
throw new Exception("Should never happen: dl in "+
"building stage NaN or infinite!");
if(m_Debug)
System.err.println("Before optimization("+last+
"): the dl = "+dl+" | best: "+minDL);
if(dl < minDL)
minDL = dl; // The best dl so far
rst = rstats.getSimpleStats(last);
if(m_Debug)
System.err.println("The rule covers: "+rst[0]+
" | pos = " + rst[2] +
" | neg = " + rst[4]+
"\nThe rule doesn't cover: "+rst[1]+
" | pos = " + rst[5]);
stop = checkStop(rst, minDL, dl);
if(!stop){
ruleset.addElement(oneRule); // Accepted
newData = rstats.getFiltered(last)[1];// Data not covered
hasPositive = Utils.gr(rst[5], 0.0); // Positives remaining?
if(m_Debug)
System.err.println("One rule added: has positive? "
+hasPositive);
}
else{
if(m_Debug)
System.err.println("Quit rule");
rstats.removeLast(); // Remove last to be re-used
}
}// while !stop
/******************** Optimization stage *******************/
RuleStats finalRulesetStat = null;
if(m_UsePruning){
for(int z=0; z < m_Optimizations; z++){
if(m_Debug)
System.err.println("\n*** Optimization: run #"
+z+" ***");
newData = data;
finalRulesetStat = new RuleStats();
finalRulesetStat.setData(newData);
finalRulesetStat.setNumAllConds(m_Total);
int position=0;
stop = false;
boolean isResidual = false;
hasPositive = defHasPositive;
dl = minDL = defDL;
oneRule:
while(!stop && hasPositive){
isResidual = (position>=ruleset.size()); // Cover residual positive examples
// Re-do shuffling and stratification
//newData.randomize(m_Random);
newData = RuleStats.stratify(newData, m_Folds, m_Random);
Instances[] part = RuleStats.partition(newData, m_Folds);
growData=part[0];
pruneData=part[1];
//growData=newData.trainCV(m_Folds, m_Folds-1);
//pruneData=newData.testCV(m_Folds, m_Folds-1);
RipperRule finalRule;
if(m_Debug)
System.err.println("\nRule #"+position +
"| isResidual?" + isResidual+
"| data size: "+newData.sumOfWeights());
if(isResidual){
RipperRule newRule = new RipperRule();
newRule.setConsequent(classIndex);
if(m_Debug)
System.err.println("\nGrowing and pruning"+
" a new rule ...");
newRule.grow(growData);
newRule.prune(pruneData, false);
finalRule = newRule;
if(m_Debug)
System.err.println("\nNew rule found: "+
newRule.toString(m_Class));
}
else{
RipperRule oldRule = (RipperRule)ruleset.elementAt(position);
boolean covers = false;
// Test coverage of the next old rule
for(int i=0; i<newData.numInstances(); i++)
if(oldRule.covers(newData.instance(i))){
covers = true;
break;
}
if(!covers){// Null coverage, no variants can be generated
finalRulesetStat.addAndUpdate(oldRule);
position++;
continue oneRule;
}
// 2 variants
if(m_Debug)
System.err.println("\nGrowing and pruning"+
" Replace ...");
RipperRule replace = new RipperRule();
replace.setConsequent(classIndex);
replace.grow(growData);
// Remove the pruning data covered by the following
// rules, then simply compute the error rate of the
// current rule to prune it. According to Ripper,
// it's equivalent to computing the error of the
// whole ruleset -- is it true?
pruneData = RuleStats.rmCoveredBySuccessives(pruneData,ruleset, position);
replace.prune(pruneData, true);
if(m_Debug)
System.err.println("\nGrowing and pruning"+
" Revision ...");
RipperRule revision = (RipperRule)oldRule.copy();
// For revision, first rm the data covered by the old rule
Instances newGrowData = new Instances(growData, 0);
for(int b=0; b<growData.numInstances(); b++){
Instance inst = growData.instance(b);
if(revision.covers(inst))
newGrowData.add(inst);
}
revision.grow(newGrowData);
revision.prune(pruneData, true);
double[][] prevRuleStats = new double[position][6];
for(int c=0; c < position; c++)
prevRuleStats[c] = finalRulesetStat.getSimpleStats(c);
// Now compare the relative DL of variants
FastVector tempRules = (FastVector)ruleset.copyElements();
tempRules.setElementAt(replace, position);
RuleStats repStat = new RuleStats(data, tempRules);
repStat.setNumAllConds(m_Total);
repStat.countData(position, newData, prevRuleStats);
//repStat.countData();
rst = repStat.getSimpleStats(position);
if(m_Debug)
System.err.println("Replace rule covers: "+rst[0]+
" | pos = " + rst[2] +
" | neg = " + rst[4]+
"\nThe rule doesn't cover: "+rst[1]+
" | pos = " + rst[5]);
double repDL = repStat.relativeDL(position, expFPRate,
m_CheckErr);
if(m_Debug)
System.err.println("\nReplace: "+
replace.toString(m_Class)
+" |dl = "+repDL);
if(Double.isNaN(repDL) || Double.isInfinite(repDL))
throw new Exception("Should never happen: repDL"+
"in optmz. stage NaN or "+
"infinite!");
tempRules.setElementAt(revision, position);
RuleStats revStat = new RuleStats(data, tempRules);
revStat.setNumAllConds(m_Total);
revStat.countData(position, newData, prevRuleStats);
//revStat.countData();
double revDL = revStat.relativeDL(position, expFPRate,
m_CheckErr);
if(m_Debug)
System.err.println("Revision: "
+ revision.toString(m_Class)
+" |dl = "+revDL);
if(Double.isNaN(revDL) || Double.isInfinite(revDL))
throw new Exception("Should never happen: revDL"+
"in optmz. stage NaN or "+
"infinite!");
rstats = new RuleStats(data, ruleset);
rstats.setNumAllConds(m_Total);
rstats.countData(position, newData, prevRuleStats);
//rstats.countData();
double oldDL = rstats.relativeDL(position, expFPRate,
m_CheckErr);
if(Double.isNaN(oldDL) || Double.isInfinite(oldDL))
throw new Exception("Should never happen: oldDL"+
"in optmz. stage NaN or "+
"infinite!");
if(m_Debug)
System.err.println("Old rule: "+
oldRule.toString(m_Class)
+" |dl = "+oldDL);
if(m_Debug)
System.err.println("\nrepDL: "+repDL+
"\nrevDL: "+revDL+
"\noldDL: "+oldDL);
if((oldDL <= revDL) && (oldDL <= repDL))
finalRule = oldRule; // Old the best
else if(revDL <= repDL)
finalRule = revision; // Revision the best
else
finalRule = replace; // Replace the best
}
finalRulesetStat.addAndUpdate(finalRule);
rst = finalRulesetStat.getSimpleStats(position);
if(isResidual){
dl += finalRulesetStat.relativeDL(position,
expFPRate,
m_CheckErr);
if(m_Debug)
System.err.println("After optimization: the dl"
+"="+dl+" | best: "+minDL);
if(dl < minDL)
minDL = dl; // The best dl so far
stop = checkStop(rst, minDL, dl);
if(!stop)
ruleset.addElement(finalRule); // Accepted
else{
finalRulesetStat.removeLast(); // Remove last to be re-used
position--;
}
}
else
ruleset.setElementAt(finalRule, position); // Accepted
if(m_Debug){
System.err.println("The rule covers: "+rst[0]+
" | pos = " + rst[2] +
" | neg = " + rst[4]+
"\nThe rule doesn't cover: "+rst[1]+
" | pos = " + rst[5]);
System.err.println("\nRuleset so far: ");
for(int x=0; x<ruleset.size(); x++)
System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class));
System.err.println();
}
//Data not covered
if(finalRulesetStat.getRulesetSize() > 0)// If any rules
newData = finalRulesetStat.getFiltered(position)[1];
hasPositive = Utils.gr(rst[5], 0.0); //Positives remaining?
position++;
} // while !stop && hasPositive
if(ruleset.size() > (position+1)){ // Hasn't gone through yet
for(int k=position+1; k<ruleset.size(); k++)
finalRulesetStat.addAndUpdate((Rule)ruleset.elementAt(k));
}
if(m_Debug)
System.err.println("\nDeleting rules to decrease"+
" DL of the whole ruleset ...");
finalRulesetStat.reduceDL(expFPRate, m_CheckErr);
if(m_Debug){
int del = ruleset.size() -
finalRulesetStat.getRulesetSize();
System.err.println(del+" rules are deleted"+
" after DL reduction procedure");
}
ruleset = finalRulesetStat.getRuleset();
rstats = finalRulesetStat;
} // For each run of optimization
} // if pruning is used
// Concatenate the ruleset for this class to the whole ruleset
if(m_Debug){
System.err.println("\nFinal ruleset: ");
for(int x=0; x<ruleset.size(); x++)
System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class));
System.err.println();
}
m_Ruleset.appendElements(ruleset);
m_RulesetStats.addElement(rstats);
if(ruleset.size() > 0)// If any rules for this class
return rstats.getFiltered(ruleset.size()-1)[1]; // Data not
else // covered
return data;
}
/**
* Check whether the stopping criterion meets
*
* @param rst the statistic of the ruleset
* @param minDL the min description length so far
* @param dl the current description length of the ruleset
* @return true if stop criterion meets, false otherwise
*/
private boolean checkStop(double[] rst, double minDL, double dl){
if(dl > minDL+MAX_DL_SURPLUS){
if(m_Debug)
System.err.println("DL too large: "+dl+" | "+minDL);
return true;
}
else if(!Utils.gr(rst[2], 0.0)){// Covered positives
if(m_Debug)
System.err.println("Too few positives.");
return true;
}
else if((rst[4]/rst[0]) >= 0.5){// Err rate
if(m_CheckErr){
if(m_Debug)
System.err.println("Error too large: "+
rst[4] + "/" + rst[0]);
return true;
}
else
return false;
}
else{// Not stops
if(m_Debug)
System.err.println("Continue.");
return false;
}
}
/**
* Prints the all the rules of the rule learner.
*
* @return a textual description of the classifier
*/
public String toString() {
if (m_Ruleset == null)
return "JRIP: No model built yet.";
StringBuffer sb = new StringBuffer("JRIP rules:\n"+
"===========\n\n");
for(int j=0; j<m_RulesetStats.size(); j++){
RuleStats rs = (RuleStats)m_RulesetStats.elementAt(j);
FastVector rules = rs.getRuleset();
for(int k=0; k<rules.size(); k++){
double[] simStats = rs.getSimpleStats(k);
sb.append(((RipperRule)rules.elementAt(k)).toString(m_Class)
+ " ("+simStats[0]+"/"+simStats[4]+")\n");
}
}
if(m_Debug){
System.err.println("Inside m_Ruleset");
for(int i=0; i<m_Ruleset.size(); i++)
System.err.println(((RipperRule)m_Ruleset.elementAt(i)).toString(m_Class));
}
sb.append("\nNumber of Rules : "
+ m_Ruleset.size() + "\n");
return sb.toString();
}
/**
* Main method.
*
* @param args the options for the classifier
*/
public static void main(String[] args) {
try {
System.out.println(Evaluation.evaluateModel(new JRip(), args));
} catch (Exception e) {
e.printStackTrace();
System.err.println(e.getMessage());
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -