// rulestats.java
double coverBits, uncoverBits; // Bits to encode the errors in the covered/uncovered data
double expErr; // Expected number of FPs or FNs
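// The errors are encoded in two parts: FPs among the covered data and
// FNs among the uncovered data. The larger of the two regions is
// charged at the expected error rate derived from expFPOverErr, the
// smaller at its observed error rate. Assuming the usual WEKA
// definition of subsetDL, subsetDL(t, k, p) is the cost in bits of
// identifying k special elements out of t when each is special with
// probability p: -k*log2(p) - (t-k)*log2(1-p).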
if(Utils.gr(cover, uncover)){
expErr = expFPOverErr*(fp+fn);
coverBits = subsetDL(cover, fp, expErr/cover);
uncoverBits = Utils.gr(uncover, 0.0) ?
subsetDL(uncover, fn, fn/uncover) : 0.0;
}
else{
expErr = (1.0-expFPOverErr)*(fp+fn);
coverBits = Utils.gr(cover, 0.0) ?
subsetDL(cover, fp, fp/cover) : 0.0;
uncoverBits = subsetDL(uncover, fn, expErr/uncover);
}
/*
System.err.println("!!!cover: " + cover + "|uncover" + uncover +
"|coverBits: "+coverBits+"|uncBits: "+ uncoverBits+
"|FPRate: "+expFPOverErr + "|expErr: "+expErr+
"|fp: "+fp+"|fn: "+fn+"|total: "+totalBits);
*/
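// Total data DL: totalBits plus the bits identifying the FPs among
// the covered data and the FNs among the uncovered data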
return (totalBits + coverBits + uncoverBits);
}
/**
* Calculate the potential to decrease the DL of the ruleset,
* i.e. the possible DL that could be saved by deleting
* the rule whose index and simple statistics are given.
* If there is no potential (i.e. the potential is negative and
* the error rate is below 0.5), it returns NaN. <p>
*
* The way this procedure works is copied from the original RIPPER
* implementation and is quite bizarre: when testing each rule it
* does not recursively update the stats of the following rules,
* which means it assumes that after deletion no data are covered
* by the following rules (i.e. it treats the deleted rule as if it
* were the last rule). Is that a reasonable assumption?<p>
*
* @param index the index of the rule in m_Ruleset to be deleted
* @param expFPOverErr expected FP/(FP+FN)
* @param rulesetStat the simple statistics of the ruleset, updated
* if the rule should be deleted
* @param ruleStat the simple statistics of the rule to be deleted
* @param checkErr whether to check if the error rate >= 0.5
* @return the potential DL decrease, or NaN if there is none
*/
public double potential(int index, double expFPOverErr,
double[] rulesetStat, double[] ruleStat,
boolean checkErr){
//System.out.println("!!!inside potential: ");
// Restore the stats if deleted
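// Stats array layout, as the arithmetic below implies:
// [0] coverage, [1] uncoverage, [2] true positives, [3] true
// negatives, [4] false positives, [5] false negatives.
// Deleting the rule moves its coverage into uncoverage, drops its
// FPs, and turns its TPs into FNs of the ruleset.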
double pcov = rulesetStat[0] - ruleStat[0];
double puncov = rulesetStat[1] + ruleStat[0];
double pfp = rulesetStat[4] - ruleStat[4];
double pfn = rulesetStat[5] + ruleStat[2];
double dataDLWith = dataDL(expFPOverErr, rulesetStat[0],
rulesetStat[1], rulesetStat[4],
rulesetStat[5]);
double theoryDLWith = theoryDL(index);
double dataDLWithout = dataDL(expFPOverErr, pcov, puncov, pfp, pfn);
double potential = dataDLWith + theoryDLWith - dataDLWithout;
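// A positive potential means deletion pays: the rule's theory bits
// plus the data bits with the rule exceed the data bits without it.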
double err = ruleStat[4] / ruleStat[0];
/*System.out.println("!!!"+dataDLWith +" | "+
theoryDLWith + " | "
+dataDLWithout+"|"+ruleStat[4] + " / " + ruleStat[0]);
*/
boolean overErr = Utils.grOrEq(err, 0.5);
if(!checkErr)
overErr = false;
if(Utils.grOrEq(potential, 0.0) || overErr){
// If deleted, update ruleset stats. Other stats do not matter
rulesetStat[0] = pcov;
rulesetStat[1] = puncov;
rulesetStat[4] = pfp;
rulesetStat[5] = pfn;
return potential;
}
else
return Double.NaN;
}
/**
* Compute the minimal data description length of the ruleset
* if the rule in the given position is deleted.<br>
* The min_data_DL_if_deleted = data_DL_if_deleted - potential
*
* @param index the index of the rule in question
* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
* @param checkErr whether to check if the error rate >= 0.5
* @return the minDataDL
*/
public double minDataDLIfDeleted(int index, double expFPRate,
boolean checkErr){
//System.out.println("!!!Enter without: ");
double[] rulesetStat = new double[6]; // Stats of ruleset if deleted
int more = m_Ruleset.size() - 1 - index; // How many rules after?
FastVector indexPlus = new FastVector(more); // Their stats
// 0...(index-1) are OK
for(int j=0; j<index; j++){
// Covered stats are cumulative
rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
}
// Recount data from index+1
Instances data = (index == 0) ?
m_Data : ((Instances[])m_Filtered.elementAt(index-1))[1];
//System.out.println("!!!without: " + data.sumOfWeights());
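// Re-filter the remaining data through each rule after the deleted
// one: stats for rule j are computed on the data left over by the
// previous rule, and split[1] (the data rule j leaves uncovered)
// feeds the next rule.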
for(int j=(index+1); j<m_Ruleset.size(); j++){
double[] stats = new double[6];
Instances[] split = computeSimpleStats(j, data, stats, null);
indexPlus.addElement(stats);
rulesetStat[0] += stats[0];
rulesetStat[2] += stats[2];
rulesetStat[4] += stats[4];
data = split[1];
}
// Uncovered stats are those of the last rule
if(more > 0){
rulesetStat[1] = ((double[])indexPlus.lastElement())[1];
rulesetStat[3] = ((double[])indexPlus.lastElement())[3];
rulesetStat[5] = ((double[])indexPlus.lastElement())[5];
}
else if(index > 0){
rulesetStat[1] =
((double[])m_SimpleStats.elementAt(index-1))[1];
rulesetStat[3] =
((double[])m_SimpleStats.elementAt(index-1))[3];
rulesetStat[5] =
((double[])m_SimpleStats.elementAt(index-1))[5];
}
else{ // Null coverage
rulesetStat[1] = ((double[])m_SimpleStats.elementAt(0))[0] +
((double[])m_SimpleStats.elementAt(0))[1];
rulesetStat[3] = ((double[])m_SimpleStats.elementAt(0))[3] +
((double[])m_SimpleStats.elementAt(0))[4];
rulesetStat[5] = ((double[])m_SimpleStats.elementAt(0))[2] +
((double[])m_SimpleStats.elementAt(0))[5];
}
// Potential
double potential = 0;
for(int k=index+1; k<m_Ruleset.size(); k++){
double[] ruleStat = (double[])indexPlus.elementAt(k-index-1);
double ifDeleted = potential(k, expFPRate, rulesetStat,
ruleStat, checkErr);
if(!Double.isNaN(ifDeleted))
potential += ifDeleted;
}
// Data DL of the ruleset without the rule
// Note that the ruleset stats have already been updated to reflect
// deletions wherever there was any potential
double dataDLWithout = dataDL(expFPRate, rulesetStat[0],
rulesetStat[1], rulesetStat[4],
rulesetStat[5]);
//System.out.println("!!!without: "+dataDLWithout + " |potential: "+
// potential);
// Why subtract potential again? To reflect change of theory DL??
return (dataDLWithout - potential);
}
/**
* Compute the minimal data description length of the ruleset
* if the rule in the given position is NOT deleted.<br>
* The min_data_DL_if_n_deleted = data_DL_if_n_deleted - potential
*
* @param index the index of the rule in question
* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
* @param checkErr whether to check if the error rate >= 0.5
* @return the minDataDL
*/
public double minDataDLIfExists(int index, double expFPRate,
boolean checkErr){
// System.out.println("!!!Enter with: ");
double[] rulesetStat = new double[6]; // Stats of ruleset if rule exists
for(int j=0; j<m_SimpleStats.size(); j++){
// Covered stats are cumulative
rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
if(j == m_SimpleStats.size()-1){ // Last rule
rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
}
}
// Potential
double potential = 0;
for(int k=index+1; k<m_SimpleStats.size(); k++){
double[] ruleStat = (double[])getSimpleStats(k);
double ifDeleted = potential(k, expFPRate, rulesetStat,
ruleStat, checkErr);
if(!Double.isNaN(ifDeleted))
potential += ifDeleted;
}
// Data DL of the ruleset with the rule
// Note that the ruleset stats have already been updated to reflect
// deletions wherever there was any potential
double dataDLWith = dataDL(expFPRate, rulesetStat[0],
rulesetStat[1], rulesetStat[4],
rulesetStat[5]);
//System.out.println("!!!with: "+dataDLWith + " |potential: "+
// potential);
return (dataDLWith - potential);
}
/**
* The description length (DL) of the ruleset relative to if the
* rule in the given position is deleted, which is obtained by: <br>
* MDL if the rule exists - MDL if the rule does not exist <br>
* Note the minimal possible DL of the ruleset is calculated (i.e.
* some other rules may also be deleted) instead of the DL of the
* current ruleset.<p>
*
* @param index the given position of the rule in question
* (assuming correct)
* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
* @param checkErr whether to check if the error rate >= 0.5
* @return the relative DL
*/
public double relativeDL(int index, double expFPRate, boolean checkErr){
return (minDataDLIfExists(index, expFPRate, checkErr)
+ theoryDL(index) -
minDataDLIfDeleted(index, expFPRate, checkErr));
}
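// Illustrative sketch, not part of the original source (`stats`, `i`
// and `expFPRate` below are hypothetical): a pruning loop might drop
// rule i when keeping it costs more bits than deleting it, i.e.
//
//   if (stats.relativeDL(i, expFPRate, true) > 0.0) {
//       // deleting rule i shortens the combined description length
//   }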
/**
* Try to reduce the DL of the ruleset by testing the removal of the
* rules one by one in reverse order, and update all the stats
*
* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
* @param checkErr whether to check if the error rate >= 0.5
*/
public void reduceDL(double expFPRate, boolean checkErr){
boolean needUpdate = false;
double[] rulesetStat = new double[6];
for(int j=0; j<m_SimpleStats.size(); j++){
// Covered stats are cumulative
rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
if(j == m_SimpleStats.size()-1){ // Last rule
rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
}
}
// Potential
double potential = 0;
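// Scan from the last rule backwards so that, by the time rule k is
// tested, every later rule has already been tested (and possibly
// deleted) and rulesetStat reflects those deletions.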
for(int k=m_SimpleStats.size()-1; k>=0; k--){
double[] ruleStat = (double[])m_SimpleStats.elementAt(k);
// rulesetStat updated
double ifDeleted = potential(k, expFPRate, rulesetStat,
ruleStat, checkErr);
if(!Double.isNaN(ifDeleted)){
/*System.err.println("!!!deleted ("+k+"): save "+ifDeleted
+" | "+rulesetStat[0]
+" | "+rulesetStat[1]
+" | "+rulesetStat[4]
+" | "+rulesetStat[5]);
*/
if(k == (m_SimpleStats.size()-1))
removeLast();
else{
m_Ruleset.removeElementAt(k);
needUpdate = true;
}
}
}
if(needUpdate){
m_Filtered = null;
m_SimpleStats = null;
countData();
}
}
/**
* Remove the last rule in the ruleset as well as its stats.
* This is useful when the last rule was added for testing
* purposes and the test then failed
*/
public void removeLast(){
int last = m_Ruleset.size()-1;
m_Ruleset.removeElementAt(last);
m_Filtered.removeElementAt(last);
m_SimpleStats.removeElementAt(last);
if(m_Distributions != null)
m_Distributions.removeElementAt(last);
}
/**
* Static utility function to remove from the given data the
* instances covered by the rules after the given index in the
* given ruleset. It returns the data not covered by those
* successive rules.
*
* @param data the data to be processed
* @param rules the ruleset
* @param index the given index
* @return the data after processing
*/
public static Instances rmCoveredBySuccessives(Instances data, FastVector rules, int index){
Instances rt = new Instances(data, 0);
for(int i=0; i < data.numInstances(); i++){
Instance datum = data.instance(i);
boolean covered = false;
for(int j=index+1; j<rules.size();j++){
Rule rule = (Rule)rules.elementAt(j);
if(rule.covers(datum)){
covered = true;
break;
}
}
if(!covered)
rt.add(datum);
}
return rt;
}
/**
* Stratify the given data into the given number of bags based on the
* class values. It differs from <code>Instances.stratify(int fold)</code>
* in that, before stratification, it sorts the instances according
* to the class order in the header file. It assumes no missing class
* values.
*
* @param data the given data
* @param folds the given number of folds
* @param rand the random object used to randomize the instances
* @return the stratified instances
*/
public static final Instances stratify(Instances data, int folds, Random rand){
if(!data.classAttribute().isNominal())
return data;
Instances result = new Instances(data, 0);
Instances[] bagsByClasses = new Instances[data.numClasses()];
for(int i=0; i < bagsByClasses.length; i++)
bagsByClasses[i] = new Instances(data, 0);
// Sort by class
for(int j=0; j < data.numInstances(); j++){
Instance datum = data.instance(j);
bagsByClasses[(int)datum.classValue()].add(datum);
}
// Randomize each class
for(int j=0; j < bagsByClasses.length; j++)
bagsByClasses[j].randomize(rand);
for(int k=0; k < folds; k++){
int offset = k, bag = 0;
oneFold:
while (true){
while(offset >= bagsByClasses[bag].numInstances()){
offset -= bagsByClasses[bag].numInstances();
if (++bag >= bagsByClasses.length)// Next bag
break oneFold;
}
result.add(bagsByClasses[bag].instance(offset));
offset += folds;
}
}
return result;
}
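// Worked example (illustrative): with folds = 3 and the class-sorted,
// per-class-shuffled instances a0 a1 a2 a3 b0 b1 (4 of class a, 2 of
// class b), fold 0 takes virtual positions 0,3 -> a0 a3; fold 1 takes
// 1,4 -> a1 b0; fold 2 takes 2,5 -> a2 b1. The folds are appended in
// order, so each fold is a contiguous, class-stratified block of the
// returned data.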
/**
* Compute the combined DL of the ruleset in this class, i.e. theory
* DL plus data DL. Note this procedure computes the combined DL
* according to the current status of the ruleset in this class
*
* @param expFPRate expected FP/(FP+FN), used in dataDL calculation
* @param predicted the default classification if the ruleset covers
* nothing
* @return the combined DL
*/
public double combinedDL(double expFPRate, double predicted){
double rt = 0;
if(getRulesetSize() > 0) {
double[] stats = (double[])m_SimpleStats.lastElement();
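// Note: `stats` aliases the cached stats array of the last rule, so
// the accumulation below writes into that cached entry as a side
// effect.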
for(int j=getRulesetSize()-2; j >= 0; j--){
stats[0] += getSimpleStats(j)[0];
stats[2] += getSimpleStats(j)[2];
stats[4] += getSimpleStats(j)[4];
}
rt += dataDL(expFPRate, stats[0], stats[1],
stats[4], stats[5]); // Data DL
}
else{ // Null coverage ruleset
double fn = 0.0;
for(int j=0; j < m_Data.numInstances(); j++)
if((int)m_Data.instance(j).classValue() == (int)predicted)
fn += m_Data.instance(j).weight();
rt += dataDL(expFPRate, 0.0, m_Data.sumOfWeights(), 0.0, fn);
}
for(int i=0; i<getRulesetSize(); i++) // Theory DL
rt += theoryDL(i);
return rt;
}
/**
* Partition the data into 2 parts, the first of which has
* (numFolds-1)/numFolds of the data and the second 1/numFolds
* of the data
*
* @param data the given data
* @param numFolds the given number of folds
* @return the partitioned instances
*/
public static final Instances[] partition(Instances data, int numFolds){
Instances[] rt = new Instances[2];
int splits = data.numInstances() * (numFolds - 1) / numFolds;
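// e.g. 10 instances and numFolds = 3: splits = 10*2/3 = 6 (integer
// division), so rt[0] gets the first 6 instances and rt[1] the last 4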
rt[0] = new Instances(data, 0, splits);
rt[1] = new Instances(data, splits, data.numInstances()-splits);
return rt;
}
}