📄 RuleStats.java
    double coverBits, uncoverBits; // What's the error?
    double expErr;                 // Expected FP or FN
    if(Utils.gr(cover, uncover)){
      expErr = expFPOverErr*(fp+fn);
      coverBits = subsetDL(cover, fp, expErr/cover);
      uncoverBits = Utils.gr(uncover, 0.0) ?
        subsetDL(uncover, fn, fn/uncover) : 0.0;
    }
    else{
      expErr = (1.0-expFPOverErr)*(fp+fn);
      coverBits = Utils.gr(cover, 0.0) ?
        subsetDL(cover, fp, fp/cover) : 0.0;
      uncoverBits = subsetDL(uncover, fn, expErr/uncover);
    }

    /* System.err.println("!!!cover: " + cover + "|uncover" + uncover +
       "|coverBits: "+coverBits+"|uncBits: "+ uncoverBits+
       "|FPRate: "+expFPOverErr + "|expErr: "+expErr+
       "|fp: "+fp+"|fn: "+fn+"|total: "+totalBits);
    */

    return (totalBits + coverBits + uncoverBits);
  }

  /**
   * Calculate the potential to decrease the DL of the ruleset,
   * i.e. the possible DL that could be saved by deleting the rule
   * whose index and simple statistics are given.  If there is no
   * potential (i.e. the potential is <= 0 and the error rate is
   * < 0.5), it returns NaN.<p>
   *
   * The way this procedure works is copied from the original RIPPER
   * implementation and is quite bizarre: it does not recursively
   * update the stats of the following rules when testing each rule,
   * which means it assumes that after the deletion no data is covered
   * by the following rules (i.e. it regards the deleted rule as the
   * last rule).  Is that a reasonable assumption?<p>
   *
   * @param index the index of the rule in m_Ruleset to be deleted
   * @param expFPOverErr expected FP/(FP+FN)
   * @param rulesetStat the simple statistics of the ruleset, updated
   *                    if the rule should be deleted
   * @param ruleStat the simple statistics of the rule to be deleted
   * @param checkErr whether to check if the error rate >= 0.5
   * @return the potential DL that could be saved
   */
  public double potential(int index, double expFPOverErr,
                          double[] rulesetStat, double[] ruleStat,
                          boolean checkErr){
    //System.out.println("!!!inside potential: ");
    // Restore the stats as if the rule were deleted
    double pcov = rulesetStat[0] - ruleStat[0];
    double puncov = rulesetStat[1] + ruleStat[0];
    double pfp = rulesetStat[4] - ruleStat[4];
    double pfn = rulesetStat[5] + ruleStat[2];

    double dataDLWith = dataDL(expFPOverErr, rulesetStat[0], rulesetStat[1],
                               rulesetStat[4], rulesetStat[5]);
    double theoryDLWith = theoryDL(index);
    double dataDLWithout = dataDL(expFPOverErr, pcov, puncov, pfp, pfn);

    double potential = dataDLWith + theoryDLWith - dataDLWithout;
    double err = ruleStat[4] / ruleStat[0];
    /*System.out.println("!!!"+dataDLWith +" | "+ theoryDLWith + " | "
                       +dataDLWithout+"|"+ruleStat[4] + " / " + ruleStat[0]);
    */
    boolean overErr = Utils.grOrEq(err, 0.5);
    if(!checkErr)
      overErr = false;

    if(Utils.grOrEq(potential, 0.0) || overErr){
      // If deleted, update the ruleset stats.  Other stats do not matter.
      rulesetStat[0] = pcov;
      rulesetStat[1] = puncov;
      rulesetStat[4] = pfp;
      rulesetStat[5] = pfn;
      return potential;
    }
    else
      return Double.NaN;
  }
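  // --- Editor's note: an illustrative sketch, not part of the original
  // source.  dataDL() above leans on subsetDL(t, k, p), the usual MDL
  // cost (in bits) of identifying which k of t elements are "special"
  // when each element is special with probability p, i.e.
  //   subsetDL(t, k, p) = -k*log2(p) - (t-k)*log2(1-p).
  // Assuming that formulation, a minimal stand-alone version could read:
  private static double subsetDLSketch(double t, double k, double p){
    double bits = 0.0;
    if(p > 0.0)
      bits -= k * (Math.log(p) / Math.log(2));           // bits for the k hits
    if(p < 1.0)
      bits -= (t - k) * (Math.log(1 - p) / Math.log(2)); // bits for the misses
    return bits;
  }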
  /**
   * Compute the minimal data description length of the ruleset
   * if the rule in the given position is deleted.<br>
   * The min_data_DL_if_deleted = data_DL_if_deleted - potential
   *
   * @param index the index of the rule in question
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param checkErr whether to check if the error rate >= 0.5
   * @return the minDataDL
   */
  public double minDataDLIfDeleted(int index, double expFPRate,
                                   boolean checkErr){
    //System.out.println("!!!Enter without: ");
    double[] rulesetStat = new double[6]; // Stats of ruleset if deleted
    int more = m_Ruleset.size() - 1 - index; // How many rules after?
    FastVector indexPlus = new FastVector(more); // Their stats

    // 0...(index-1) are OK
    for(int j=0; j<index; j++){
      // Covered stats are cumulative
      rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
      rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
      rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
    }

    // Recount data from index+1
    Instances data = (index == 0) ?
      m_Data : ((Instances[])m_Filtered.elementAt(index-1))[1];
    //System.out.println("!!!without: " + data.sumOfWeights());

    for(int j=(index+1); j<m_Ruleset.size(); j++){
      double[] stats = new double[6];
      Instances[] split = computeSimpleStats(j, data, stats, null);
      indexPlus.addElement(stats);
      rulesetStat[0] += stats[0];
      rulesetStat[2] += stats[2];
      rulesetStat[4] += stats[4];
      data = split[1];
    }

    // Uncovered stats are those of the last rule
    if(more > 0){
      rulesetStat[1] = ((double[])indexPlus.lastElement())[1];
      rulesetStat[3] = ((double[])indexPlus.lastElement())[3];
      rulesetStat[5] = ((double[])indexPlus.lastElement())[5];
    }
    else if(index > 0){
      rulesetStat[1] = ((double[])m_SimpleStats.elementAt(index-1))[1];
      rulesetStat[3] = ((double[])m_SimpleStats.elementAt(index-1))[3];
      rulesetStat[5] = ((double[])m_SimpleStats.elementAt(index-1))[5];
    }
    else{ // Null coverage
      rulesetStat[1] = ((double[])m_SimpleStats.elementAt(0))[0]
                     + ((double[])m_SimpleStats.elementAt(0))[1];
      rulesetStat[3] = ((double[])m_SimpleStats.elementAt(0))[3]
                     + ((double[])m_SimpleStats.elementAt(0))[4];
      rulesetStat[5] = ((double[])m_SimpleStats.elementAt(0))[2]
                     + ((double[])m_SimpleStats.elementAt(0))[5];
    }

    // Potential
    double potential = 0;
    for(int k=index+1; k<m_Ruleset.size(); k++){
      double[] ruleStat = (double[])indexPlus.elementAt(k-index-1);
      double ifDeleted = potential(k, expFPRate, rulesetStat,
                                   ruleStat, checkErr);
      if(!Double.isNaN(ifDeleted))
        potential += ifDeleted;
    }

    // Data DL of the ruleset without the rule.  Note that the ruleset
    // stats have already been updated to reflect any deletion with
    // potential.
    double dataDLWithout = dataDL(expFPRate, rulesetStat[0], rulesetStat[1],
                                  rulesetStat[4], rulesetStat[5]);
    //System.out.println("!!!without: "+dataDLWithout + " |potential: "+
    //                   potential);

    // Why subtract potential again?  To reflect the change of theory DL??
    return (dataDLWithout - potential);
  }
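  // --- Editor's note: an illustrative sketch, not part of the original
  // source.  The six-slot "simple stats" arrays threaded through the
  // methods above appear to be laid out as:
  //   [0] coverage          [1] uncoverage
  //   [2] true positives    [3] true negatives
  //   [4] false positives   [5] false negatives
  // Assuming that layout, a tiny helper for debugging them might read:
  private static String statsToStringSketch(double[] s){
    return "cov=" + s[0] + " uncov=" + s[1]
         + " tp=" + s[2] + " tn=" + s[3]
         + " fp=" + s[4] + " fn=" + s[5];
  }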
  /**
   * Compute the minimal data description length of the ruleset
   * if the rule in the given position is NOT deleted.<br>
   * The min_data_DL_if_not_deleted = data_DL_if_not_deleted - potential
   *
   * @param index the index of the rule in question
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param checkErr whether to check if the error rate >= 0.5
   * @return the minDataDL
   */
  public double minDataDLIfExists(int index, double expFPRate,
                                  boolean checkErr){
    // System.out.println("!!!Enter with: ");
    double[] rulesetStat = new double[6]; // Stats of ruleset if rule exists
    for(int j=0; j<m_SimpleStats.size(); j++){
      // Covered stats are cumulative
      rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
      rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
      rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
      if(j == m_SimpleStats.size()-1){ // Last rule
        rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
        rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
        rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
      }
    }

    // Potential
    double potential = 0;
    for(int k=index+1; k<m_SimpleStats.size(); k++){
      double[] ruleStat = (double[])getSimpleStats(k);
      double ifDeleted = potential(k, expFPRate, rulesetStat,
                                   ruleStat, checkErr);
      if(!Double.isNaN(ifDeleted))
        potential += ifDeleted;
    }

    // Data DL of the ruleset with the rule.  Note that the ruleset stats
    // have already been updated to reflect any deletion with potential.
    double dataDLWith = dataDL(expFPRate, rulesetStat[0], rulesetStat[1],
                               rulesetStat[4], rulesetStat[5]);
    //System.out.println("!!!with: "+dataDLWith + " |potential: "+
    //                   potential);
    return (dataDLWith - potential);
  }
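  // --- Editor's note: an illustrative sketch, not part of the original
  // source.  Both methods above follow the same pattern: compute the data
  // DL of a candidate ruleset, then subtract every non-NaN "potential"
  // saving that deleting a later rule would yield.  Schematically:
  private static double minDLSketch(double dataDL, double[] potentials){
    double minDL = dataDL;
    for(int i=0; i<potentials.length; i++)
      if(!Double.isNaN(potentials[i])) // NaN marks "no saving from deletion"
        minDL -= potentials[i];
    return minDL;
  }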
  /**
   * The description length (DL) of the ruleset relative to the case
   * where the rule in the given position is deleted, which is
   * obtained by: <br>
   * MDL if the rule exists - MDL if the rule does not exist <br>
   * Note that the minimal possible DL of the ruleset is calculated
   * (i.e. some other rules may also be deleted) instead of the DL of
   * the current ruleset.<p>
   *
   * @param index the given position of the rule in question
   *              (assumed correct)
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param checkErr whether to check if the error rate >= 0.5
   * @return the relative DL
   */
  public double relativeDL(int index, double expFPRate, boolean checkErr){
    return (minDataDLIfExists(index, expFPRate, checkErr)
            + theoryDL(index)
            - minDataDLIfDeleted(index, expFPRate, checkErr));
  }

  /**
   * Try to reduce the DL of the ruleset by testing the removal of the
   * rules one by one in reverse order, updating all the stats.
   *
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param checkErr whether to check if the error rate >= 0.5
   */
  public void reduceDL(double expFPRate, boolean checkErr){
    boolean needUpdate = false;
    double[] rulesetStat = new double[6];
    for(int j=0; j<m_SimpleStats.size(); j++){
      // Covered stats are cumulative
      rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
      rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
      rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
      if(j == m_SimpleStats.size()-1){ // Last rule
        rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
        rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
        rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
      }
    }

    // Potential
    double potential = 0;
    for(int k=m_SimpleStats.size()-1; k>=0; k--){
      double[] ruleStat = (double[])m_SimpleStats.elementAt(k);

      // rulesetStat updated
      double ifDeleted = potential(k, expFPRate, rulesetStat,
                                   ruleStat, checkErr);
      if(!Double.isNaN(ifDeleted)){
        /*System.err.println("!!!deleted ("+k+"): save "+ifDeleted
          +" | "+rulesetStat[0]
          +" | "+rulesetStat[1]
          +" | "+rulesetStat[4]
          +" | "+rulesetStat[5]);
        */
        if(k == (m_SimpleStats.size()-1))
          removeLast();
        else{
          m_Ruleset.removeElementAt(k);
          needUpdate = true;
        }
      }
    }

    if(needUpdate){
      m_Filtered = null;
      m_SimpleStats = null;
      countData();
    }
  }

  /**
   * Remove the last rule in the ruleset as well as its statistics.
   * This might be useful when the last rule was added for testing
   * purposes and the test then failed.
   */
  public void removeLast(){
    int last = m_Ruleset.size()-1;
    m_Ruleset.removeElementAt(last);
    m_Filtered.removeElementAt(last);
    m_SimpleStats.removeElementAt(last);
    if(m_Distributions != null)
      m_Distributions.removeElementAt(last);
  }

  /**
   * Static utility function to remove from the given data the instances
   * covered by the rules after the given index in the given ruleset.
   * It returns the data not covered by those successive rules.
   *
   * @param data the data to be processed
   * @param rules the ruleset
   * @param index the given index
   * @return the data after processing
   */
  public static Instances rmCoveredBySuccessives(Instances data,
                                                 FastVector rules,
                                                 int index){
    Instances rt = new Instances(data, 0);

    for(int i=0; i < data.numInstances(); i++){
      Instance datum = data.instance(i);
      boolean covered = false;

      for(int j=index+1; j<rules.size(); j++){
        Rule rule = (Rule)rules.elementAt(j);
        if(rule.covers(datum)){
          covered = true;
          break;
        }
      }

      if(!covered)
        rt.add(datum);
    }
    return rt;
  }
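  // --- Editor's note: an illustrative usage sketch, not part of the
  // original source.  When re-examining rule i of an existing ruleset,
  // the data is typically first cleaned of everything that the later
  // rules would catch anyway, e.g.:
  //
  //   Instances pruneData =
  //     RuleStats.rmCoveredBySuccessives(data, ruleset, i);
  //
  // so that rule i is judged only on the instances it is actually
  // responsible for.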
  /**
   * Stratify the given data into the given number of bags based on the
   * class values.  It differs from <code>Instances.stratify(int fold)</code>
   * in that, before stratification, it sorts the instances according to
   * the class order in the header.  It assumes no missing values in the
   * class.
   *
   * @param data the given data
   * @param folds the given number of folds
   * @param rand the random object used to randomize the instances
   * @return the stratified instances
   */
  public static final Instances stratify(Instances data, int folds,
                                         Random rand){
    if(!data.classAttribute().isNominal())
      return data;

    Instances result = new Instances(data, 0);
    Instances[] bagsByClasses = new Instances[data.numClasses()];
    for(int i=0; i < bagsByClasses.length; i++)
      bagsByClasses[i] = new Instances(data, 0);

    // Sort by class
    for(int j=0; j < data.numInstances(); j++){
      Instance datum = data.instance(j);
      bagsByClasses[(int)datum.classValue()].add(datum);
    }

    // Randomize each class
    for(int j=0; j < bagsByClasses.length; j++)
      bagsByClasses[j].randomize(rand);

    for(int k=0; k < folds; k++){
      int offset = k, bag = 0;
      oneFold:
      while (true){
        while(offset >= bagsByClasses[bag].numInstances()){
          offset -= bagsByClasses[bag].numInstances();
          if (++bag >= bagsByClasses.length) // Next bag
            break oneFold;
        }
        result.add(bagsByClasses[bag].instance(offset));
        offset += folds;
      }
    }

    return result;
  }

  /**
   * Compute the combined DL of the ruleset in this class, i.e. the
   * theory DL plus the data DL.  Note this procedure computes the
   * combined DL according to the current status of the ruleset in
   * this class.
   *
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param predicted the default classification used if the ruleset
   *                  covers no data
   * @return the combined DL
   */
  public double combinedDL(double expFPRate, double predicted){
    double rt = 0;

    if(getRulesetSize() > 0){
      double[] stats = (double[])m_SimpleStats.lastElement();
      for(int j=getRulesetSize()-2; j >= 0; j--){
        stats[0] += getSimpleStats(j)[0];
        stats[2] += getSimpleStats(j)[2];
        stats[4] += getSimpleStats(j)[4];
      }
      rt += dataDL(expFPRate, stats[0], stats[1],
                   stats[4], stats[5]); // Data DL
    }
    else{ // Null coverage ruleset
      double fn = 0.0;
      for(int j=0; j < m_Data.numInstances(); j++)
        if((int)m_Data.instance(j).classValue() == (int)predicted)
          fn += m_Data.instance(j).weight();
      rt += dataDL(expFPRate, 0.0, m_Data.sumOfWeights(), 0.0, fn);
    }

    for(int i=0; i<getRulesetSize(); i++) // Theory DL
      rt += theoryDL(i);

    return rt;
  }

  /**
   * Partition the data into two parts: the first contains
   * (numFolds-1)/numFolds of the data and the second 1/numFolds of
   * the data.
   *
   * @param data the given data
   * @param numFolds the given number of folds
   * @return the partitioned instances
   */
  public static final Instances[] partition(Instances data, int numFolds){
    Instances[] rt = new Instances[2];
    int splits = data.numInstances() * (numFolds - 1) / numFolds;

    rt[0] = new Instances(data, 0, splits);
    rt[1] = new Instances(data, splits, data.numInstances()-splits);

    return rt;
  }
}
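// --- Editor's note: an illustrative usage sketch, not part of the original
// source.  stratify() and partition() are designed to be used together for
// a RIPPER-style grow/prune split: stratifying first keeps roughly the class
// proportions of the full set in both parts.  For example, with 3 folds
// (2/3 grow data, 1/3 prune data):
//
//   Instances shuffled  = RuleStats.stratify(data, 3, new Random(1));
//   Instances[] parts   = RuleStats.partition(shuffled, 3);
//   Instances growData  = parts[0]; // (numFolds-1)/numFolds of the data
//   Instances pruneData = parts[1]; // 1/numFolds of the data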