📄 RuleStats.java
    double coverBits, uncoverBits; // What's the error?
    double expErr;                 // Expected FP or FN
    if(Utils.gr(cover, uncover)){
      expErr = expFPOverErr*(fp+fn);
      coverBits = subsetDL(cover, fp, expErr/cover);
      uncoverBits = Utils.gr(uncover, 0.0) ?
        subsetDL(uncover, fn, fn/uncover) : 0.0;
    }
    else{
      expErr = (1.0-expFPOverErr)*(fp+fn);
      coverBits = Utils.gr(cover, 0.0) ?
        subsetDL(cover, fp, fp/cover) : 0.0;
      uncoverBits = subsetDL(uncover, fn, expErr/uncover);
    }

    /* System.err.println("!!!cover: " + cover + "|uncover" + uncover +
       "|coverBits: "+coverBits+"|uncBits: "+ uncoverBits+
       "|FPRate: "+expFPOverErr + "|expErr: "+expErr+
       "|fp: "+fp+"|fn: "+fn+"|total: "+totalBits);
    */

    return (totalBits + coverBits + uncoverBits);
  }

  /**
   * Calculate the potential to decrease the DL of the ruleset,
   * i.e. the possible DL that could be saved by deleting the rule
   * whose index and simple statistics are given.  If there is no
   * potential (i.e. the potential is <= 0 and the error rate is
   * < 0.5), it returns NaN.<p>
   *
   * The way this procedure works is copied from the original RIPPER
   * implementation and is quite bizarre: it does not recursively
   * update the stats of the following rules when testing each rule,
   * which means it assumes that after the deletion no data is covered
   * by the following rules (i.e. it regards the deleted rule as the
   * last rule).  Is that a reasonable assumption?<p>
   *
   * @param index the index of the rule in m_Ruleset to be deleted
   * @param expFPOverErr expected FP/(FP+FN)
   * @param rulesetStat the simple statistics of the ruleset, updated
   *                    if the rule should be deleted
   * @param ruleStat the simple statistics of the rule to be deleted
   * @param checkErr whether to check if the error rate >= 0.5
   * @return the potential DL that could be saved
   */
  public double potential(int index, double expFPOverErr,
                          double[] rulesetStat, double[] ruleStat,
                          boolean checkErr){
    //System.out.println("!!!inside potential: ");
    // Restore the stats as if the rule were deleted
    double pcov = rulesetStat[0] - ruleStat[0];
    double puncov = rulesetStat[1] + ruleStat[0];
    double pfp = rulesetStat[4] - ruleStat[4];
    double pfn = rulesetStat[5] + ruleStat[2];

    double dataDLWith = dataDL(expFPOverErr, rulesetStat[0], rulesetStat[1],
                               rulesetStat[4], rulesetStat[5]);
    double theoryDLWith = theoryDL(index);
    double dataDLWithout = dataDL(expFPOverErr, pcov, puncov, pfp, pfn);

    double potential = dataDLWith + theoryDLWith - dataDLWithout;
    double err = ruleStat[4] / ruleStat[0];
    /*System.out.println("!!!"+dataDLWith +" | "+ theoryDLWith + " | "
                       +dataDLWithout+"|"+ruleStat[4] + " / " + ruleStat[0]);
    */
    boolean overErr = Utils.grOrEq(err, 0.5);
    if(!checkErr)
      overErr = false;

    if(Utils.grOrEq(potential, 0.0) || overErr){
      // If deleted, update the ruleset stats.  Other stats do not matter.
      rulesetStat[0] = pcov;
      rulesetStat[1] = puncov;
      rulesetStat[4] = pfp;
      rulesetStat[5] = pfn;
      return potential;
    }
    else
      return Double.NaN;
  }
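  // --- Editor's note: an illustrative sketch, not part of the original
  // source.  dataDL() above leans on subsetDL(t, k, p), the usual MDL
  // cost (in bits) of identifying which k of t elements are "special"
  // when each element is special with probability p, i.e.
  //   subsetDL(t, k, p) = -k*log2(p) - (t-k)*log2(1-p).
  // Assuming that formulation, a minimal stand-alone version could read:
  private static double subsetDLSketch(double t, double k, double p){
    double bits = 0.0;
    if(p > 0.0)
      bits -= k * (Math.log(p) / Math.log(2));           // bits for the k hits
    if(p < 1.0)
      bits -= (t - k) * (Math.log(1 - p) / Math.log(2)); // bits for the misses
    return bits;
  }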
  /**
   * Compute the minimal data description length of the ruleset
   * if the rule in the given position is deleted.<br>
   * The min_data_DL_if_deleted = data_DL_if_deleted - potential
   *
   * @param index the index of the rule in question
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param checkErr whether to check if the error rate >= 0.5
   * @return the minDataDL
   */
  public double minDataDLIfDeleted(int index, double expFPRate,
                                   boolean checkErr){
    //System.out.println("!!!Enter without: ");
    double[] rulesetStat = new double[6]; // Stats of ruleset if deleted
    int more = m_Ruleset.size() - 1 - index; // How many rules after?
    FastVector indexPlus = new FastVector(more); // Their stats

    // 0...(index-1) are OK
    for(int j=0; j<index; j++){
      // Covered stats are cumulative
      rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
      rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
      rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
    }

    // Recount data from index+1
    Instances data = (index == 0) ?
      m_Data : ((Instances[])m_Filtered.elementAt(index-1))[1];
    //System.out.println("!!!without: " + data.sumOfWeights());

    for(int j=(index+1); j<m_Ruleset.size(); j++){
      double[] stats = new double[6];
      Instances[] split = computeSimpleStats(j, data, stats, null);
      indexPlus.addElement(stats);
      rulesetStat[0] += stats[0];
      rulesetStat[2] += stats[2];
      rulesetStat[4] += stats[4];
      data = split[1];
    }

    // Uncovered stats are those of the last rule
    if(more > 0){
      rulesetStat[1] = ((double[])indexPlus.lastElement())[1];
      rulesetStat[3] = ((double[])indexPlus.lastElement())[3];
      rulesetStat[5] = ((double[])indexPlus.lastElement())[5];
    }
    else if(index > 0){
      rulesetStat[1] = ((double[])m_SimpleStats.elementAt(index-1))[1];
      rulesetStat[3] = ((double[])m_SimpleStats.elementAt(index-1))[3];
      rulesetStat[5] = ((double[])m_SimpleStats.elementAt(index-1))[5];
    }
    else{ // Null coverage
      rulesetStat[1] = ((double[])m_SimpleStats.elementAt(0))[0]
                     + ((double[])m_SimpleStats.elementAt(0))[1];
      rulesetStat[3] = ((double[])m_SimpleStats.elementAt(0))[3]
                     + ((double[])m_SimpleStats.elementAt(0))[4];
      rulesetStat[5] = ((double[])m_SimpleStats.elementAt(0))[2]
                     + ((double[])m_SimpleStats.elementAt(0))[5];
    }

    // Potential
    double potential = 0;
    for(int k=index+1; k<m_Ruleset.size(); k++){
      double[] ruleStat = (double[])indexPlus.elementAt(k-index-1);
      double ifDeleted = potential(k, expFPRate, rulesetStat,
                                   ruleStat, checkErr);
      if(!Double.isNaN(ifDeleted))
        potential += ifDeleted;
    }

    // Data DL of the ruleset without the rule.  Note that the ruleset
    // stats have already been updated to reflect any deletion with
    // potential.
    double dataDLWithout = dataDL(expFPRate, rulesetStat[0], rulesetStat[1],
                                  rulesetStat[4], rulesetStat[5]);
    //System.out.println("!!!without: "+dataDLWithout + " |potential: "+
    //                   potential);

    // Why subtract potential again?  To reflect the change of theory DL??
    return (dataDLWithout - potential);
  }
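  // --- Editor's note: an illustrative sketch, not part of the original
  // source.  The six-slot "simple stats" arrays threaded through the
  // methods above appear to be laid out as:
  //   [0] coverage          [1] uncoverage
  //   [2] true positives    [3] true negatives
  //   [4] false positives   [5] false negatives
  // Assuming that layout, a tiny helper for debugging them might read:
  private static String statsToStringSketch(double[] s){
    return "cov=" + s[0] + " uncov=" + s[1]
         + " tp=" + s[2] + " tn=" + s[3]
         + " fp=" + s[4] + " fn=" + s[5];
  }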
  /**
   * Compute the minimal data description length of the ruleset
   * if the rule in the given position is NOT deleted.<br>
   * The min_data_DL_if_not_deleted = data_DL_if_not_deleted - potential
   *
   * @param index the index of the rule in question
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param checkErr whether to check if the error rate >= 0.5
   * @return the minDataDL
   */
  public double minDataDLIfExists(int index, double expFPRate,
                                  boolean checkErr){
    // System.out.println("!!!Enter with: ");
    double[] rulesetStat = new double[6]; // Stats of ruleset if rule exists
    for(int j=0; j<m_SimpleStats.size(); j++){
      // Covered stats are cumulative
      rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
      rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
      rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
      if(j == m_SimpleStats.size()-1){ // Last rule
        rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
        rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
        rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
      }
    }

    // Potential
    double potential = 0;
    for(int k=index+1; k<m_SimpleStats.size(); k++){
      double[] ruleStat = (double[])getSimpleStats(k);
      double ifDeleted = potential(k, expFPRate, rulesetStat,
                                   ruleStat, checkErr);
      if(!Double.isNaN(ifDeleted))
        potential += ifDeleted;
    }

    // Data DL of the ruleset with the rule.  Note that the ruleset stats
    // have already been updated to reflect any deletion with potential.
    double dataDLWith = dataDL(expFPRate, rulesetStat[0], rulesetStat[1],
                               rulesetStat[4], rulesetStat[5]);
    //System.out.println("!!!with: "+dataDLWith + " |potential: "+
    //                   potential);
    return (dataDLWith - potential);
  }
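  // --- Editor's note: an illustrative sketch, not part of the original
  // source.  Both methods above follow the same pattern: compute the data
  // DL of a candidate ruleset, then subtract every non-NaN "potential"
  // saving that deleting a later rule would yield.  Schematically:
  private static double minDLSketch(double dataDL, double[] potentials){
    double minDL = dataDL;
    for(int i=0; i<potentials.length; i++)
      if(!Double.isNaN(potentials[i])) // NaN marks "no saving from deletion"
        minDL -= potentials[i];
    return minDL;
  }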
  /**
   * The description length (DL) of the ruleset relative to the case
   * where the rule in the given position is deleted, which is
   * obtained by: <br>
   * MDL if the rule exists - MDL if the rule does not exist <br>
   * Note that the minimal possible DL of the ruleset is calculated
   * (i.e. some other rules may also be deleted) instead of the DL of
   * the current ruleset.<p>
   *
   * @param index the given position of the rule in question
   *              (assumed correct)
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param checkErr whether to check if the error rate >= 0.5
   * @return the relative DL
   */
  public double relativeDL(int index, double expFPRate, boolean checkErr){
    return (minDataDLIfExists(index, expFPRate, checkErr)
            + theoryDL(index)
            - minDataDLIfDeleted(index, expFPRate, checkErr));
  }

  /**
   * Try to reduce the DL of the ruleset by testing the removal of the
   * rules one by one in reverse order, updating all the stats.
   *
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param checkErr whether to check if the error rate >= 0.5
   */
  public void reduceDL(double expFPRate, boolean checkErr){
    boolean needUpdate = false;
    double[] rulesetStat = new double[6];
    for(int j=0; j<m_SimpleStats.size(); j++){
      // Covered stats are cumulative
      rulesetStat[0] += ((double[])m_SimpleStats.elementAt(j))[0];
      rulesetStat[2] += ((double[])m_SimpleStats.elementAt(j))[2];
      rulesetStat[4] += ((double[])m_SimpleStats.elementAt(j))[4];
      if(j == m_SimpleStats.size()-1){ // Last rule
        rulesetStat[1] = ((double[])m_SimpleStats.elementAt(j))[1];
        rulesetStat[3] = ((double[])m_SimpleStats.elementAt(j))[3];
        rulesetStat[5] = ((double[])m_SimpleStats.elementAt(j))[5];
      }
    }

    // Potential
    double potential = 0;
    for(int k=m_SimpleStats.size()-1; k>=0; k--){
      double[] ruleStat = (double[])m_SimpleStats.elementAt(k);

      // rulesetStat updated
      double ifDeleted = potential(k, expFPRate, rulesetStat,
                                   ruleStat, checkErr);
      if(!Double.isNaN(ifDeleted)){
        /*System.err.println("!!!deleted ("+k+"): save "+ifDeleted
          +" | "+rulesetStat[0]
          +" | "+rulesetStat[1]
          +" | "+rulesetStat[4]
          +" | "+rulesetStat[5]);
        */
        if(k == (m_SimpleStats.size()-1))
          removeLast();
        else{
          m_Ruleset.removeElementAt(k);
          needUpdate = true;
        }
      }
    }

    if(needUpdate){
      m_Filtered = null;
      m_SimpleStats = null;
      countData();
    }
  }

  /**
   * Remove the last rule in the ruleset as well as its statistics.
   * This might be useful when the last rule was added for testing
   * purposes and the test then failed.
   */
  public void removeLast(){
    int last = m_Ruleset.size()-1;
    m_Ruleset.removeElementAt(last);
    m_Filtered.removeElementAt(last);
    m_SimpleStats.removeElementAt(last);
    if(m_Distributions != null)
      m_Distributions.removeElementAt(last);
  }

  /**
   * Static utility function to remove from the given data the instances
   * covered by the rules after the given index in the given ruleset.
   * It returns the data not covered by those successive rules.
   *
   * @param data the data to be processed
   * @param rules the ruleset
   * @param index the given index
   * @return the data after processing
   */
  public static Instances rmCoveredBySuccessives(Instances data,
                                                 FastVector rules,
                                                 int index){
    Instances rt = new Instances(data, 0);

    for(int i=0; i < data.numInstances(); i++){
      Instance datum = data.instance(i);
      boolean covered = false;

      for(int j=index+1; j<rules.size(); j++){
        Rule rule = (Rule)rules.elementAt(j);
        if(rule.covers(datum)){
          covered = true;
          break;
        }
      }

      if(!covered)
        rt.add(datum);
    }
    return rt;
  }
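  // --- Editor's note: an illustrative usage sketch, not part of the
  // original source.  When re-examining rule i of an existing ruleset,
  // the data is typically first cleaned of everything that the later
  // rules would catch anyway, e.g.:
  //
  //   Instances pruneData =
  //     RuleStats.rmCoveredBySuccessives(data, ruleset, i);
  //
  // so that rule i is judged only on the instances it is actually
  // responsible for.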
  /**
   * Stratify the given data into the given number of bags based on the
   * class values.  It differs from <code>Instances.stratify(int fold)</code>
   * in that, before stratification, it sorts the instances according to
   * the class order in the header.  It assumes no missing values in the
   * class.
   *
   * @param data the given data
   * @param folds the given number of folds
   * @param rand the random object used to randomize the instances
   * @return the stratified instances
   */
  public static final Instances stratify(Instances data, int folds,
                                         Random rand){
    if(!data.classAttribute().isNominal())
      return data;

    Instances result = new Instances(data, 0);
    Instances[] bagsByClasses = new Instances[data.numClasses()];
    for(int i=0; i < bagsByClasses.length; i++)
      bagsByClasses[i] = new Instances(data, 0);

    // Sort by class
    for(int j=0; j < data.numInstances(); j++){
      Instance datum = data.instance(j);
      bagsByClasses[(int)datum.classValue()].add(datum);
    }

    // Randomize each class
    for(int j=0; j < bagsByClasses.length; j++)
      bagsByClasses[j].randomize(rand);

    for(int k=0; k < folds; k++){
      int offset = k, bag = 0;
      oneFold:
      while (true){
        while(offset >= bagsByClasses[bag].numInstances()){
          offset -= bagsByClasses[bag].numInstances();
          if (++bag >= bagsByClasses.length) // Next bag
            break oneFold;
        }
        result.add(bagsByClasses[bag].instance(offset));
        offset += folds;
      }
    }

    return result;
  }

  /**
   * Compute the combined DL of the ruleset in this class, i.e. the
   * theory DL plus the data DL.  Note this procedure computes the
   * combined DL according to the current status of the ruleset in
   * this class.
   *
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param predicted the default classification used if the ruleset
   *                  covers no data
   * @return the combined DL
   */
  public double combinedDL(double expFPRate, double predicted){
    double rt = 0;

    if(getRulesetSize() > 0){
      double[] stats = (double[])m_SimpleStats.lastElement();
      for(int j=getRulesetSize()-2; j >= 0; j--){
        stats[0] += getSimpleStats(j)[0];
        stats[2] += getSimpleStats(j)[2];
        stats[4] += getSimpleStats(j)[4];
      }
      rt += dataDL(expFPRate, stats[0], stats[1],
                   stats[4], stats[5]); // Data DL
    }
    else{ // Null coverage ruleset
      double fn = 0.0;
      for(int j=0; j < m_Data.numInstances(); j++)
        if((int)m_Data.instance(j).classValue() == (int)predicted)
          fn += m_Data.instance(j).weight();
      rt += dataDL(expFPRate, 0.0, m_Data.sumOfWeights(), 0.0, fn);
    }

    for(int i=0; i<getRulesetSize(); i++) // Theory DL
      rt += theoryDL(i);

    return rt;
  }

  /**
   * Partition the data into two parts: the first contains
   * (numFolds-1)/numFolds of the data and the second 1/numFolds of
   * the data.
   *
   * @param data the given data
   * @param numFolds the given number of folds
   * @return the partitioned instances
   */
  public static final Instances[] partition(Instances data, int numFolds){
    Instances[] rt = new Instances[2];
    int splits = data.numInstances() * (numFolds - 1) / numFolds;

    rt[0] = new Instances(data, 0, splits);
    rt[1] = new Instances(data, splits, data.numInstances()-splits);

    return rt;
  }
}
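// --- Editor's note: an illustrative usage sketch, not part of the original
// source.  stratify() and partition() are designed to be used together for
// a RIPPER-style grow/prune split: stratifying first keeps roughly the class
// proportions of the full set in both parts.  For example, with 3 folds
// (2/3 grow data, 1/3 prune data):
//
//   Instances shuffled  = RuleStats.stratify(data, 3, new Random(1));
//   Instances[] parts   = RuleStats.partition(shuffled, 3);
//   Instances growData  = parts[0]; // (numFolds-1)/numFolds of the data
//   Instances pruneData = parts[1]; // 1/numFolds of the data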