📄 jrip.java

📁 Java source code for the complete process of the ALPHAMINER data mining software
💻 JAVA
      }
	    
      return isCover;
    }        
	
    /**
     * Whether this rule has antecedents, i.e. whether it is not a default rule
     * 
     * @return the boolean value indicating whether the rule has antecedents
     */
    public boolean hasAntds(){
      if (m_Antds == null)
	return false;
      else
	return (m_Antds.size() > 0);
    }      
	
    /** 
     * Returns the number of antecedents of the rule
     *
     * @return the number of antecedents
     */
    public double size(){ return (double)m_Antds.size(); }		

	
    /**
     * Private function to compute the default number of accurate instances
     * in the specified data for the consequent of the rule
     * 
     * @param data the data in question
     * @return the default accuracy number
     */
    private double computeDefAccu(Instances data){ 
      double defAccu=0;
      for(int i=0; i<data.numInstances(); i++){
	Instance inst = data.instance(i);
	if((int)inst.classValue() == (int)m_Consequent)
	  defAccu += inst.weight();
      }
      return defAccu;
    }
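
    /*
     * [Editor's sketch, not part of the original source] How grow() uses
     * the value returned above: the weighted count of correctly classified
     * instances is turned into a smoothed default accuracy rate, e.g.
     *
     *   double defAccu = computeDefAccu(growData);               // say 40.0
     *   double sumOfWeights = growData.sumOfWeights();           // say 100.0
     *   double defAcRt = (defAccu + 1.0) / (sumOfWeights + 1.0); // ~0.406
     *
     * The +1 smoothing keeps the rate strictly positive, which matters
     * because its logarithm is taken in the information gain computation.
     */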
	
	
    /**
     * Build one rule using the growing data
     *
     * @param data the growing data used to build the rule
     * @exception Exception if the consequent is not set yet
     */
    public void grow(Instances data) throws Exception {
      if(m_Consequent == -1)
	throw new Exception(" Consequent not set yet.");
	    
      Instances growData = data;	         
      double sumOfWeights = growData.sumOfWeights();
      if(!Utils.gr(sumOfWeights, 0.0))
	return;
	    
      /* Compute the default accuracy rate of the growing data */
      double defAccu = computeDefAccu(growData);
      double defAcRt = (defAccu+1.0)/(sumOfWeights+1.0); 
	    
      /* Keep the record of which attributes have already been used*/    
      boolean[] used=new boolean [growData.numAttributes()];
      for (int k=0; k<used.length; k++)
	used[k]=false;
      int numUnused=used.length;
	    
      // If there are already antecedents existing
      for(int j=0; j < m_Antds.size(); j++){
	Antd antdj = (Antd)m_Antds.elementAt(j);
	if(!antdj.getAttr().isNumeric()){ 
	  used[antdj.getAttr().index()]=true;
	  numUnused--;
	} 
      }	    
	    
      double maxInfoGain;	    
      while (Utils.gr(growData.numInstances(), 0.0) && 
	     (numUnused > 0) 
	     && Utils.sm(defAcRt, 1.0)
	     ){   
		
	// We require that infoGain be positive
	/*if(numAntds == originalSize)
	  maxInfoGain = 0.0; // At least one condition allowed
	  else
	  maxInfoGain = Utils.eq(defAcRt, 1.0) ? 
	  defAccu/(double)numAntds : 0.0; */
	maxInfoGain = 0.0; 
		
	/* Build a list of antecedents */
	Antd oneAntd=null;
	Instances coverData = null;
	Enumeration emAttr = growData.enumerateAttributes();
		
	/* Build one condition based on all attributes not used yet*/
	while (emAttr.hasMoreElements()){
	  Attribute att= (Attribute)(emAttr.nextElement());
	  
	  if(m_Debug)
	    System.err.println("\nOne condition: size = " 
			       + growData.sumOfWeights());
		   
	  Antd antd =null;	
	  if(att.isNumeric())
	    antd = new NumericAntd(att);
	  else
	    antd = new NominalAntd(att);
		    
	  if(!used[att.index()]){
	    /* Compute the best information gain for each attribute,
	       it's stored in the antecedent formed by this attribute.
	       This procedure returns the data covered by the antecedent*/
	    Instances coveredData = computeInfoGain(growData, defAcRt,
						    antd);
	    if(coveredData != null){
	      double infoGain = antd.getMaxInfoGain();      
	      if(m_Debug)
		System.err.println("Test of \'"+antd.toString()+
				   "\': infoGain = "+
				   infoGain + " | Accuracy = " +
				   antd.getAccuRate()+
				   "="+antd.getAccu()
				   +"/"+antd.getCover()+
				   " def. accuracy: "+defAcRt);
			    
	      if(infoGain > maxInfoGain){         
		oneAntd=antd;
		coverData = coveredData;  
		maxInfoGain = infoGain;
	      }		    
	    }
	  }
	}
		
	if(oneAntd == null) break; // Cannot find antds		
	if(Utils.sm(oneAntd.getAccu(), m_MinNo)) break;// Too low coverage
		
	//Numeric attributes can be used more than once
	if(!oneAntd.getAttr().isNumeric()){ 
	  used[oneAntd.getAttr().index()]=true;
	  numUnused--;
	}
		
	m_Antds.addElement(oneAntd);
	growData = coverData;// Grow data size is shrinking 	
	defAcRt = oneAntd.getAccuRate();
      }
    }
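
    /*
     * [Editor's sketch, not part of the original source] Typical driver
     * code for the growing phase, assuming growData holds the growing
     * split for the current class index:
     *
     *   RipperRule oneRule = new RipperRule();
     *   oneRule.setConsequent(classIndex); // must be set before grow()
     *   oneRule.grow(growData);            // greedily adds antecedents
     *   oneRule.prune(pruneData, false);   // then prune (see below)
     *
     * grow() stops when no candidate has positive information gain, when
     * the best candidate's accurate coverage falls below m_MinNo, or when
     * the rule's accuracy on the remaining data reaches 1.0.
     */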
	
	
    /** 
     * Compute the best information gain for the specified antecedent
     *  
     * @param instances the data based on which the infoGain is computed
     * @param defAcRt the default accuracy rate of data
     * @param antd the specific antecedent
     * @return the data covered by the antecedent
     */
    private Instances computeInfoGain(Instances instances, double defAcRt, 
				      Antd antd){
      Instances data = instances;
	
      /* Split the data into bags.
	 The information gain of each bag is also calculated in this procedure */
      Instances[] splitData = antd.splitData(data, defAcRt, 
					     m_Consequent); 
	    
      /* Get the bag of data to be used for next antecedents */
      if(splitData != null)
	return splitData[(int)antd.getAttrValue()];
      else return null;
    }
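
    /*
     * [Editor's sketch, not part of the original source] The gain itself
     * is computed inside Antd.splitData(), which is not shown in this
     * excerpt.  For a candidate antecedent covering "cover" weighted
     * instances, "accu" of them accurate, a FOIL-style gain relative to
     * the default accuracy rate looks roughly like (names hypothetical):
     *
     *   double accuRate = (accu + 1.0) / (cover + 1.0); // smoothed accuracy
     *   double gain = accu * (Utils.log2(accuRate) - Utils.log2(defAcRt));
     *
     * A candidate therefore scores well only if it is both accurate and
     * covers a non-trivial weight of instances.
     */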
	
    /**
     * Prune all the possible final sequences of the rule using the 
     * pruning data.  The measure used to prune the rule is based on
     * the given flag.
     *
     * @param pruneData the pruning data used to prune the rule
     * @param useWhole flag to indicate whether to use the error rate of
     *                 the whole pruning data instead of the data covered
     */    
    public void prune(Instances pruneData, boolean useWhole){
      Instances data = pruneData;
	
      double total = data.sumOfWeights();
      if(!Utils.gr(total, 0.0))
	return;
	
      /* The default number of accurate instances and accuracy rate on the pruning data */
      double defAccu=computeDefAccu(data);
	    
      if(m_Debug)	
	System.err.println("Pruning with " + defAccu + 
			   " positive data out of " + total +
			   " instances");	
	    
      int size=m_Antds.size();
      if(size == 0) return; // Default rule before pruning
	    
      double[] worthRt = new double[size];
      double[] coverage = new double[size];
      double[] worthValue = new double[size];
      for(int w=0; w<size; w++){
	worthRt[w]=coverage[w]=worthValue[w]=0.0;
      }
	    
      /* Calculate accuracy parameters for all the antecedents in this rule */
      double tn = 0.0; // True negative if useWhole
      for(int x=0; x<size; x++){
	Antd antd=(Antd)m_Antds.elementAt(x);
	Attribute attr= antd.getAttr();
	Instances newData = data;
	data = new Instances(newData, 0); // Make data empty
		
	for(int y=0; y<newData.numInstances(); y++){
	  Instance ins=newData.instance(y);
		    
	  if(antd.covers(ins)){   // Covered by this antecedent
	    coverage[x] += ins.weight();
	    data.add(ins);                 // Add to data for further pruning
	    if((int)ins.classValue() == (int)m_Consequent) // Accurate prediction
	      worthValue[x] += ins.weight();
	  }
	  else if(useWhole){ // Not covered
	    if((int)ins.classValue() != (int)m_Consequent)
	      tn += ins.weight();
	  }			
	}
		
	if(useWhole){
	  worthValue[x] += tn;
	  worthRt[x] = worthValue[x] / total;
	}
	else // Note if coverage is 0, accuracy is 0.5
	  worthRt[x] = (worthValue[x]+1.0)/(coverage[x]+2.0);
      }
	    
      double maxValue = (defAccu+1.0)/(total+2.0);
      int maxIndex = -1;
      for(int i=0; i<worthValue.length; i++){
	if(m_Debug){
	  double denom = useWhole ? total : coverage[i];
	  System.err.println(i+"(useAccuray? "+!useWhole+"): "
			     + worthRt[i] + 
			     "="+worthValue[i]+
			     "/"+denom);
	}
	if(worthRt[i] > maxValue){ // On ties, prefer the shorter rule
	  maxValue = worthRt[i];
	  maxIndex = i;
	}
      }
	    
      /* Prune the antecedents according to the accuracy parameters */
      for(int z=size-1;z>maxIndex;z--)
	m_Antds.removeElementAt(z);       
    }
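
    /*
     * [Editor's note, not part of the original source] A worked example of
     * the two pruning metrics above.  Suppose that, truncated after
     * antecedent x, the rule covers 10 weighted instances of the pruning
     * data, 8 of them correctly, with total weight 100 and 60 true negatives:
     *
     *   // useWhole == false: Laplace-smoothed accuracy on covered data
     *   worthRt[x] = (8 + 1.0) / (10 + 2.0);  // = 0.75
     *   // useWhole == true: (correct + true negatives) / total
     *   worthRt[x] = (8 + 60.0) / 100.0;      // = 0.68
     *
     * The kept prefix is the one with the highest worth; ties and the
     * comparison against (defAccu+1)/(total+2) favour the shorter rule.
     */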
	
    /**
     * Prints this rule
     *
     * @param classAttr the class attribute in the data
     * @return a textual description of this rule
     */
    public String toString(Attribute classAttr) {
      StringBuffer text =  new StringBuffer();
      if(m_Antds.size() > 0){
	for(int j=0; j< (m_Antds.size()-1); j++)
	  text.append("(" + ((Antd)(m_Antds.elementAt(j))).toString()+ ") and ");
	text.append("("+((Antd)(m_Antds.lastElement())).toString() + ")");
      }
      text.append(" => " + classAttr.name() +
		  "=" + classAttr.value((int)m_Consequent));
	    
      return text.toString();
    }
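
    /*
     * [Editor's sketch, not part of the original source] Example output
     * for a hypothetical weather dataset:
     *
     *   (outlook = sunny) and (humidity <= 75) => play=yes
     *
     * A default rule (no antecedents) prints only the consequent,
     * e.g. " => play=no".
     */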
  }
    
  /**
   * Builds Ripper in the order of class frequencies.  For each class,
   * the ruleset is built in two stages: building and optimization.
   *
   * @param instances the training data
   * @exception Exception if classifier can't be built successfully
   */
  public void buildClassifier(Instances instances) throws Exception {
     
    if(instances.numInstances() == 0)
      throw new Exception(" No instances with a class value!");
    
    if (instances.checkForStringAttributes()) 
      throw new UnsupportedAttributeTypeException(" Cannot handle string attributes!");
    
    if (!instances.classAttribute().isNominal()) 
      throw new UnsupportedClassTypeException(" Only nominal class, please.");
    
    m_Random = instances.getRandomNumberGenerator(m_Seed); 
    m_Total = RuleStats.numAllConditions(instances);
    if(m_Debug)
      System.err.println("Number of all possible conditions = "+m_Total);
    
    Instances data = null;
    m_Filter = new ClassOrder();
    ((ClassOrder)m_Filter).setSeed(m_Random.nextInt());	
    ((ClassOrder)m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND);
    m_Filter.setInputFormat(instances);
    data = Filter.useFilter(instances, m_Filter);
	
    if(data == null)
      throw new Exception(" Unable to randomize the class orders.");
	
    data.deleteWithMissingClass();
    if(data.numInstances() == 0)
      throw new Exception(" No instances with a class value!");
    
    if(data.numInstances() < m_Folds)
	throw new Exception(" Not enough data for REP.");
    
    m_Class = data.classAttribute();	
    m_Ruleset = new FastVector();
    m_RulesetStats = new FastVector();
    m_Distributions = new FastVector();

    // Sort by classes frequency
    double[] orderedClasses = ((ClassOrder)m_Filter).getClassCounts();
    if(m_Debug){
      System.err.println("Sorted classes:");
      for(int x=0; x < m_Class.numValues(); x++)
	System.err.println(x+": "+m_Class.value(x) + " has " +
			   orderedClasses[x] + " instances.");
    }
    // Iterate from the least prevalent class to more frequent ones
  oneClass:	
    for(int y=0; y < data.numClasses()-1; y++){ // For each class	      
	    
      double classIndex = (double)y;
      if(m_Debug){
	int ci = (int)classIndex;
	System.err.println("\n\nClass "+m_Class.value(ci)+"("+ci+"): "
			   + orderedClasses[y] + " instances\n"+
			   "=====================================\n");
      }
		
      if(Utils.eq(orderedClasses[y],0.0)) // No data for this class
	continue oneClass;		
	    
      // The expected FP/err is the proportion of the class
      double all = 0;
      for(int i=y; i<orderedClasses.length; i++)
	all += orderedClasses[i];
      double expFPRate = orderedClasses[y] / all;	    
		
      double classYWeights = 0, totalWeights = 0;
      for(int j=0; j < data.numInstances(); j++){
	  Instance datum = data.instance(j);
	  totalWeights += datum.weight();
	  if((int)datum.classValue() == y){
	      classYWeights += datum.weight();
	  }	          
      }	
          
      // DL of default rule, no theory DL, only data DL
      double defDL;
      if(classYWeights > 0)
	  defDL = RuleStats.dataDL(expFPRate, 
				   0.0,
				   totalWeights,
				   0.0,
				   classYWeights);	    
      else
	  continue oneClass; // Subsumed by previous rules

      if(Double.isNaN(defDL) || Double.isInfinite(defDL))
	throw new Exception("Should never happen: "+
			    "defDL NaN or infinite!");
      if(m_Debug)
	System.err.println("The default DL = "+defDL);
	    
      data = rulesetForOneClass(expFPRate, data, classIndex, defDL);
    }

    // Set the default rule
    RipperRule defRule = new RipperRule();
    defRule.setConsequent((double)(data.numClasses()-1));
    m_Ruleset.addElement(defRule);
	
    RuleStats defRuleStat = new RuleStats();
    defRuleStat.setData(data);
    defRuleStat.setNumAllConds(m_Total);
    defRuleStat.addAndUpdate(defRule);
    m_RulesetStats.addElement(defRuleStat);

    for(int z=0; z < m_RulesetStats.size(); z++){
	RuleStats oneClass = (RuleStats)m_RulesetStats.elementAt(z);
	for(int xyz=0; xyz < oneClass.getRulesetSize(); xyz++){
	    double[] classDist = oneClass.getDistributions(xyz);
	    if(classDist != null){
		Utils.normalize(classDist);
		m_Distributions.addElement(((ClassOrder)m_Filter).distributionsByOriginalIndex(classDist));
	    }
	}	
    }    
  }
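
  /*
   * [Editor's sketch, not part of the original source] Minimal usage,
   * assuming this class is WEKA's JRip learner
   * (weka.classifiers.rules.JRip) and that weather.arff ends with a
   * nominal class attribute:
   *
   *   Instances data = new Instances(new java.io.FileReader("weather.arff"));
   *   data.setClassIndex(data.numAttributes() - 1);
   *   JRip rip = new JRip();
   *   rip.buildClassifier(data);
   *   System.out.println(rip);  // prints the learned ruleset
   */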
    
  /**
   * Classify the test instance with the rule learner and provide
   * the class distributions 
   *
   * @param datum the instance to be classified
   * @return the distribution
   */
    public double[] distributionForInstance(Instance datum){
	try{
	    for(int i=0; i < m_Ruleset.size(); i++){
		RipperRule rule = (RipperRule)m_Ruleset.elementAt(i);
		if(rule.covers(datum))
		    return (double[])m_Distributions.elementAt(i); 
	    }
	}catch(Exception e){
	    System.err.println(e.getMessage());
	    e.printStackTrace();
	}
	
	System.err.println("Should never happen!");
	return new double[datum.classAttribute().numValues()];
    }
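
    /*
     * [Editor's note, not part of the original source] Rules are tried in
     * order, so the returned distribution is that of the first covering
     * rule; the default rule added at the end of buildClassifier() covers
     * every instance, so the "Should never happen!" branch is effectively
     * unreachable after training.  Hypothetical usage:
     *
     *   double[] dist = rip.distributionForInstance(testInst);
     *   int predicted = Utils.maxIndex(dist);  // index of the likeliest class
     */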

  /** Build a ruleset for the given class according to the given data
   *
   * @param expFPRate the expected FP/(FP+FN) used in DL calculation
   * @param data the given data
   * @param classIndex the given class index
   * @param defDL the default DL in the data
   * @exception Exception if the ruleset can't be built properly
   */
  protected Instances rulesetForOneClass(double expFPRate, 
					 Instances data, 
					 double classIndex,
					 double defDL)
    throws Exception {
	
    Instances newData = data, growData, pruneData;  	
    boolean stop = false;
