lbr.java

Java source code for the complete workflow of the data mining software ALPHAMINER.
      tempErrorsBest = subInstances.getNumInstancesSet() + 1;
      subInstances.setSequentialDataset(true);
      // Step 6: selecting an attribute.
      for (int attr = 0; attr < subInstances.m_NumSeqAttsSet; attr++){
        forCnt++;
        subAttrIndex = subInstances.m_SequentialAttIndexes[attr];
        // Step 7: get the corresponding subset.
        
        m_RemainderErrors = 0;

        // reset array to true
        for(int i = 0; i < m_numInsts; i++) {
          m_subOldErrorFlags[i] = true;
        }
        // reset indexes to reflect an empty dataset but with the same attrs as another dataset
        tempSubInstances.resetDatasetBasedOn(subInstances);
        // Get the subset of the instances and its m_LastSecondErrors
        for(inst = 0; inst < subInstances.m_NumSeqInstsSet; inst++) {
          subInstIndex = subInstances.m_SequentialInstIndexes[inst];
          if (m_Instances.instance(subInstIndex).value(subAttrIndex) == testInstance.value(subAttrIndex))  {
            // add instance to subset list
            tempSubInstances.setInstanceIndex(subInstIndex, true);
            if (localErrorFlags[subInstIndex] == false ) {
              m_subOldErrorFlags[subInstIndex] = false;
            }
          }
          else  {
            if (localErrorFlags[subInstIndex] == false ) {
              m_RemainderErrors++;
            }
          }
        } // end of for

        // Step 7':
        if (tempSubInstances.m_NumInstsSet < subInstances.m_NumInstsSet) {
          // remove attribute from index
          tempSubInstances.setAttIndex(subAttrIndex, false);
          // Step 9: create a classifier on the subset.
          // Compute counts and priors
          // create sequential index of instances and attributes that are to be considered
                
          localNaiveBayes(tempSubInstances);
          
          subLocalErrors = leaveOneOut(tempSubInstances, m_Counts, m_Priors, subLocalErrorFlags);

          errorsNewNotOld = 0;
          errorsOldNotNew = 0;
          
          tempSubInstances.setSequentialDataset(true);
          
          for(int t_inst = 0; t_inst < tempSubInstances.m_NumSeqInstsSet; t_inst++) {
            tempInstIndex = tempSubInstances.m_SequentialInstIndexes[t_inst];
            if (subLocalErrorFlags[tempInstIndex] == false) {
              // The number of errors on New, Not on Old in the subset.
              if (m_subOldErrorFlags[tempInstIndex] == true) {
                errorsNewNotOld ++;
              }
            } else {
              // The number of errors on Old, Not on New in the subset.
              if(m_subOldErrorFlags[tempInstIndex] == false) {
                errorsOldNotNew ++;
              }
            }
          } //end of for
          
          // Step 10 and Step 11:
          int tempErrors = subLocalErrors + m_RemainderErrors;
          // Step 12:
          // Step 13: stopping criteria.
          if ((tempErrors < tempErrorsBest) && (binomP(errorsNewNotOld, errorsNewNotOld + errorsOldNotNew, 0.5) < SIGNLOWER)) {
            // Step 14:
            tempCnt++;
            // --------------------------------------------------
            //tempD_subsetBest = new Indexes(tempSubInstances);
            
            // -------------------------------------------------------------------------------
            tempSubInstances.setSequentialDataset(true);
            tempD_subsetBestInsts = (int []) tempSubInstances.m_SequentialInstIndexes.clone();
            tempD_subsetBestAtts = (int []) tempSubInstances.m_SequentialAttIndexes.clone();
            // -------------------------------------------------------------------------------
            // Step 15:
            tempErrorsBest = tempErrors;

            tempErrorFlagBest = (boolean []) subLocalErrorFlags.clone();

            // Step 16:
            attributeBest = subAttrIndex;
          } // end of if
        } // end of if
      } // end of main for
      
      // Step 20:
      if(attributeBest != -1)  {
        bestCnt++;
        // Step 21:
        leftHand.add(testInstance.attribute(attributeBest));
        // ------------------------------------------------
        // Step 22:
        //tempD_subsetBest.setAttIndex(attributeBest, false);
        //subInstances = tempD_subsetBest;
        // ------------------------------------------------ 
        subInstances.setInsts(tempD_subsetBestInsts, true);
        subInstances.setAtts(tempD_subsetBestAtts, true);
        subInstances.setAttIndex(attributeBest, false);
        // -------------------------------------------------
        // Step 25:
        localErrors = tempErrorsBest;
        localErrorFlags =  tempErrorFlagBest;
        
      } else {
        break;
      }
    } // end of while
    
    // Step 27:
    localNaiveBayes(subInstances);
    return localDistributionForInstance(testInstance, subInstances);
  }
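
  /*
   * Sketch (an assumption, not part of the original source): the stopping
   * criterion at Step 13 above is a one-sided binomial sign test restricted
   * to the instances whose outcome changed between the old and new local
   * models. A candidate subset is accepted only when the new model's extra
   * errors are improbably few among the changed instances.
   */
  private boolean significantImprovement(int errorsNewNotOld, int errorsOldNotNew)
    throws Exception {
    // P(X <= errorsNewNotOld) for X ~ Binomial(changed, 0.5) must fall below
    // the significance threshold SIGNLOWER used in the main loop above.
    return binomP(errorsNewNotOld, errorsNewNotOld + errorsOldNotNew, 0.5) < SIGNLOWER;
  }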
  
  /**
   * Returns a description of the classifier.
   *
   * @return a description of the classifier as a string.
   */
  public String toString() {
    
    if (m_Instances == null) {
      return "Lazy Bayesian Rule: No model built yet.";
    }
    
    try {
      StringBuffer text = new StringBuffer("=== LBR Run information ===\n\n");
      
      text.append("Scheme:       weka.classifiers.LBR\n");
      
      text.append("Relation:     "
      + m_Instances.attribute(m_Instances.classIndex()).name()
      + "\n");
      
      text.append("Instances:    "+m_Instances.numInstances()+"\n");
      
      text.append("Attributes:   "+m_Instances.numAttributes()+"\n");
      
      // The rest of the run information is printed by Evaluation.java
      return text.toString();
    } catch (Exception e) {
      e.printStackTrace();
      return "Can't Print Lazy Bayes Rule Classifier!";
    }
  }

  /**
   * Leave-one-out evaluation. For a given data set with n instances, a
   * classifier is built on (n - 1) instances and tested on the single
   * remaining instance; this is repeated n times so that each instance
   * is held out exactly once. The returned error count is the number of
   * instances classified incorrectly.
   *
   * @param instanceIndex indexes of the instances serving as training data.
   * @param counts the attribute-value counts of the training data.
   * @param priors the number of training instances in each class.
   * @param errorFlags flag array set to false for each misclassified instance.
   *
   * @return the number of misclassified instances.
   **/
  public int leaveOneOut(Indexes instanceIndex, int [][][] counts, int [] priors, boolean [] errorFlags)  throws Exception {
    
    
    // ###### START LEAVE ONE OUT #############
    int tempClassValue;
    double posteriors;
    double sumForPriors;
    double sumForCounts;
    double max = 0;
    int maxIndex = 0;
    int AIndex, attIndex, clss;
    int inst;
    int errors = 0;
    int instIndex;
    
    instanceIndex.setSequentialDataset(true);
    int tempInstanceClassValue;
    int [] tempAttributeValues = new int[(int)instanceIndex.m_NumSeqAttsSet+1];
    Instance tempInstance;
    for(inst = 0; inst < instanceIndex.m_NumSeqInstsSet; inst++) {
      instIndex = instanceIndex.m_SequentialInstIndexes[inst];
      //get the leave-one-out instance
      tempInstance = (Instance) m_Instances.instance(instIndex);
      if (!tempInstance.classIsMissing()) {
        tempInstanceClassValue = (int)tempInstance.classValue();
        // pointer to first index of counts matrix for efficiency
        int [][] countsPointer = counts[tempInstanceClassValue];
        // Remove this instance's contribution, leaving the counts and
        // priors for the remaining (n - 1) instances.
        for(attIndex = 0; attIndex < instanceIndex.m_NumSeqAttsSet; attIndex++) {
          AIndex = instanceIndex.m_SequentialAttIndexes[attIndex];
          tempAttributeValues[attIndex] = (int)tempInstance.value(AIndex);
          countsPointer[AIndex][tempAttributeValues[attIndex]]--;
        }

        priors[tempInstanceClassValue]--;
        max = 0;
        maxIndex = 0;
        // ###### LOCAL CLASSIFY INSTANCE ###########
        sumForPriors = Utils.sum(priors);
        for (clss = 0; clss < m_numClasses; clss++) {
          posteriors = (priors[clss] + 1) / (sumForPriors + m_numClasses);

          countsPointer = counts[clss];
          for(attIndex = 0; attIndex < instanceIndex.m_NumSeqAttsSet; attIndex++) {
            AIndex = instanceIndex.m_SequentialAttIndexes[attIndex];
            if (!tempInstance.isMissing(AIndex)) {
              sumForCounts = Utils.sum(countsPointer[AIndex]);
              posteriors *= ((countsPointer[AIndex][tempAttributeValues[attIndex]] + 1)
                             / (sumForCounts + (double)tempInstance.attribute(AIndex).numValues()));
            }
          }

          if (posteriors > max) {
            maxIndex = clss;
            max = posteriors;
          }
        } // end of for

        if (max > 0) {
          tempClassValue = maxIndex;
        } else {
          tempClassValue = (int)tempInstance.missingValue();
        }
        // ###### END LOCAL CLASSIFY INSTANCE ###########

        // Record whether this instance was classified correctly. Note that
        // classIndex() cannot be used here: it is the index of the class
        // attribute, not the class value.
        if(tempClassValue == tempInstanceClassValue){
          errorFlags[instIndex] = true;
        } else {
          errorFlags[instIndex] = false;
          errors++;
        }

        // Restore this instance's contribution to the counts and priors.
        countsPointer = counts[tempInstanceClassValue];
        for(attIndex = 0; attIndex < instanceIndex.m_NumSeqAttsSet; attIndex++) {
          AIndex = instanceIndex.m_SequentialAttIndexes[attIndex];
          counts[tempInstanceClassValue][AIndex][tempAttributeValues[attIndex]]++;
        }

        priors[tempInstanceClassValue]++;
      }
    } // end of for
    // ###### END LEAVE ONE OUT #############
    return errors;
  }
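
  /*
   * Sketch (an assumption, not part of the original source): leaveOneOut
   * above avoids retraining a model for every held-out instance by
   * decrementing that instance's contribution from the shared count tables,
   * classifying, and then restoring the counts. The helper below shows the
   * same remove/classify/restore pattern in isolation on a one-dimensional
   * table of class priors.
   */
  private static int classifyWithHeldOutInstance(int [] priors, int heldOutClass) {
    priors[heldOutClass]--;               // remove the held-out instance
    int best = Utils.maxIndex(priors);    // predict the majority class of the rest
    priors[heldOutClass]++;               // restore the table for the next round
    return best;
  }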
 
  /**
   * Builds a simple Naive Bayes model on the indexed subset of the
   * training data. For more information on Naive Bayes, see<p>
   *
   * Richard Duda and Peter Hart (1973). <i>Pattern
   * Classification and Scene Analysis</i>. Wiley, New York.
   *
   * This method only computes m_Counts and m_Priors.
   *
   * @param instanceIndex indexes of the instances serving as training data
   * @exception Exception if m_Counts and m_Priors cannot be
   *  generated successfully
   */
  public void localNaiveBayes(Indexes instanceIndex) throws Exception {
    int attIndex = 0;
    int i, v, AIndex;
    int attVal = 0;
    int classVal = 0;
    Instance instance;

    instanceIndex.setSequentialDataset(true);

    // reset local counts
    for(classVal = 0; classVal < m_numClasses; classVal++) {
      // pointer into the counts matrix, fetched once per class for speed
      int [][] countsPointer1 = m_Counts[classVal];
      for(attIndex = 0; attIndex < m_numAtts; attIndex++) {
        Attribute attribute = m_Instances.attribute(attIndex);
        // same pointer trick one level down
        int [] countsPointer2 = countsPointer1[attIndex];
        for(attVal = 0; attVal < attribute.numValues(); attVal++)  {
          countsPointer2[attVal] = 0;
        }
      }
      m_Priors[classVal] = 0;
    }

    for(i = 0; i < instanceIndex.m_NumSeqInstsSet; i++) {
      instance = (Instance) m_Instances.instance(instanceIndex.m_SequentialInstIndexes[i]);
      for(attIndex = 0; attIndex < instanceIndex.m_NumSeqAttsSet; attIndex++) {
        AIndex = instanceIndex.m_SequentialAttIndexes[attIndex];
        m_Counts[(int)instance.classValue()][AIndex][(int)instance.value(AIndex)]++;
      }
      m_Priors[(int)instance.classValue()]++;
    }
  }
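
  /*
   * Sketch (an assumption, not part of the original source): the tables
   * filled in by localNaiveBayes are turned into Laplace-smoothed
   * probabilities by the classification methods in this class. The
   * conditional probability of attribute value v for attribute a given
   * class c is computed as below.
   */
  private double smoothedConditional(int classVal, int attIndex, int attVal, int numValues) {
    double sumForAttribute = Utils.sum(m_Counts[classVal][attIndex]);
    // add-one smoothing: every value gets one phantom observation
    return (m_Counts[classVal][attIndex][attVal] + 1) / (sumForAttribute + numValues);
  }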
    
  /**
   * Calculates the class membership probabilities
   * for the given test instance.
   *
   * @param instance the instance to be classified
   * @param instanceIndex indexes of the instances and attributes to be used.
   *
   * @return predicted class probability distribution
   * @exception Exception if distribution can't be computed
   */
  public double[] localDistributionForInstance(Instance instance, Indexes instanceIndex) throws Exception {
    
    double sumForPriors;
    double sumForCounts;
    int attIndex, AIndex;
    int numClassesOfInstance = instance.numClasses();

    instanceIndex.setSequentialDataset(true);
    // Calculate all the conditional probabilities.
    sumForPriors = Utils.sum(m_Priors) + numClassesOfInstance;
    for (int j = 0; j < numClassesOfInstance; j++) {
      // pointer to counts to make access more efficient in loop
      int [][] countsPointer = m_Counts[j];
      posteriorsArray[j] = (m_Priors[j] + 1) / (sumForPriors);
      for(attIndex = 0; attIndex < instanceIndex.m_NumSeqAttsSet; attIndex++) {
        AIndex = instanceIndex.m_SequentialAttIndexes[attIndex];
        sumForCounts = Utils.sum(countsPointer[AIndex]);
        if (!instance.isMissing(AIndex)) {
          posteriorsArray[j] *= ((countsPointer[AIndex][(int)instance.value(AIndex)] + 1) / (sumForCounts + (double)instance.attribute(AIndex).numValues()));
        }
      }
    }
    
    // Normalize probabilities
    Utils.normalize(posteriorsArray);
    
    return posteriorsArray;
  }
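
  /*
   * Usage sketch (hypothetical helper, not part of the original source):
   * mirrors the end of the main loop above, where the local model is rebuilt
   * on the selected attribute subset and then queried for a prediction.
   */
  private int predictWithLocalModel(Instance testInstance, Indexes subInstances)
    throws Exception {
    localNaiveBayes(subInstances);         // recompute m_Counts and m_Priors on the subset
    double [] dist = localDistributionForInstance(testInstance, subInstances);
    return Utils.maxIndex(dist);           // index of the most probable class
  }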
  
  /**
   * Significance test: cumulative binomial probability.
   *
   * @param r the number of successes observed.
   * @param n the number of trials.
   * @param p the probability of success on a single trial.
   *
   * @return the probability of obtaining r or fewer successes out of n
   * trials if the probability of a success is p.
   */
  public double binomP(double r, double n, double p) throws Exception {
    
    if (n == r) return 1.0;
    return Statistics.incompleteBeta(n-r, r+1.0, 1.0-p);
  }
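
  /*
   * Sketch (an assumption, not part of the original source): binomP computes
   * the lower binomial tail P(X <= r) for X ~ Binomial(n, p) via the identity
   * P(X <= r) = I_{1-p}(n - r, r + 1), where I is the regularized incomplete
   * beta function. The helper below computes the same tail by direct
   * summation, which can be used to sanity-check binomP for small n.
   */
  private static double binomTailBySummation(int r, int n, double p) {
    double total = 0.0;
    double coefficient = 1.0;              // C(n, 0)
    for (int k = 0; k <= r; k++) {
      total += coefficient * Math.pow(p, k) * Math.pow(1.0 - p, n - k);
      coefficient = coefficient * (n - k) / (k + 1);  // C(n, k) -> C(n, k + 1)
    }
    return total;
  }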
  
  /**
   * Main method for testing this class.
   *
   * @param argv the options
   */
  public static void main(String [] argv) {
    
    Classifier scheme;
    
    try {
      scheme = new LBR();
      System.out.println(Evaluation.evaluateModel(scheme, argv));
    } catch (Exception e) {
      System.err.println(e.getMessage());
    }
  }
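
  /*
   * Usage sketch (hypothetical file name; standard Weka evaluation options):
   *
   *   java weka.classifiers.LBR -t train.arff -x 10
   *
   * trains and cross-validates the classifier via Evaluation.evaluateModel.
   */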
}
