⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 naivebayescat.java

📁 Naive Bayes算法java代码
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
	    attrImp[i] = 0;
        }
	  else {
	    double condEnt = Entropy.cond_entropy(instList.counters().value_counts()[i], instList.counters().attr_counts()[i], instList.total_weight());

	    if(condEnt < 0 && -condEnt < MLJ.realEpsilon) {
	       condEnt = 0;
          }
	    attrImp[i] = 100 - 100 * (condEnt / ent);
	    if(attrImp[i] < 0 && attrImp[i] >= -1000 * MLJ.realEpsilon) {
	       attrImp[i] = 0;  // avoid small negatives
          }
	    else if(attrImp[i] < 0) {
	       Error.fatalErr("compute_importance: attribute " + i +
		      " had importance " + attrImp[i] + "which is severly negative");
          }
	  }
      }
      else {
	 Error.fatalErr("compute_importance: attribute " + i + " has " +
	    "unsupported type.  Must be real or nominal.");
      }
    }
    return attrImp;
  }

  /** Computes the distance metrics for all attributes and records, per
    * attribute, whether "unknown" should be treated as a value.
    * Should only be called once; calling it again is a fatal error.
    */
  private void compute_kl_distances() {
    if(unkIsVal != null) {
      Error.fatalErr("NaiveBayesCat.compute_kl_distances: kl distances already computed");
    }
    unkIsVal = new boolean[get_schema().num_attr()];
    for(int i=0; i<get_schema().num_attr(); i++) {
      // KL distances are only defined for nominal attributes.
      if(!get_schema().attr_info(i).can_cast_to_nominal()) {
        // Fixed message context: this method is compute_kl_distances,
        // not categorize.
        Error.fatalErr("NaiveBayesCat.compute_kl_distances: UNKNOWN_IS_VALUE is set and " +
            get_schema().attr_name(i) + " is a real value with unknowns.  " +
            "UNKNOWN_IS_VALUE settings of " +
            "yes and auto are not supported for undiscretized real values " +
            "with unknowns.");
      }
      double dist = kl_distance(i);
      // Treat "unknown" as a value only when the distance is informative.
      if(dist >= klThreshold) {
        logOptions.LOG(1, "k-l distance for attribute " + get_schema().attr_name(i)
            + " (" + dist + ") exceeds threshold" + endl);
        unkIsVal[i] = true;
      }
      else {
        unkIsVal[i] = false;
      }
    }
  }

  /** Copies the array of doubles stored in attrImportance and returns the
    * new array.  Used to copy NaiveBayesCat objects.
    * @author James Plummer added to package for compatibility.
    * @return a fresh copy of attrImportance, or null if none exists.
    */
  private double[] copyAttrImportance() {
    if (this.attrImportance == null) {
      return null;
    }
    // Array clone() performs the element-wise copy the original did by hand.
    return (double[])attrImportance.clone();
  }

  /** Produces a deep copy of the continNorm array (one NBNorm per
    * attribute/label pair).  Used to copy NaiveBayesCat objects.
    * @author James Plummer added for compatibility.
    * @return the new copy of continNorm, or null if none exists.
    */
  private NBNorm[][] copyContinNorm() {
    if ( this.continNorm == null ) {
      return null;
    }
    NBNorm[][] copy = new NBNorm[continNorm.length][];
    for (int row = 0; row < continNorm.length; row++) {
      copy[row] = new NBNorm[continNorm[row].length];
      // Each cell gets its own NBNorm so the copy shares no state.
      for (int col = 0; col < continNorm[row].length; col++) {
        copy[row][col] = new NBNorm(continNorm[row][col]);
      }
    }
    return copy;
  }

  /** Prints a readable representation of the Cat to the given stream.
    * @param stream destination writer; if null, nothing is printed.
    * @param dp     display preferences (not consulted here).
    */
  public void display_struct(BufferedWriter stream, DisplayPref dp) {
    // Guard: the original null-checked stream but then wrote to it
    // unconditionally, guaranteeing a NullPointerException on null input.
    if (stream == null) {
      return;
    }
    try {
      logOptions.set_log_stream(stream);
      stream.write("Simple NaiveBayes Cat " + this.description() +
                   " categorizing using prevalence data in BagCounter: "  + endl +
                   nominCounts + endl);

      if ( continNorm != null ) {
        stream.write("Categorizing uses Normal Density to estimate probability" +
            " of continuous attributes.  The mean, variance, and standard" +
            " deviation of each attribute,label combination is: " + endl);
        for (int i = 0; i < numAttributes; i++) {
          if ( nominCounts.value_counts()[i] != null )    // nominal attribute
            stream.write("Attribute " + i + ":" + " Nominal Attribute." + endl);
          else {
            stream.write("Attribute " + i + ":" + endl);
            for (int j = 0; j < num_categories(); j++)
              stream.write("  Label " + j + "\t\t" + continNorm[i][j].mean +
                           "\t" + continNorm[i][j].var + endl);
          }
        }
      }
    }
    catch (Exception e) {
      e.printStackTrace();
    }
  }

  /** Returns the largest value in an array of doubles.
    * @author James Plummer added to match C++ functionality.
    * @param d the array of doubles; must contain at least one element.
    * @return the maximum value found in d.
    */
  public static double findMax(double[] d) {
    double max = d[0];
    int idx = 1;
    while (idx < d.length) {
      if (d[idx] > max) {
        max = d[idx];
      }
      idx++;
    }
    return max;
  }

  /** Returns the smallest value in an array of doubles.
    * @author James Plummer added to match C++ functionality.
    * @param d the array of doubles; must contain at least one element.
    * @return the minimum value found in d.
    */
  public static double findMin(double[] d) {
    double min = d[0];
    int idx = 1;
    while (idx < d.length) {
      if (d[idx] < min) {
        min = d[idx];
      }
      idx++;
    }
    return min;
  }

  /** Helper: generates a single conditional probability, applying the
    * Laplace/m-estimate correction or evidence projection when enabled.
    * Evidence projection is not used if there's no data (labelCount == 0).
    * @param labelValCount weight of instances with this label and value.
    * @param labelCount    total weight of instances with this label.
    * @param numAttrVals   number of values the attribute can take.
    * @param numAttr       number of attributes (not used in this body).
    * @return the estimated conditional probability.
    */
  private double generate_cond_probability(double labelValCount, double labelCount,
                                           int numAttrVals, int numAttr) {
    if (useEvidenceProjection && labelCount > 0) {
      double evidenceCap = MLJ.log_bin(1.0 + trainWeight*evidenceFactor);
      return CatDist.single_evidence_projection(labelValCount, labelCount, evidenceCap);
    }
    if (useLaplace) {
      // An m-estimate of zero means "use 1/trainWeight" as the prior weight.
      double m = (mEstimateFactor == 0.0) ? 1.0 / trainWeight : mEstimateFactor;
      return (labelValCount + m) / (labelCount + numAttrVals * m);
    }
    if (labelValCount == 0) {
      // No matching instances: fall back to the configured no-matches policy.
      if (noMatchesFactor >= 0) {
        return noMatchesFactor / trainWeight;
      }
      if (noMatchesFactor == -1) {
        return labelCount / trainWeight / trainWeight;
      }
      if (noMatchesFactor == -2) {
        return labelCount / trainWeight
            / (trainWeight * this.get_schema().num_attr());
      }
      Error.fatalErr("NaiveBayesCat.generate_cond_probability: noMatchesFactor has illegal value of " +
          noMatchesFactor);
      return 0;
    }
    // labelCount == 0 would imply labelValCount == 0, handled above.
    MLJ.ASSERT( (labelCount > 0), "NaiveBayesCat.generate_cond_probability()");
    return labelValCount / labelCount;
  }

  /** Helper: generates a single prior probability P(label), allowing for
    * the Laplace/m-estimate correction or evidence projection.
    * @param labelCount total weight of instances with this label.
    * @param numLabels  number of label values.
    * @return the estimated prior probability.
    */
  private double generate_probability_prior(double labelCount, int numLabels) {
   if(useEvidenceProjection) {
      double maxEvidence = MLJ.log_bin(1.0 + trainWeight*evidenceFactor);
      return CatDist.single_evidence_projection(labelCount, trainWeight, maxEvidence);
   }
   else if(useLaplace) {
      // An m-estimate of zero means "use 1/trainWeight" as the prior weight.
      double effectiveMEstimate = mEstimateFactor;
      if(effectiveMEstimate == 0.0) {
	 effectiveMEstimate = 1.0 / trainWeight;
      }
      return (labelCount + effectiveMEstimate) / (trainWeight + numLabels * effectiveMEstimate);
   }
   else if(labelCount == 0)
      return 0;

   else {
      // labelCount > 0 is guaranteed here; the zero case was handled above.
      // Fixed assertion-message typo: was "generate_probablility".
      MLJ.ASSERT( (labelCount > 0), "NaiveBayesCat.generate_probability_prior()");
      return labelCount / trainWeight;
   }
  }

  /** Removed function: generate_viz(BufferedWriter, boolean[], int). */

  /** Accessors and mutators for the optional NaiveBayes settings. */
  public double get_evidence_factor() { return evidenceFactor; }
  public double get_kl_threshold() { return klThreshold; }
  public double get_m_estimate_factor() { return mEstimateFactor; }
  public double get_no_matches_factor() { return noMatchesFactor; }
  public int get_unknown_is_value() { return unknownIsValue; }
  public boolean get_use_evidence_projection() { return useEvidenceProjection; }
  public boolean get_use_laplace() { return useLaplace; }
  public void set_evidence_factor(double f) { evidenceFactor = f; }
  public void set_kl_threshold(double th) { klThreshold = th; }
  /** Initializes the probability array to the class priors P(L = l).
    * @param nominCounts  the BagCounters holding the label counts.
    * @param trainWeight  total training weight.
    * @param prob         output array of per-label probabilities.
    * @param useLaplace   whether to apply the Laplace correction.
    * @param useEvidenceProjection  whether to apply evidence projection.
    * @param evidenceFactor         factor used by evidence projection.
    */
  public static void init_class_prob(BagCounters nominCounts,
                      double trainWeight,
                      double[] prob, boolean useLaplace,
                      boolean useEvidenceProjection,
                      double evidenceFactor)
  {
    if (useEvidenceProjection) {
      for (int lv = 0; lv < prob.length; lv++)
        prob[lv] = nominCounts.label_count(lv);
      CatDist.apply_evidence_projection(prob, evidenceFactor, true);
    }
    else if (useLaplace) {
      int numLabels = prob.length - 1;

      // No laplace correction for the unknown label (fixes bug #526924):
      // its probability stays 0 and the real labels start at index 1.
      MLJ.ASSERT(nominCounts.label_count(Globals.UNKNOWN_CATEGORY_VAL) == 0,"NaiveBayesCat.init_class_prob()");
      prob[Globals.UNKNOWN_CATEGORY_VAL] = 0;
      for (int lv = 1; lv < prob.length; lv++)
        prob[lv] = (nominCounts.label_count(lv) + 1) / (trainWeight + numLabels);
    }
    else {
      for (int lv = 0; lv < prob.length; lv++)
        prob[lv] = nominCounts.label_count(lv) / trainWeight;
    }

    // Sanity check: the priors must sum to (approximately) 1.0.
    double probSum = sumArray(prob);
    MLJ.verify_approx_equal(probSum,1,"NaiveBayesCat.init_class_prob: prob does not sum to one");
  }

  /** Computes the KL distance metric for a single attribute.
    * Always returns 0 when the attribute lacks minimum support
    * (weight of unknowns below 5).
    * @param attrNum the number of the attribute to compute distances for.
    * @return the KL distance for the attribute.
    */
  private double kl_distance(int attrNum)
  {
    // Single computation; the original fetched num_label_values() twice
    // into two differently-named variables.
    int numLabelValues = get_schema().num_label_values();
    double[] p = new double[numLabelValues];
    double[] q = new double[numLabelValues];
    if(!get_schema().attr_info(attrNum).can_cast_to_nominal()) {
      Error.fatalErr("NaiveBayesCat.kl_distance: this function does not work " +
         "for real attributes");
    }
    double support = nominCounts.attr_count(attrNum, Globals.UNKNOWN_CATEGORY_VAL);
    // Message corrected: the check rejects zero as well as negative weight.
    MLJ.verify_strictly_greater(trainWeight,0.0,"NaiveBayesCat.kl_distance: " +
                               "total train weight must be strictly positive");
    if (support < 5) { // @@ make this an option
      return 0;
    }

    for(int i=0; i < numLabelValues; i++) {
      // Compute q(C) and p(C|?) with laplace correction so we
      //   avoid zeros and can do KL distance.
      q[i] = (nominCounts.label_count(i) + 1)/(trainWeight + numLabelValues);

      MLJ.ASSERT(support > 0,"NaiveBayesCat.kl_distance()");
      p[i]=(nominCounts.val_count(i, attrNum, Globals.UNKNOWN_CATEGORY_VAL) + 1)/(support + numLabelValues);
    }

    // Log the array contents; plain "" + p would only print the arrays'
    // identity hashes, not their values.
    logOptions.LOG(3, "p=" + java.util.Arrays.toString(p) +
        "\nq=" + java.util.Arrays.toString(q) + endl);

    double dist = this.kullback_leibler_distance(p, q);
    logOptions.LOG(2, "k-l distance for attribute " + this.get_schema().attr_name(attrNum) +
        " (" + attrNum + "): " + dist + endl);

    return dist;
  }  /** Removed function *//*  boolean operator==(Categorizer rhs) {  }*/  /** Removed function *//*  public void make_persistent(PerCategorizer_ dat) {
*/  /** Compute a Kullback Leibler distance metric given an array    * of p(x) and q(x) for all x.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -