
📄 naivebayescat.java

📁 Naive Bayes algorithm Java code
💻 JAVA
📖 Page 1 of 3
package nb;

import shared.AttrInfo;
import shared.AugCategory;
import shared.BagCounters;
import shared.CatDist;
import shared.Categorizer;
import shared.DisplayPref;
import shared.Entropy;
import shared.Error;
import shared.Globals;
import shared.Instance;
import shared.InstanceList;
import shared.MLJ;
import shared.NominalAttrInfo;
import shared.Schema;
import shared.StatData;
import java.io.BufferedWriter;
import java.io.IOException;

/** This categorizer returns the category (label) that had the
 * greatest relative probability of being correct, assuming
 * independence of attributes. The relative probability of a label
 * is calculated by multiplying the relative probability for
 * each attribute. The calculation of relative probability for a
 * label on a single attribute depends on whether the attribute
 * is discrete or continuous.
 * By Bayes' Theorem, P(L=l | X1=x1, X2=x2, ... Xn=xn)
 * = P(X1=x1, X2=x2, ... Xn=xn | L=l)*P(L=l)/P(X)
 * where P(X) is P(X1=x1, ..., Xn=xn).
 * Since P(X) is a constant independent of the classes, we
 * can ignore it.
 * The Naive Bayesian approach assumes complete independence
 * of the attributes GIVEN the label, thus
 * P(X1=x1, X2=x2, ... Xn=xn | L=l) =
 * P(X1=x1|L=l)*P(X2=x2|L=l)*...*P(Xn=xn|L=l)
 * and P(X1=x1|L=l) = P(X1=x1 ^ L=l)/P(L=l), where this
 * quantity is approximated from the data.
 * When the computed probabilities for two labels have the same
 * value, we break the tie in favor of the most prevalent label.
 *
 * If the instance being categorized has the first attribute = 1,
 * and in the training set label A occurred 20 times, 10 of
 * which had value 1 for the first attribute, then the
 * relative probability is 10/20 = 0.5.
 *
 * For continuous (real) attributes, the relative probability
 * is based on the Normal distribution of the values of the
 * attribute on training instances with the label. The actual
 * calculation is done with the Normal density; constants,
 * which do not affect the relative probability between labels,
 * are ignored. For example, say 3 training instances have
 * label 1 and these instances have the following values for a
 * continuous attribute: 35, 50, 65. The program would use the
 * mean and variance of this "sample" along with the attribute
 * value of the instance that is being categorized in the
 * Normal density equation. The evaluation of the Normal
 * density equation, without constant factors, provides the
 * relative probability.
 *
 * Unknown attributes are skipped over.
 *
 * Assumptions: This method calculates the probability of a label as the
 * product of the probabilities of each attribute.
 * This assumes that the attributes are independent, a condition
 * not likely to correspond to reality. Thus the "Naive" of the title.
 * This method assumes that all continuous attributes have a
 * Normal distribution for each label value.
 *
 * Comments: For nominal attributes, if a label does not have
 * any occurrences for a given attribute value
 * of the test instance, a probability of
 * noMatchesFactor * ( 1 / # instances in training set )
 * is used.
 *
 * For nominal attributes, if an attribute value does not
 * occur in the training set, the attribute is skipped
 * in the categorizer, since it does not serve to
 * differentiate the labels.
 *
 * The code can handle dealing with unknowns as a special
 * value by doing the is_unknown check only in the real attribute
 * case.
 *
 * Helper class NBNorm is a simple structure to hold the
 * parameters needed to calculate the Normal distribution
 * of each (attribute, label) pair. The NBNorms are stored in
 * an Array2 table "continNorm" which is indexed by attribute
 * number and label value.
 *
 * For continuous attributes the variance must not equal 0 since
 * it is in the denominator. If the variance is undefined for
 * a label value (e.g. if a label has only one instance
 * in the training set), NaiveBayesInd will declare the
 * variance to be defaultVariance, a static variable. In
 * cases where the variance is defined but equal to 0,
 * NaiveBayesInd will declare the variance to be epsilon,
 * a very small static variable.
 *
 * For continuous attributes, if a label does not occur in
 * the training set, a zero relative probability is
 * assigned. If a label occurs in the training set but only
 * has unknown values for the attribute, noMatchesFactor is
 * used as in the nominal attribute case above.
 *
 * Complexity: categorize() is O(ln) where l = the number of categories
 * and n = the number of attributes.
 *
 * @author James Plummer 5/15/2001 Ported to Java
 * @author Eric Bauer and Clay Kunz 5/24/1996 Added Laplace correction
 * @author Robert Allen 12/03/94 Initial revision
 */
public class NaiveBayesCat extends Categorizer {
  public static final String endl = "\n";
  // Member data (also see public data)
  private BagCounters nominCounts;		// hold data on nominal attributes
  private NBNorm[][] continNorm;		        // hold data on real attributes
  private double trainWeight;
  private int numAttributes;
  private boolean useLaplace;                     // turn on to activate Laplace correction
  private double mEstimateFactor;                // noise in Laplace correction
  private double[] attrImportance;               // importance values per attribute
  private boolean[] unkIsVal;               // per-attribute decisions: should unknowns be treated as special values?

  /** Ported from the C++ equivalent:
   *   enum UnknownIsValueEnum { unknownNo, unknownYes, unknownAuto };
   */
  public static final int unknownNo = 1;
  public static final int unknownYes = 2;
  public static final int unknownAuto = 3;
  private int unknownIsValue; // 1, 2, 3.
  private double klThreshold;
  /** Fraction of a single occurrence to use in cases when a label
   * has no occurrences of a given nominal value in the training set:
   */
  private double noMatchesFactor;
  /** If true, evidence projection is used. */
  private boolean useEvidenceProjection;
  /** The scale factor to use with evidence projection. */
  private double evidenceFactor;
  /** Categorizer option defaults. */
  public static final double defaultMEstimateFactor = 1.0;
  public static final boolean defaultLaplaceCorrection = false;
  public static final int defaultUnknownIsValue = unknownNo;
  public static final double defaultKLThreshold = 0.1;
  public static final double defaultNoMatchesFactor = 0.0;
  public static final boolean defaultUseEvidenceProjection = false;
  public static final double defaultEvidenceFactor = 1.0;
  /** Value to use for variance when the actual variance is 0: */
  public static final double epsilon = .01;
  /** Value to use for variance when the actual variance is undefined
   * because there is only one occurrence.
   */
  public static final double defaultVariance = 1.0;
  /** Constructor.
   * @param dscr - the description of this Inducer.
   * @param instList - training data.
   */
  public NaiveBayesCat(String dscr, InstanceList instList) {
    super(instList.num_categories(), dscr, instList.get_schema());
    nominCounts = instList.counters();
    trainWeight = instList.total_weight();
    numAttributes = instList.num_attr();
    logOptions.LOG(3, "NBC . . numAttributes = "+numAttributes);
    useLaplace = defaultLaplaceCorrection;
    mEstimateFactor = defaultMEstimateFactor;
    unkIsVal = null;
    unknownIsValue = defaultUnknownIsValue;
    klThreshold = defaultKLThreshold;
    noMatchesFactor = defaultNoMatchesFactor;
    useEvidenceProjection = defaultUseEvidenceProjection;
    evidenceFactor = defaultEvidenceFactor;
    attrImportance = this.compute_importance(instList);
    continNorm = this.compute_contin_norm(instList);
  }

  /** Copy constructor.
   * @param source - the NaiveBayesCat to copy.
   */
  public NaiveBayesCat(NaiveBayesCat source) {
    super(source.num_categories(), source.description(), source.get_schema());
    nominCounts = new BagCounters(source.nominCounts);
    continNorm = source.copyContinNorm();
    attrImportance = source.copyAttrImportance();
    trainWeight = source.trainWeight;
    numAttributes = source.numAttributes;
    useLaplace = source.useLaplace;
    mEstimateFactor = source.mEstimateFactor;
    unkIsVal = null;
    unknownIsValue = source.unknownIsValue;
    klThreshold = source.klThreshold;
    noMatchesFactor = source.noMatchesFactor;
    useEvidenceProjection = source.useEvidenceProjection;
    evidenceFactor = source.evidenceFactor;
  }

  /** Categorizes a single instance based on the training data.
   * @param instance - the instance to categorize.
   * @return the predicted category.
   */
  public AugCategory categorize(Instance instance) {
    CatDist cDist = score(instance);
    AugCategory cat = cDist.best_category();
    return cat;
  }

  /** Simple method to return an ID.
   * @return an int representing this Categorizer.
   * @deprecated CLASS_NB_CATEGORIZER has been deprecated
   */
  public int class_id() { return CLASS_NB_CATEGORIZER; }
  /** Returns a deep copy of this NaiveBayesCat.
   * @return the copy of this Categorizer.
   */
  public Object clone() {
    if (!(this instanceof NaiveBayesCat)) {
      Error.fatalErr("NaiveBayesCat.clone: invoked for improper class");
    }
    return new NaiveBayesCat(this);
  }

  /** Computes the norms of the continuous attributes.
   * @param instList - the instances to calculate from.
   * @return the array[][] of NBNorms.
   */
  public static NBNorm[][] compute_contin_norm(InstanceList instList) {
    int contAttrCount = 0;
    int numCategories = instList.num_categories();
    Schema schema = instList.get_schema();
    int numAttributes = schema.num_attr();
   
    // start labels at -1 for unknown
    NBNorm[][] normDens = new NBNorm[numAttributes][numCategories + 1]; // no initial value
    for (int m=0; m<normDens.length;m++) {
      for (int n=0; n<normDens[m].length;n++) {
        normDens[m][n] = new NBNorm();
        normDens[m][n].set_mean_and_var(0,0);
      }
    }
      
    // loop through each attribute, and process all instances for each
    // continuous one
    // loop through each attribute, and process all instances for each
    // continuous one
    for (int attrNum = 0; attrNum < numAttributes; attrNum++) {
      AttrInfo attrinfo = schema.attr_info(attrNum);
      if (attrinfo.can_cast_to_real()) {
        // this is a continuous attribute
        contAttrCount++;

        // read each occurrence in the list and feed the stats for the attribute
        StatData[] continStats = new StatData[numCategories + 1];
        for (int j = 0; j < continStats.length; j++) {
          continStats[j] = new StatData();
        }
        // (the C++ original iterated with "for (ILPix pix(instList); pix; ++pix)")
        for (int i = 0; i < instList.num_instances(); i++) {
          Instance inst = new Instance((Instance)instList.instance_list().get(i));
          // for some reason the label values for the instances are one number
          // higher than the actual value
          int labelVal = schema.label_info().cast_to_nominal().get_nominal_val(inst.get_label());
          MLJ.ASSERT(labelVal < numCategories, " NaiveBayesCat.compute_contin_norm()");

          // Ignore unknowns.
          if (!attrinfo.is_unknown(inst.get_value(attrNum))) {
            double value = attrinfo.get_real_val(inst.get_value(attrNum));
            continStats[labelVal].insert(value);
          }
        }

        double mean;
        double var;
        // extract Normal density parameters into the normDens table
        for (int label = 0; label < numCategories; label++) {
          if (continStats[label].size() == 0) {
            mean = 0;
            var = defaultVariance;
          }
          else {
            mean = continStats[label].mean();
            if (continStats[label].size() == 1)
              var = defaultVariance;
            else if ((var = continStats[label].variance(0)) <= 0)   // var == 0
              var = epsilon;
          }
          normDens[attrNum][label].set_mean_and_var(mean, var);

          //@@ pass in a log option?
          //LOG(3, " Continuous Attribute # " << attrNum <<
          //", Label " << label << ": Mean = " << mean <<
          //", Variation = " << var << endl );
        }
      } // end of handling this continuous attribute
    }   // end of loop through all attributes

    if (contAttrCount == 0) {  // no continuous attributes found
      normDens = null;
    }
    return normDens;
  }

  /** Computes importance values for each nominal attribute using
   * the mutual_info (entropy).
   * Static function; used as a helper by train() below.
   * @param instList - the instances to use.
   * @return the array[] of importance values.
   */
  public static double[] compute_importance(InstanceList instList) {
    double[] attrImp = new double[instList.num_attr()];
    for (int i = 0; i < attrImp.length; i++) {
      attrImp[i] = 0;
    }
   
    double ent = Entropy.entropy(instList);
    if (ent == Globals.UNDEFINED_REAL) {
      Error.fatalErr("compute_importance: undefined entropy");
    }
    if(ent < 0 && -ent < MLJ.realEpsilon) {
      ent = 0;
    }
    for (int i = 0; i < instList.num_attr(); i++) {
      if (instList.get_schema().attr_info(i).can_cast_to_real()) {
        attrImp[i] = 0;
      }
      else if (instList.get_schema().attr_info(i).can_cast_to_nominal()) {
        if (ent <= 0) {
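The class comment above works through two relative-probability calculations: the nominal count ratio (label A occurs 20 times, 10 with attribute value 1, giving 10/20 = 0.5) and the constant-free Normal density with the epsilon/defaultVariance fallbacks. As a minimal standalone sketch of that arithmetic (not part of the MLJ source; the class and method names here are illustrative):

```java
public class NaiveBayesSketch {
    // Fallbacks named in the original comments:
    static final double EPSILON = 0.01;         // variance defined but equal to 0
    static final double DEFAULT_VARIANCE = 1.0; // variance undefined (only one sample)

    // Relative probability of a nominal value given a label:
    // count(attr == value AND label) / count(label).
    static double nominalRelProb(int matchCount, int labelCount) {
        return (double) matchCount / labelCount;
    }

    // Normal density without the 1/sqrt(2*pi) constant, which cancels
    // when comparing labels; applies the zero-variance fallback.
    static double gaussianRelDensity(double mean, double var, double x) {
        if (var <= 0) var = EPSILON;
        double d = x - mean;
        return Math.exp(-d * d / (2 * var)) / Math.sqrt(var);
    }

    public static void main(String[] args) {
        // Nominal example from the comments: 10 of 20 occurrences match.
        System.out.println(nominalRelProb(10, 20)); // 0.5

        // Continuous example from the comments: values 35, 50, 65 for label 1.
        double mean = (35 + 50 + 65) / 3.0;        // 50.0
        double var  = (225.0 + 0.0 + 225.0) / 2.0; // sample variance = 225.0
        // The relative density is largest for test values near the mean.
        System.out.println(gaussianRelDensity(mean, var, 50));
    }
}
```

The multiplication of these per-attribute factors (times the label prior) is what `categorize()`/`score()` compare across labels; only relative magnitudes matter, which is why the constant factors are dropped.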