⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 id3inducer.java

📁 Decision Tree 决策树算法ID3 数据挖掘 分类
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
               if (mutualInfoAboveMean || !foundScoreAboveMean) {
//                  MLJ.ASSERT(meanMutualInfo != Globals.UNDEFINED_REAL);
                  logOptions.LOG(3, ", and that "+split.score()+" > "+meanMutualInfo+'\n');
                  double score = split.score();
                  logOptions.LOG(3,"Testing attribute "+attrNum+" ("+schema.attr_name(attrNum)+"): "+'\n');
//                  logOptions.LOG(3,split);
//                  logOptions.LOG(3,"Comparing score ("+score+") against max score + eps ("+
//                     (maxScore+MLJ.realEpsilon())+")"+'\n');
                  if (score > maxScore + MLJ.realEpsilon ||
                     (!foundScoreAboveMean && mutualInfoAboveMean)) {
                     logOptions.LOG(2,"Chose attribute "+attrNum+'\n');
                     maxScore = score;
                     bestSplit[0] = split;
                     logOptions.LOG(3,"max score becomes "+maxScore+'\n');
                  }
                  if (mutualInfoAboveMean)
                     foundScoreAboveMean = true;
               } else{
                  logOptions.LOG(3,"...not above mean"+'\n');
               }
            }
         }
      }
   }

   /** Reports whether every attribute of the training set's schema is
    * multi-valued. An attribute is multivalued if an instance can have more
    * than one value at one time. Attributes are examined in index order and the
    * scan ends at the first attribute that is not multi-valued. An attribute
    * that can be cast to real counts as not multi-valued; an attribute that can
    * be cast to nominal is judged by multi_val_attribute(). MLJ.Abort() is
    * invoked for an attribute that is neither real nor nominal.
    * @return False as soon as a real attribute, or a nominal attribute that is
    * not multi-valued, is encountered. Otherwise True is returned (this
    * includes a schema with no attributes).
    */
   public boolean all_attributes_multi_val() 
   {
      Schema schema = TS.get_schema();
      for (int attrNum = 0; attrNum < schema.num_attr(); attrNum++) {
         // A real-valued attribute settles the answer immediately.
         if (schema.attr_info(attrNum).can_cast_to_real())
            return false;

         if (schema.attr_info(attrNum).can_cast_to_nominal()) {
            // One nominal attribute that is not multi-valued is enough.
            if (!multi_val_attribute(attrNum))
               return false;
         } else {
            // Neither real nor nominal: unsupported attribute type.
            MLJ.Abort();
         }
      }
      return true;
   }
   /** Compute the split information for a given attribute when the caller has
    * no real-valued columns to offer. This simply forwards to the
    * three-argument split_info(), passing null for realColumns.
    * @param attrNum The index number of this attribute column.
    * @param split The SplitAttr that receives the split information.
    */
   public void split_info(int attrNum, SplitAttr split)
   {
      // Typed null: makes the chosen overload explicit.
      RealAndLabelColumn[] noRealColumns = null;
      split_info(attrNum, split, noRealColumns);
   }

   /** Compute the split information for a given attribute and store it in the
    * supplied SplitAttr.
    * <P>
    * A nominal attribute is split with make_nominal_split(), using the
    * inducer's split score criterion, provided SplitAttr.ok_to_split() accepts
    * it. A real attribute is split with make_real_split() on its column, which
    * is sorted first; the threshold search runs under the default split score
    * criterion with the MDL penalty switched off, after which the inducer's own
    * criterion and MDL setting are put back on the split. MLJ.Abort() is
    * invoked for an attribute that is neither nominal nor real.
    * @param attrNum     The index number of this attribute column. Must lie in
    * the range 0 to num_attr() - 1 of the training set's schema; a fatal error
    * is reported otherwise.
    * @param split       The SplitAttr that is configured and filled in.
    * @param realColumns The columns of values specified for each attribute over
    * all instances in a data set, indexed by attribute number. It is only
    * consulted for real attributes; a fatal error is reported if it, or its
    * entry for attrNum, is null when the attribute is real.
    */
   public void split_info(int attrNum, SplitAttr split,
                          RealAndLabelColumn[] realColumns) 
   {
      Schema schema = TS.get_schema();

   // Valid attribute indices run from 0 to num_attr()-1, so an attrNum equal
   //   to num_attr() has to be rejected here too, before it is used to look
   //   up the attribute below.
      if (attrNum < 0 || attrNum >= schema.num_attr())
         Error.fatalErr("ID3Inducer::split_info: attrNum " + attrNum +
            " not in range 0 to " + (schema.num_attr() - 1));
   
      logOptions.LOG(2,"Testing attribute "+attrNum+" ("+schema.attr_name(attrNum)+"): ");

   // split needs to know if the Minimum Description Length Adjustment
   //   for continuous attributes needs to be applied.
      split.set_penalize_by_mdl(get_cont_mdl_adjust());
      double minSplit = Entropy.min_split(instance_list(),
                             get_lower_bound_min_split_weight(),
                             get_upper_bound_min_split_weight(),
                             get_min_split_weight_percent());

   // Nominal attributes use the same minimum split weight as reals unless
   //   the nominal-lower-bound-only option is set, in which case only the
   //   lower bound applies and it must be at least one.
      double nominalMinSplit = minSplit;
      if (get_nominal_lbound_only()) {
         nominalMinSplit = get_lower_bound_min_split_weight();
         if (nominalMinSplit < 1)
            Error.fatalErr("split_info:  lowerBoundMinSplit (" +
               nominalMinSplit+") must be at least one");
      }
   // The weights are truncated to int for logging only.
      logOptions.LOG(4,"Min split: "+(int)minSplit+", nominal min split: "+(int)nominalMinSplit+'\n');

   // If the attribute is nominal and has more than one value, then use
   //   mutual_info or gain_ratio to determine the information gained by
   //   splitting on it.
      if (schema.attr_info(attrNum).can_cast_to_nominal()) {
         split.set_split_score_criterion(get_split_score_criterion());
         if (SplitAttr.ok_to_split(attrNum, TS.counters(), nominalMinSplit)) {
            split.make_nominal_split(instance_list(), attrNum);
            logOptions.LOG(2,"Criterion score for attribute "+attrNum+" is "+split.score()+'\n');
            MLJ.ASSERT(split.score() >= 0, "ID3Inducer::split_info(): split.score() >= 0");
         }
      }
   // Otherwise the attribute should be real and real_mutual_info is used
   //   to determine the threshold which gives the maximum information
   //   gain when split on.
      else if (schema.attr_info(attrNum).can_cast_to_real()) {
      // Find the best threshold (in find_best_threshold(), invoked through
      //   make_real_split()) based on the default split criterion; then
      //   set the criterion back.
      // @@ This will be an option for reals.
         if (realColumns == null)
            Error.fatalErr("ID3Inducer::split_info: can't split on real attribute "+
               schema.attr_name(attrNum)+" -- realColumns is null");
         RealAndLabelColumn column = realColumns[attrNum];
         if (column == null)
            Error.fatalErr("ID3Inducer::split_info: can't split on real attribute "+
               schema.attr_name(attrNum)+" -- the given column is null");
      // Note that sort() is called on the caller's column object.
         column.sort();

      // Temporarily switch to the default criterion without MDL penalty
      //   for the threshold search ...
         split.set_split_score_criterion(SplitScore.defaultSplitScoreCriterion);
         split.set_penalize_by_mdl(false);
         split.make_real_split(column, attrNum, minSplit,
            get_smooth_inst(), get_smooth_factor());
      // ... then restore the inducer's own criterion and MDL setting.
         split.set_split_score_criterion(get_split_score_criterion());
         split.set_penalize_by_mdl(get_cont_mdl_adjust());
         if (split.exist_split()) {
            logOptions.LOG(2,"Threshold: "+split.threshold()+", criterion score: "+
               split.score()+", entropy: "+split.get_entropy()+'\n');
            MLJ.ASSERT(split.score() >= 0, "ID3Inducer::split_info(): split.score() >= 0");
         }
      } else
         MLJ.Abort();
      
      if (split.split_type() == SplitAttr.noReasonableSplit)
         logOptions.LOG(2,"No reasonable split"+'\n');
   }

//   public void set_unknown_edges(){ }

//   public void display_struct(){ }

//Additions by JL

//   private NO_DEFAULT_OPS(ID3Inducer);

   /** Create the Inducer used for a recursive call. Since TDDTInducer is an
    * abstract class, it can't do the recursive call itself. The new inducer is
    * given a copy of this inducer's options and a level one greater than this
    * inducer's level.
    * @param descr   The description of the new subinducer.
    * @param aCgraph A previously defined Cgraph for the inducer.
    * @return The new sub-ID3Inducer created.
    */
   public TDDTInducer create_subinducer(String descr, CGraph aCgraph) 
   {
      ID3Inducer child = new ID3Inducer(descr, aCgraph);

      // Options are copied first; the level is set afterwards.
      child.copy_options(this);
      child.set_level(1 + get_level());

      return child;
   }

   /** Build the categorizer for the attribute described by the given split.
    * The specified SplitAttr is assumed to be valid. For a nominal attribute
    * an AttrCategorizer is built and catNames receives "?" for the unknown
    * category followed by the attribute's values. For a real attribute with a
    * threshold split a ThresholdCategorizer is built and catNames receives its
    * real edge strings. Any other combination is reported as a fatal error.
    * When debugging is enabled, the categorizer's name is extended with the
    * instance count, entropy, mutual information, gain ratio, the MDL penalty
    * (if penalizing by MDL) and the score.
    * @param split    The attribute to be split on.
    * @param catNames The category names that an instance may be categorized
    * under. Names are inserted by index into this list.
    * @return The NodeCategorizer containing a categorizer that splits on this
    * attribute, or null on the paths that follow a fatal error report.
    */
   public NodeCategorizer split_to_cat(SplitAttr split,
                                       LinkedList catNames) 
   {
      final int attrNum = split.get_attr_num();
      final Schema schema = TS.get_schema();

      // Name of the categorizer; decorated with split statistics when
      // debugging. The "\\n" sequences are emitted as a literal backslash-n.
      String label = schema.attr_info(attrNum).name();
      if (get_debug()) {
         label += " (#=" + TS.num_instances();
         label += "\\nENT=" + split.get_entropy();
         label += "\\nMI=" + split.get_mutual_info(false, false);
         label += "\\nGAIN=" + split.get_gain_ratio(false);
         if (split.get_penalize_by_mdl())
            label += "\\nMDL penalty=" + split.penalty();
         label += "\\nSCORE=" + split.score() + ")";
      }

      // Nominal attribute: one category per value, plus the unknown category.
      if (schema.attr_info(attrNum).can_cast_to_nominal()) {
         NominalAttrInfo nominalInfo = schema.attr_info(attrNum).cast_to_nominal();
         int limit = nominalInfo.num_values() + 1; // +1 for unknown
         catNames.add(Globals.UNKNOWN_CATEGORY_VAL, "?");

         // UNKNOWN_NOMINAL_VAL is now 0 instead of -1, so get_value() is
         // indexed directly by the value index, starting at 1. - JL
         int cat = Globals.FIRST_CATEGORY_VAL;
         int valueIndex = 1;
         while (valueIndex < limit) {
            catNames.add(cat, nominalInfo.get_value(valueIndex));
            valueIndex++;
            cat++;
         }
         return new AttrCategorizer(schema, attrNum, label);
      }

      // Anything that is neither nominal nor real is not supported.
      if (!schema.attr_info(attrNum).can_cast_to_real()) {
         Error.fatalErr("ID3Inducer::split_to_cat: unrecognized attribute type"+
            " for attribute "+attrNum);
         return null;
      }

      // A real attribute can only be handled through a threshold split.
      if (split.split_type() != SplitAttr.realThresholdSplit) {
         Error.fatalErr("ID3Inducer::split_to_cat: bad split type for "+
            "continuous attribute "+attrNum);
         return null;
      }

      logOptions.LOG(5, split.threshold()+""+'\n');
      ThresholdCategorizer thresholdCat =
         new ThresholdCategorizer(schema, attrNum, split.threshold(), label);
      String[] edgeNames = thresholdCat.real_edge_strings();
      for (int pos = 0; pos < edgeNames.length; pos++)
         catNames.add(pos, edgeNames[pos]);
      return thresholdCat;
   }

   /** Returns a reference to a copy of this ID3Inducer with the same settings.
    * The copy is produced by the ID3Inducer copy constructor.
    * @return A reference to the new ID3Inducer, typed as Inducer.
    */
   public Inducer copy() 
   {
      return new ID3Inducer(this);
   }

   /** Returns the class id of this inducer.
    * @deprecated This method should be replaced with Java's instanceof operator.
    * @return The ID3_INDUCER constant, the integer assigned to this inducer.
    */
   public int class_id()
   {
      return ID3_INDUCER;
   }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -