⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 splitattr.java

📁 c4.5 ID3 分类决策树 公用java包 share
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package shared;
import java.lang.*;
import java.util.*;

/** A class for determining, holding, and returning the information associated with
 * an attribute split.
 *
 */
public class SplitAttr extends SplitScore{
    //ENUM
    // Integer codes identifying the kind of split held by a SplitAttr. The
    // codes start at 1 and each one is also the index of its display name in
    // splitTypeEnum below.
    /** SplitTypeEnum value: no split is stored. reset() puts a SplitAttr in
     * this state.
     */
    static final public int noReasonableSplit = 1;		// SplitType enum
    /** SplitTypeEnum value: a split on a single real-valued threshold, as
     * stored by save_real_split().
     */
    static final public int realThresholdSplit = 2;		// SplitType enum
    /** SplitTypeEnum value: a split that uses the array of thresholds held in
     * TypeInfo.thresholds.
     */
    static final public int multiRealThresholdSplit = 3;	// SplitType enum
    /** SplitTypeEnum value: a nominal split (presumably the kind produced by
     * make_nominal_split() -- confirm against its body).
     */
    static final public int nominalSplit = 4;			// SplitType enum
    /** SplitTypeEnum value: a split described by the partition array held in
     * TypeInfo.partition.
     */
    static final public int partitionSplit = 5;			// SplitType enum
    //END ENUM
    
    /** Display names of the SplitTypeEnum values, indexed by the constants
     * above. Index 0 holds a blank placeholder because the codes start at 1.
     */
    public static String[] splitTypeEnum ={" ","no reasonable split","real threshold split",
    "multi real threshold split","nominal split","partition split"};
    
    private class TypeInfo {
        /** The partition for the split.
         */
        public int[] partition; // used in partitionSplit
        /** The threshold for the split.
         */
        public double theThreshold;
        /** Other threholds possible for the split.
         */
        public double[] thresholds;
        // any other info we need
    }
    
    // Number of the attribute this split refers to. The constructor sets it to
    // Globals.UNDEFINED_INT; set_attr_num() changes it and reset() leaves it alone.
    private int attributeNum;
    // Split-type-specific data (single threshold, threshold array, or partition).
    private TypeInfo typeInfo;
    // MDL penalty computed by save_real_split(); get_mutual_info() subtracts it
    // from threshold splits when asked to penalize. reset() sets it to 0.
    private double mdlPenalty;
    // TRUE once a split has been stored; reset() sets it back to FALSE.
    private boolean existSplit;
    // One of the split type codes (noReasonableSplit .. partitionSplit).
    private int splitType;
    // Flag stored by set_penalize_by_mdl(); FALSE after construction.
    private boolean penalizeByMDL;
    // Split index recorded by save_real_split(); Globals.UNDEFINED_INT after reset().
    private int realValuedSplitIndex;
    
    /** Creates a SplitAttr that holds no split. MDL penalizing is switched
     * off, the attribute number is undefined, and an empty TypeInfo is
     * allocated before the remaining state is initialised through reset().
     */
    public SplitAttr() {
        this.typeInfo = new TypeInfo();
        this.attributeNum = Globals.UNDEFINED_INT;
        this.penalizeByMDL = false;
        // reset() comes last so it initialises everything that is not set above.
        reset();
    }
    
    /** Reports which kind of split this SplitAttr currently holds.
     * @see #noReasonableSplit
     * @see #realThresholdSplit
     * @see #multiRealThresholdSplit
     * @see #nominalSplit
     * @see #partitionSplit
     * @return The split type code of this SplitAttr.
     */
    public int split_type() {
        return this.splitType;
    }
    
    /** Returns the mutual information of this split, optionally penalized
     * and/or normalized. The MDL penalty is applied only to real-threshold and
     * multi-real-threshold splits, and the penalized value is clamped at 0.
     * Normalization, if requested, is applied after the penalty.
     *
     * @param normalize TRUE if the mutual info is to be normalized by the
     * number of splits, FALSE otherwise.
     * @param penalize TRUE if the MDL penalty should be subtracted for
     * threshold splits, FALSE otherwise.
     * @return The mutual information for this attribute split.
     */
    public double get_mutual_info(boolean normalize, boolean penalize) {
        // Start from the unnormalized score; normalization happens at the end.
        double info = super.get_unnormalized_mutual_info();

        final boolean thresholdSplit = split_type() == realThresholdSplit
                || split_type() == multiRealThresholdSplit;
        // @@ Open question from the original author: multi-threshold splits
        // @@ should probably be penalized more, by (numSplits-1)*mdlPenalty.
        if (thresholdSplit && penalize) {
            // Clamping at 0 keeps the result non-negative. The alternative
            // (as in C4.5) would be to let it go negative and then mark the
            // split as noReasonableSplit.
            info = Math.max(0.0, info - mdlPenalty);
        }

        return normalize ? normalize_by_num_splits(info) : info;
    }
    
    /** Tells whether a split is currently stored in this SplitAttr.
     * @return TRUE if the SplitAttr contains a split, FALSE otherwise.
     */
    public boolean exist_split() {
        return this.existSplit;
    }
    
    /** Records whether the split should be penalized by minimum description
     * length (MDL).
     * @param choice TRUE if penalizing should occur, FALSE otherwise.
     */
    public void set_penalize_by_mdl(boolean choice) {
        this.penalizeByMDL = choice;
    }
    
    /** Does all processing for a real-threshold split on one attribute.
     * The SplitAttr is reset and given the attribute number, then
     * Entropy.find_best_threshold() is run on the column and its outputs are
     * handed to save_real_split(), which stores the threshold and the MDL
     * penalty when a split was found.
     *
     * @param column The column of real values for this attribute and their associated label values.
     * @param attrNum The number of the attribute.
     * @param minSplit The minimum split value.
     * @param smoothInst The instance to be smoothed towards.
     * @param smoothFactor The factor by which real values are smoothed.
     * @return TRUE if a split was found and stored, FALSE otherwise.
     */
    public boolean make_real_split(RealAndLabelColumn column, int attrNum,
            double minSplit, int smoothInst, double smoothFactor) {
        reset();
        set_attr_num(attrNum);

        // Holders passed to the threshold search and then to save_real_split().
        final DoubleRef bestThreshold = new DoubleRef(0.0);
        final IntRef bestIndex = new IntRef(0);
        final IntRef distinctCount = new IntRef(0);
        Entropy.find_best_threshold(column, minSplit, this, bestThreshold,
                bestIndex, distinctCount, smoothInst, smoothFactor);

        save_real_split(bestThreshold, bestIndex, distinctCount);
        return this.existSplit;
    }
    
    /** Stores the result of find_best_threshold() in this SplitAttr.
     * If the threshold is undefined, the split index is negative, or the
     * number of distinct values is not positive, the SplitAttr is reset and
     * no split is recorded. Otherwise the SplitAttr becomes a
     * realThresholdSplit holding the threshold, the split index and the MDL
     * penalty, which is log_bin(numDistinct) divided by the total weight.
     *
     * @param thresh The threshold to be saved.
     * @param splitIndex The index of the split to be saved.
     * @param numDistinct The number of distinct splits.
     */
    public void save_real_split(DoubleRef thresh, IntRef splitIndex, IntRef numDistinct) {
        existSplit = false;

        final boolean usable = thresh.value != Globals.UNDEFINED_REAL
                && splitIndex.value >= 0
                && numDistinct.value > 0;
        if (!usable) {
            reset();
            return;
        }

        // Clear the data of whatever split type was stored before switching type.
        free_type_info();
        splitType = realThresholdSplit;

        if (MLJ.approx_equal(total_weight(), 0.0)) {
            Error.fatalErr("SplitAttr::save_real_split: Total weight is near 0.   "
                    + "Cannot continue, as need to divide by it");
        }
        // The MDL penalty is always computed, so that it is already available
        // if callers turn penalizing on or off later (correctness over
        // efficiency).
        mdlPenalty = MLJ.log_bin(numDistinct.value) / total_weight();

        typeInfo.theThreshold = thresh.value;
        realValuedSplitIndex = splitIndex.value;
        existSplit = true;
    }
    
    /** Clears the type-specific data in typeInfo that belongs to the current
     * split type. The split type itself is left unchanged. An unrecognized
     * split type aborts via MLJ.Abort().
     */
    public void free_type_info() {
        switch (splitType) {
            case noReasonableSplit:
            case nominalSplit:
                // These types keep nothing in typeInfo.
                break;
            case realThresholdSplit:
                // theThreshold is a double, and save_real_split() compares
                // thresholds against UNDEFINED_REAL, so use the real-valued
                // sentinel here rather than the integer one.
                typeInfo.theThreshold = Globals.UNDEFINED_REAL;
                break;
            case multiRealThresholdSplit:
                typeInfo.thresholds = null;
                break;
            case partitionSplit:
                typeInfo.partition = null;
                break;
            default:
                MLJ.Abort();
        }
    }
    
    /** Returns this SplitAttr to the "no split" state. The superclass state
     * is reset, the split type becomes noReasonableSplit, and the MDL penalty
     * and real-valued split index are cleared. The attribute number is
     * deliberately left untouched.
     */
    public void reset() {
        super.reset();
        this.existSplit = false;
        this.splitType = noReasonableSplit;
        this.realValuedSplitIndex = Globals.UNDEFINED_INT;
        this.mdlPenalty = 0;
    }
    
    /** Sets the attribute number for this split. The number must be
     * non-negative or equal to Globals.UNDEFINED_INT; any other value is
     * reported through Error.fatalErr().
     * @param num The number of the new attribute.
     */
    public void set_attr_num(int num) {
        if (num < 0 && num != Globals.UNDEFINED_INT) {
            // The message names this method, so the diagnostic points at the right place.
            Error.fatalErr("SplitAttr::set_attr_num: attempt to set attribute number to "+
            num+", which is neither non-negative nor UNDEFINED_INT ("+Globals.UNDEFINED_INT+")");
        }
        attributeNum = num;
    }
    
    /** Helper function to do all processing for nominals. Nominal splits always exist.
     *
     * @param instList The InstanceList over which to make a nominal split.
     * @param attributeNumber The number of the attribute to be split.
     * @return TRUE if a nominal split exists, FALSE if not.
     */
    public boolean make_nominal_split(InstanceList instList,
    int attributeNumber) {
        reset();
        set_attr_num(attributeNumber);
        Schema schema = instList.get_schema();
        if (!schema.attr_info(attributeNumber).can_cast_to_nominal())
            Error.fatalErr("SplitAttr::make_nominal_split: attribute "+attributeNumber+
            " can not be cast to nominal");
        InstanceList ptr = instList;
        InstanceList[] array = new InstanceList[1];
        array[0] = ptr;
        double[][] sAndLDist =

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -