splitattr.java

来自「基于数据挖掘的决策树改进算法和贝叶斯改进算法」· Java 代码 · 共 453 行 · 第 1/2 页
JAVA
453 行
        Entropy.build_split_and_label_dist(array, attributeNumber);
        sAndLDist = super.set_split_and_label_dist(sAndLDist);
        return make_nominal_split();
    }
    
    
    /**
     * Performs the processing for a nominal split: marks this object as a
     * nominal split, presets the existence flag to true and then evaluates the
     * mutual information. Per the original inline note, the get_mutual_info
     * call is what determines whether the split exists, which is why
     * existSplit is re-read afterwards instead of returning a constant.
     *
     * @return The value of existSplit after the mutual information was evaluated.
     */
    public boolean make_nominal_split() {
        existSplit = true;
        splitType = nominalSplit;
        // Normalization is requested only under the normalizedMutualInfo
        // criterion; the second argument (penalize) is always false here.
        get_mutual_info(get_split_score_criterion() == normalizedMutualInfo, false);
        return existSplit;
    }
    
    /**
     * Checks whether it is OK to split on a nominal attribute, i.e. whether at
     * least two of its values each have a count of at least minSplit. The idea
     * is to split into 2 disjoint sets that both contain at least 'minSplit'
     * instances, so at least twice 'minSplit' instances are needed overall.
     * minSplit is expected to be at least 1 (the corresponding assertion is
     * disabled).
     *
     * @param attrNum The number of the attribute to be checked.
     * @param counters Counters of the values for this attribute.
     * @param minSplit The minimum count a value must reach to be considered a branch.
     * @return TRUE if the attribute is ok to split, FALSE otherwise.
     */
    public static boolean ok_to_split(int attrNum, BagCounters counters, double minSplit) {
        //   DBG(ASSERT(minSplit >= 1));

        // If there aren't two values (or more), we clearly can't split.
        if (counters.attr_num_vals(attrNum) < 2)
            return false;

        // When minSplit is not (approximately) greater than 1 there is nothing
        // further to count: we know there are (at least) two values, each of
        // which must have at least one instance.
        if (!MLJ.approx_greater(minSplit, 1.0))
            return true;

        double[] ac = counters.attr_counts()[attrNum];
        if (ac == null) {
            Error.fatalErr("ID3Inducer::ok_to_split: No counters");
            return false;  // Can't get here.
        }

        // Count values that reach minSplit, stopping as soon as two are found.
        // The bound must be strictly below ac.length: the previous "<=" bound
        // read one element past the end whenever fewer than two values qualified.
        int numAboveMin = 0;
        for (int i = 0; numAboveMin < 2 && i < ac.length; i++)
            if (ac[i] >= minSplit)
                numAboveMin++;

        return (numAboveMin >= 2);
    }
    
    /** Returns the attribute number stored in this object (the number of the
     * attribute this split refers to, not a count of attributes).
     * @return The current value of attributeNum.
     */
    public int get_attr_num(){
        // Disabled check kept from the original: ASSERT(attributeNum != -1);
        return attributeNum;
    }
    
    /**
     * Gives the stored MDL penalty. Only meaningful while penalizing by MDL is
     * switched on; otherwise a fatal error is reported first.
     *
     * @return The value of mdlPenalty.
     */
    public double penalty() {
        final boolean penalizing = get_penalize_by_mdl();
        if (!penalizing) {
            Error.fatalErr("SplitAttr::penalty: MDL penalty not set");
        }
        return mdlPenalty;
    }
    
    /** Reports whether this split is flagged to be penalized by MDL.
     * NOTE(review): MDL presumably stands for minimum description length - confirm.
     * @return The current value of the penalizeByMDL flag.
     */
    public boolean get_penalize_by_mdl(){return penalizeByMDL;}
    
    /**
     * Computes the gain ratio: the un-normalized mutual information divided by
     * the split entropy. A fatal error is reported when the split entropy is
     * approximately zero, since the division would not be meaningful.
     *
     * @param penalize TRUE if penalization should occur, FALSE otherwise
     *                 (passed through to get_mutual_info).
     * @return The mutual information divided by the split entropy.
     */
    public double get_gain_ratio(boolean penalize) {
        // The mutual information is obtained before the split entropy, as in
        // the original statement order.
        final double infoGain = get_mutual_info(false, penalize);
        final double splitEntropy = get_split_entropy();
        if (MLJ.approx_equal(splitEntropy, 0.0)) {
            Error.fatalErr("SplitAttr::get_gain_ratio: split entropy (" + splitEntropy
                    + ") is too close to zero for division");
        }
        return infoGain / splitEntropy;
    }
    
    /**
     * Gives the threshold of a real threshold split. Calling this on any other
     * split type reports a fatal error that names the actual split type.
     *
     * @return The value of typeInfo.theThreshold.
     */
    public double threshold() {
        final boolean isThresholdSplit = (splitType == realThresholdSplit);
        if (!isThresholdSplit) {
            final String actualType = name_from_value(splitType, splitTypeEnum);
            Error.fatalErr("SplitAttr::threshold(): split is not realThreshold, it is "
                    + actualType);
        }
        return typeInfo.theThreshold;
    }
    
    /**
     * Looks up the display name for an enumeration value.
     *
     * @param value The index to look up.
     * @param enumNames The names, indexed by value.
     * @return enumNames[value] when value is a valid index, otherwise the empty string.
     */
    private String name_from_value(int value, String[] enumNames) {
        final boolean outOfRange = value < 0 || value >= enumNames.length;
        return outOfRange ? "" : enumNames[value];
    }
    
    /**
     * Copies the given SplitAttr into this SplitAttr. The type information is
     * copied into a fresh TypeInfo whose arrays are cloned, so the two objects
     * do not share the partition or thresholds arrays afterwards.
     *
     * @param original The SplitAttr to be copied.
     */
    public void copy(SplitAttr original) {
        // Capture the source type info before typeInfo is replaced, so that a
        // self-copy (original == this) still reads the old data.
        TypeInfo source = original.typeInfo;

        attributeNum = original.attributeNum;

        // The values must come from the original's type info. Previously they
        // were read back from the freshly created object, so nothing was
        // actually copied.
        typeInfo = new TypeInfo();
        if (source != null) {
            typeInfo.partition = (source.partition == null)
                    ? null : (int[]) source.partition.clone();
            typeInfo.theThreshold = source.theThreshold;
            typeInfo.thresholds = (source.thresholds == null)
                    ? null : (double[]) source.thresholds.clone();
        }

        mdlPenalty = original.mdlPenalty;
        existSplit = original.existSplit;
        splitType = original.splitType;
        penalizeByMDL = original.penalizeByMDL;
        realValuedSplitIndex = original.realValuedSplitIndex;
    }
    
    /**
     * Initializes the attribute data and the split-and-label distribution from
     * instance lists that were already split before this call. Each element of
     * instLists is one child of the split; its label counts fill one column of
     * the distribution matrix, which is indexed [label value][child].
     *
     * @param instLists The InstanceLists (one per child) to use in
     *                  initialization. Must contain at least one list, since
     *                  the number of label values is read from instLists[0].
     * @param attributeNumber The number of the attribute.
     */
    public void initialize(InstanceList[] instLists,int attributeNumber) {
        reset();
        set_attr_num(attributeNumber);

        int numLabels = instLists[0].get_schema().num_label_values();

        double[][] splitAndLabelDist = new double[numLabels][instLists.length];
        // The bound must be strictly below instLists.length: the previous "<="
        // bound indexed one element past the end of instLists.
        for (int child = 0; child < instLists.length; child++) {
            //MLJ.ASSERT(instLists[child] != null);
            // everything in a set has the same attr value, by definition
            // The counters do not depend on the label value, so fetch them once per child.
            BagCounters bc = instLists[child].counters();
            for (int labelValCount = 0; labelValCount < numLabels; labelValCount++) {
                splitAndLabelDist[labelValCount][child] = bc.label_count(labelValCount);
            }
        }
        set_split_and_label_dist(splitAndLabelDist);
    }
    
    /**
     * Computes the score of this split according to the current split score
     * criterion: mutualInfo and normalizedMutualInfo use the mutual information
     * (un-normalized and normalized respectively), gainRatio uses the gain
     * ratio, mutualInfoRatio uses the mutual information ratio and
     * externalScore uses the external score. Any other criterion is reported
     * as a fatal error.
     *
     * @return The score for this split.
     */
    public double score() {
        double result;
        switch (get_split_score_criterion()) {
            case mutualInfo:
                result = get_mutual_info(false, penalizeByMDL);
                break;
            case normalizedMutualInfo:
                result = get_mutual_info(true, penalizeByMDL);
                break;
            case gainRatio:
                result = get_gain_ratio(penalizeByMDL);
                break;
            case mutualInfoRatio:
                result = get_mutual_info_ratio();
                break;
            case externalScore:
                result = get_external_score();
                break;
            default:
                Error.fatalErr("SplitAttr::score: split score criterion of " +
                get_split_score_criterion() + " is out of range");
                result = 0;  // Can't get here.
                break;
        }
        return result;
    }
    
    /** Scores a split described by explicit distributions. This overload only
     * forwards all five arguments unchanged to the superclass implementation.
     * NOTE(review): per the original documentation the superclass computes the
     * score and updates a cache, which helps when scores are computed many
     * times for the same number of instances and entropy (for instance when
     * determining the best threshold for a split) - confirm against the superclass.
     * @param sAndLDist The split and label distribution.
     * @param sDist The split distribution.
     * @param lDist The label distribution.
     * @param entropy The entropy value.
     * @param totalWeight The total weight of instances.
     * @return The score returned by the superclass for this split.
     */
    public double score(double[][] sAndLDist, double[] sDist,
    double[] lDist, double entropy, double totalWeight){
        return super.score(sAndLDist, sDist, lDist, entropy,totalWeight);
    }
    
    /** Scores a split described by explicit distributions when no total weight
     * is supplied. Forwards to the five-argument superclass implementation,
     * passing Globals.UNDEFINED_REAL as the total weight.
     * NOTE(review): per the original documentation the superclass computes the
     * score and updates a cache for repeated computations - confirm against
     * the superclass.
     * @param sAndLDist The split and label distribution.
     * @param sDist The split distribution.
     * @param lDist The label distribution.
     * @param entropy The entropy value.
     * @return The score returned by the superclass for this split.
     */
    public double score(double[][] sAndLDist, double[] sDist,
    double[] lDist, double entropy){
        // The total weight is not known here, so the undefined marker is passed.
        return super.score(sAndLDist, sDist, lDist, entropy,
        Globals.UNDEFINED_REAL);
    }
    
    /** Scores a split described by explicit distributions when neither the
     * entropy nor the total weight is supplied. Forwards to the five-argument
     * superclass implementation, passing Globals.UNDEFINED_REAL for both.
     * NOTE(review): per the original documentation the superclass computes the
     * score and updates a cache for repeated computations - confirm against
     * the superclass.
     * @param sAndLDist The split and label distribution.
     * @param sDist The split distribution.
     * @param lDist The label distribution.
     * @return The score returned by the superclass for this split.
     */
    public double score(double[][] sAndLDist, double[] sDist, double[] lDist){
        // Entropy and total weight are both unknown here, so the undefined
        // marker is passed for each.
        return super.score(sAndLDist, sDist, lDist, Globals.UNDEFINED_REAL,
        Globals.UNDEFINED_REAL);
    }
    
}//End of class
splitattr.java - 源码说明

本页面展示了「基于数据挖掘的决策树改进算法和贝叶斯改进算法」中的 splitattr.java 源码文件，采用 Java 编程语言编写，共 453 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与改进算法相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?