⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 splitscore.java

📁 java数据挖掘算法
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
     *
     * @return TRUE if there is a splitAndLabel distribution, FALSE otherwise.
     */
    public boolean has_distribution() {
        // Delegate to the one-argument overload with fatalOnFalse set, so a
        // missing distribution is reported through Error.err.
        return has_distribution(true);
    }
    
    /** Reports whether a splitAndLabel distribution is currently held.
     *
     * @param fatalOnFalse TRUE if Error.err is to be invoked when there is no
     * splitAndLabel distribution.
     * @return TRUE if splitAndLabelDist is non-null, FALSE otherwise.
     */
    public boolean has_distribution(boolean fatalOnFalse) {
        boolean present = (splitAndLabelDist != null);
        if (!present && fatalOnFalse) {
            Error.err("SplitScore::has_distribution: no distribution-->"+
            "fatal_error");
        }
        return present;
    }
    
    /** Supplies the conditional entropy, computing and caching it on first use.
     * The value is obtained from Entropy.cond_entropy using the splitAndLabel
     * distribution, the split distribution and the total weight.
     * @return The condEntropy stored in the cache.
     */
    public double get_cond_entropy() {
        // Bring the cache members in line with the validCache flag first.
        valid_cache();
        // Nothing to compute when a value is already cached, or when there is
        // no distribution (has_distribution(true) reports that case itself).
        if (cache.condEntropy != Globals.UNDEFINED_REAL || !has_distribution(true)) {
            return cache.condEntropy;
        }
        double[][] jointDist = get_split_and_label_dist();
        cache.condEntropy =
                Entropy.cond_entropy(jointDist, get_split_dist(), total_weight());
        return cache.condEntropy;
    }
    
    
    /** Brings the cache members in line with the validCache flag. When the
     * flag is not set, or no splitAndLabel distribution is present, every
     * cached array is dropped, every cached number is reset to
     * Globals.UNDEFINED_REAL, and the flag is set.
     *
     * @return TRUE if the cache was already valid and left untouched, FALSE if
     * it had to be reset.
     */
    private boolean valid_cache() {
        boolean alreadyValid = validCache && has_distribution(false);
        if (!alreadyValid) {
            // Drop the ancillary arrays.
            cache.splitDist = null;
            cache.labelDist = null;
            // Mark every cached number as "not yet computed".
            cache.totalWeight = Globals.UNDEFINED_REAL;
            cache.mutualInfo = Globals.UNDEFINED_REAL;
            cache.entropy = Globals.UNDEFINED_REAL;
            cache.condEntropy = Globals.UNDEFINED_REAL;
            cache.gainRatio = Globals.UNDEFINED_REAL;
            cache.splitEntropy = Globals.UNDEFINED_REAL;
            // The (now empty) cache is consistent again.
            validCache = true;
        }
        return alreadyValid;
    }
    
    /** Returns the total weight, computing and caching it on first use. The
     * weight is the sum of the cached split distribution when one is present,
     * and the sum of the label distribution otherwise.
     *
     * @return The total weight; Globals.UNDEFINED_REAL is left in the cache if
     * no splitAndLabel distribution has been set and Error.err returns.
     */
    public double total_weight() {
        valid_cache(); // Percolate validCache to the cache members.
        if (cache.totalWeight != Globals.UNDEFINED_REAL)
            return cache.totalWeight;
        // Ask quietly (fatalOnFalse == false), as get_label_dist does, so that
        // the method-specific message below is the one that gets reported.
        if (!has_distribution(false)) {
            Error.err("SplitScore::total_weight: splitAndLabelDist not yet "+
            "set-->fatal_error");
        } else if (cache.splitDist != null) {
            cache.totalWeight = MLJArray.sum(cache.splitDist);
        } else {
            // get_label_dist() fills cache.labelDist as a side effect.
            get_label_dist();
            cache.totalWeight = MLJArray.sum(cache.labelDist);
        }
        return cache.totalWeight;
    }
    
    /** Supplies the label distribution, deriving it from the splitAndLabel
     * distribution on first use and caching the result. The derived array has
     * one entry per first-dimension element of splitAndLabelDist and is
     * filled by Matrix.sum_rows.
     * @return The label distribution held in the cache.
     */
    public double[] get_label_dist() {
        valid_cache(); // Percolate validCache to the cache members.
        if (cache.labelDist == null) {
            if (has_distribution(false)) {
                double[] rowTotals = new double[splitAndLabelDist.length];
                // Publish the array in the cache before it is filled.
                cache.labelDist = rowTotals;
                Matrix.sum_rows(rowTotals, splitAndLabelDist);
            } else {
                Error.err("SplitScore::get_label_dist: splitAndLabelDist has not "+
                "been set-->fatal_error");
            }
        }
        return cache.labelDist;
    }
    
    /** Gives access to the splitAndLabel distribution array itself (no copy
     * is made). Error.err is invoked when the array has not been set.
     *
     * @return The splitAndLabel distribution array.
     */
    public double[][] get_split_and_label_dist() {
        valid_cache();
        boolean allocated = (splitAndLabelDist != null);
        if (!allocated) {
            Error.err("SplitScore::get_split_and_label_dist: Array has not "+
            "been allocated-->fatal_error");
        }
        return splitAndLabelDist;
    }
    
    /** Supplies the entropy, computing it from the label distribution via
     * Entropy.entropy on first use and caching the result.
     *
     * @return The entropy stored in the cache.
     */
    public double get_entropy() {
        valid_cache(); // Percolate validCache to the cache members.
        // Already cached, or no distribution to compute from: hand back
        // whatever the cache holds.
        if (cache.entropy != Globals.UNDEFINED_REAL || !has_distribution(true)) {
            return cache.entropy;
        }
        cache.entropy = Entropy.entropy(get_label_dist());
        return cache.entropy;
    }
    
    
    /** Verifies that a value exceeds its lower bound by more than
     * MLJ.realEpsilon, and reports a fatal error through Error.err otherwise.
     *
     * @param lhs The value being checked.
     * @param rhs The lower bound the value must exceed.
     * @param additionalErrMsg Text placed in front of the error message.
     */
    private void verify_strictly_greater(double lhs, double rhs,
    String additionalErrMsg) {//basicCore class function
        if (lhs <= (rhs + (MLJ.realEpsilon))) {
            // Report the offending value as well as the bound so the failure
            // can be diagnosed from the message alone.
            Error.err(additionalErrMsg + "\n verify_strictly_greater(Real): "+
            "variable (" + lhs + ") is not at least " + MLJ.realEpsilon +
            " greater than its lower bound (" + rhs + ")-->fatal_error");
        }
    }
    
    /** Computes the score according to the configured split score criterion:
     * mutualInfo and normalizedMutualInfo use get_mutual_info (without and
     * with normalization respectively), gainRatio uses get_gain_ratio,
     * mutualInfoRatio uses get_mutual_info_ratio, and externalScore uses
     * get_external_score. Any other criterion is reported through Error.err.
     *
     * @return The score for the split, or 0 if the criterion is out of range
     * and Error.err returns.
     */
    public double score() {
        byte criterion = get_split_score_criterion();
        if (criterion == mutualInfo) {
            return get_mutual_info(false);
        }
        if (criterion == normalizedMutualInfo) {
            return get_mutual_info(true);
        }
        if (criterion == gainRatio) {
            return get_gain_ratio();
        }
        if (criterion == mutualInfoRatio) {
            return get_mutual_info_ratio();
        }
        if (criterion == externalScore) {
            return get_external_score();
        }
        // None of the known criteria matched.
        Error.err("SplitScore::score: split score criterion of " +
        criterion +
        " is out of range-->fatal_error");
        return 0;
    }
    
    /** Computes the score for the supplied distributions, leaving this
     * object's own distribution, cache contents, validity flag and external
     * score as they were before the call. Intended for cases where scores are
     * computed many times for the same number of instances and entropy, for
     * instance when determining the best threshold for a split.
     * @param sAndLDist The split and label distribution.
     * @param sDist The split distribution; may be null.
     * @param lDist The label distribution; may be null.
     * @param passedEntropy The entropy value for this split, or
     * Globals.UNDEFINED_REAL if it is to be computed.
     * @param passedWeight The weight of instances for this split.
     * @return The score for this split distribution.
     * @see Entropy#find_best_threshold
     */
    public double score(double[][] sAndLDist, double[] sDist,
    double[] lDist, double passedEntropy,
    double passedWeight) {
        // Save the current state so it can be restored prior to returning.
        double theOldExternalScore = theExternalScore;
        boolean oldValidCache = validCache;
        double[][] oldSplitAndLabelDist = splitAndLabelDist;
        CacheStruct oldCache = cache;
        // oldCache is only a second reference to the same object, so the
        // cache contents must be saved member by member. The dist arrays are
        // saved by reference, not copied.
        double[] oldSplitDist = cache.splitDist;
        double[] oldLabelDist = cache.labelDist;
        double oldTotalWeight = cache.totalWeight;
        double oldMutualInfo = cache.mutualInfo;
        double oldEntropy = cache.entropy;
        double oldCondEntropy = cache.condEntropy;
        double oldGainRatio = cache.gainRatio;
        double oldSplitEntropy = cache.splitEntropy;

        splitAndLabelDist = null;
        cache.splitDist = null;
        cache.labelDist = null;

        // Hand the caller's distributions over for the duration of the call.
        set_split_and_label_dist(sAndLDist);
        if (sDist != null) {
            set_split_dist(sDist);
        }
        if (lDist != null) {
            // The label distribution goes to the label slot; it is taken back
            // with release_label_dist() below.
            set_label_dist(lDist);
        }
        valid_cache();
        if (passedEntropy != Globals.UNDEFINED_REAL) {
            cache.entropy = passedEntropy;
        }
        cache.totalWeight = passedWeight;

        double theScore = score();

        // Release the caller's distributions again.
        if (sDist != null) {
            release_split_dist();
        }
        if (lDist != null) {
            release_label_dist();
        }
        release_split_and_label_dist();

        // Restore
        cache = oldCache;
        cache.splitDist = oldSplitDist;
        cache.labelDist = oldLabelDist;
        cache.totalWeight = oldTotalWeight;
        cache.mutualInfo = oldMutualInfo;
        cache.entropy = oldEntropy;
        cache.condEntropy = oldCondEntropy;
        cache.gainRatio = oldGainRatio;
        cache.splitEntropy = oldSplitEntropy;
        splitAndLabelDist = oldSplitAndLabelDist;
        validCache = oldValidCache;
        theExternalScore = theOldExternalScore;

        return theScore;
    }
    
    
    
    /** Reports which criterion is used when scoring splits.
     * @return The scoring criterion.
     * @see #mutualInfo
     * @see #normalizedMutualInfo
     * @see #gainRatio
     * @see #mutualInfoRatio
     * @see #externalScore
     */
    public byte get_split_score_criterion() {
        return splitScoreCriterion;
    }
    
    /** Supplies the score that was set externally. Error.err is invoked when
     * no score has been set, or when a score is set but the splitAndLabel
     * distribution is no longer present.
     *
     * @return The externally set score value.
     */
    public double get_external_score() {
        boolean scoreSet = (theExternalScore != Globals.UNDEFINED_REAL);
        if (!scoreSet) {
            Error.err("SplitScore::get_external_score: no score set-->"+
            "fatal_error");
        } else if (splitAndLabelDist == null) {
            // A score without its distribution means the score was never
            // invalidated when the distribution went away.
            Error.err("SplitScore::get_external_score:  splitAndLabelDist was "+
            "deleted without theExternalScore being invalidated-->fatal_error");
        }
        return theExternalScore;
    }
    
    /** Computes the mutual information ratio: the (non-normalized) mutual
     * information divided by the entropy. The entropy is first checked with
     * MLJ.verify_strictly_greater against 0, and the resulting ratio is passed
     * through MLJ.clamp_to_range with the bounds 0 and 1. The getters used
     * here update the cache.
     *
     * @return Mutual information ratio.
     */
    public double get_mutual_info_ratio() {
        double entropy = get_entropy();
        // The entropy is the divisor, so it has to be safely above zero.
        MLJ.verify_strictly_greater(entropy, 0,
        "SplitScore::get_mutual_info_ratio: Need to divide by entropy, which "+
        "is too small");
        double mutualInfo = get_mutual_info(false);
        DoubleRef clamped = new DoubleRef(mutualInfo / entropy);
        MLJ.clamp_to_range(clamped, 0, 1, "SplitScore::get_mutual_info_ratio: "+
        "ratio not in required range [0, 1]");
        return clamped.value;
    }
    
    /** Determines, and returns, cache.gainRatio. This method updates the cache.
     *
     * @return The gainRatio stored in the cache.
     */
    public double get_gain_ratio() {
        valid_cache(); // Percolate validCache to the cache members.
        double gain = cache.gainRatio;
        if (cache.gainRatio == Globals.UNDEFINED_REAL && has_distribution(true)) {
            double numerator = get_mutual_info(false);
            double divisor = get_split_entropy();
            // If the divisor is zero, we abort.
            if (MLJ.approx_equal(divisor, 0.0))
                Error.err("SplitScore::get_gain_ratio: split entropy (" + divisor +
                ") is too close to zero for division. Split and Label Dist is: " +

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -