// entropy.java: from the "java数据挖掘算法" (Java data mining algorithms) collection
        return H.value;
    }
    
    /** Computes conditional entropy of the label given attribute X. Following Ross,
     * conditional entropy is defined as:                                          <BR>
     *             H(Y|X) = sum_x H(Y|X=x)*P(X=x).                            <BR>
     *                    = sum_x (-sum_y p(Y=y|X=x)log p(Y=y|X=x)) * P(X=x)  <BR>
     *             now derive Pagallo & Haussler's formula                    <BR>
     *                    = -sum_{x,y} p(Y=y, X=x) log p(Y=y|X=x)             <BR>
     *             Here we estimate p(Y=y, X=x) by counting, but if we
     *               have priors on the probabilities of the labels, then     <BR>
     *               p(x,y) = p(x|y)*p(y) = count(x,y)/s(y)* prior(y)         <BR>
     *               and p(x) = sum_y prior(y) count(x,y)/s(y).               <BR>
     *
     *             By counting we get the following:                          <BR>
     *             -sum_{x,y} num(Y=y,X=x)/num-rec * log num(Y=y,X=x)/num(X=x)
     *
     * @param instList The instance list over which conditional entropy is calculated.
     * @param attrNumX The number of the attribute for which conditional entropy is requested.
     * @return The conditional entropy.
     */
    public static double cond_entropy(InstanceList instList, int attrNumX) {
        return cond_entropy(instList.counters().value_counts()[attrNumX],
        instList.counters().attr_counts()[attrNumX],
        instList.total_weight());
    }
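    
    // Illustrative sketch (hypothetical helper, not part of the original class):
    //   the counting formula in the Javadoc above, written independently of
    //   InstanceList and BagCounters.  counts[y][x] holds the weight of
    //   instances with label y and attribute value x; logs are base 2 as in
    //   log_bin.
    //       H(Y|X) = -sum_{x,y} (num(Y=y,X=x)/total) * log2(num(Y=y,X=x)/num(X=x))
    static double condEntropyFromCounts(double[][] counts, double totalWeight) {
        double h = 0.0;
        for (int x = 0; x < counts[0].length; x++) {
            double numX = 0.0;                         // num(X=x)
            for (int y = 0; y < counts.length; y++)
                numX += counts[y][x];
            if (numX <= 0.0)
                continue;                              // empty split contributes nothing
            for (int y = 0; y < counts.length; y++) {
                double numXY = counts[y][x];           // num(Y=y,X=x)
                if (numXY > 0.0)                       // beware of log(0)
                    h -= (numXY / totalWeight)
                         * (Math.log(numXY / numX) / Math.log(2.0));
            }
        }
        return h;
    }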
    
    /** Compute the mutual information which is defined as I(Y;X) = H(Y) - H(Y|X). Some
     * researchers like Quinlan call this "gain." This is the amount of information
     * gained about the category value of an instance after we test the variable X.
     *
     * @param ent Entropy value.
     * @param splitAndLabelDist Distributions over each split and label pair.
     * @param splitDist The distribution over splits.
     * @param totalWeight Total weight of the Instances trained on.
     * @return The mutual information value.
     */
    public static double mutual_info(double ent,double[][] splitAndLabelDist,
    double[] splitDist, double totalWeight) {
        double condEntropy = Entropy.cond_entropy(splitAndLabelDist, splitDist,
        totalWeight);
        DoubleRef mi = new DoubleRef(ent - condEntropy);
        // Mutual information should never be negative; the following
        //   accounts for possible numerical representation errors.
        MLJ.clamp_above(mi, 0, "mutual_info: negative values not allowed");
        return mi.value;
    }
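    
    // Hand-worked usage sketch of the overload above (hypothetical counts, not
    //   from the source): 8 instances, 4 labelled '+' and 4 labelled '-',
    //   perfectly separated by a binary attribute X, so the gain should equal
    //   the full label entropy of 1 bit.
    static double exampleMutualInfoSketch() {
        double[][] splitAndLabelDist = { {4.0, 0.0},     // label '+': X=0, X=1
                                         {0.0, 4.0} };   // label '-': X=0, X=1
        double[] splitDist = {4.0, 4.0};                 // 4 instances per value of X
        double labelEntropy = 1.0;                       // H(Y) for a 4/8 vs 4/8 split
        return mutual_info(labelEntropy, splitAndLabelDist,
                           splitDist, 8.0);              // expected: 1.0
    }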
    
    /** Compute the mutual information which is defined as I(Y;X) = H(Y) - H(Y|X). Some
     * researchers like Quinlan call this "gain." This is the amount of information
     * gained about the category value of an instance after we test the variable X.
     * @param instList The instance list over which mutual information is calculated.
     * @param attrNumX The number of the attribute for which mutual information is requested.
     * @return The mutual information value.
     */
    public static double mutual_info(InstanceList instList, int attrNumX) {
        if (instList.counters().attr_counts()[attrNumX] == null)
            Error.fatalErr("entropy::mutual_info: attribute "+attrNumX+
            " is not nominal (counts array is NULL)");
        
        double ent = entropy(instList.counters().label_counts(),
        instList.total_weight());
        return mutual_info(instList, ent, attrNumX);
    }
    
    /** Compute the mutual information which is defined as I(Y;X) = H(Y) - H(Y|X). Some
     * researchers like Quinlan call this "gain." This is the amount of information
     * gained about the category value of an instance after we test the variable X.
     * @param instList The instance list over which mutual information is calculated.
     * @param ent Entropy value.
     * @param attrNumX The number of the attribute for which mutual information is requested.
     * @return The mutual information value.
     */
    public static double mutual_info(InstanceList instList,
    double ent, int attrNumX) {
        if (instList.counters().attr_counts()[attrNumX] == null)
            Error.fatalErr("entropy::mutual_info: attribute "+attrNumX+
            " is not nominal (counts array is NULL)");
        
        return mutual_info(ent,
        instList.counters().value_counts()[attrNumX],
        instList.counters().attr_counts()[attrNumX],
        instList.total_weight());
    }
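    
    // Usage sketch (hypothetical helper, not part of the original class):
    //   score every attribute of an all-nominal InstanceList by information
    //   gain and return the index of the best one, computing the label
    //   entropy once and reusing it via the overload above.
    static int exampleBestGainAttribute(InstanceList instList) {
        double labelEnt = entropy(instList.counters().label_counts(),
                                  instList.total_weight());
        int bestAttr = -1;
        double bestGain = -1.0;
        for (int attr = 0; attr < instList.get_schema().num_attr(); attr++) {
            double gain = mutual_info(instList, labelEnt, attr);
            if (gain > bestGain) {
                bestGain = gain;
                bestAttr = attr;
            }
        }
        return bestAttr;
    }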
    
    
    /** Builds the distribution arrays necessary for calculating conditional entropy for
     * nominal attributes. All of the splitAndLabelDist arrays of the Instance Lists are
     * concatenated. The unaccounted instances allow the list of nodes to be partial,
     * i.e., not to contain all instances. The split will be created so that the
     * unaccounted instances are in an extra split with the same label, so that the
     * entropy will be decreased correctly as if they were in a pure node.
     *
     * @param currentLevel The list of instances in the current partition for which a split is being
     * determined.
     * @param attrNum The number of the attribute for which the split distribution is built.
     * @return The distribution over splits.
     */
    static public double[] build_nominal_attr_split_dist(InstanceList[] currentLevel,
    int attrNum) {
        return build_nominal_attr_split_dist(currentLevel,attrNum,0);
    }
    
    /** Builds the distribution arrays necessary for calculating conditional entropy for
     * nominal attributes. All of the splitAndLabelDist arrays of the Instance Lists are
     * concatenated. The unaccounted instances allow the list of nodes to be partial,
     * i.e., not to contain all instances. The split will be created so that the
     * unaccounted instances are in an extra split with the same label, so that the
     * entropy will be decreased correctly as if they were in a pure node.
     *
     * @param currentLevel The list of instances in the current partition for which a split is being
     * determined.
     * @param attrNum The number of the attribute for which the split distribution is built.
     * @param unaccountedWeight Weight that is not accounted for in the list of instances.
     * @return The distribution over splits.
     */
    static public double[] build_nominal_attr_split_dist(InstanceList[] currentLevel,
    int attrNum, double unaccountedWeight) {
        MLJ.ASSERT(currentLevel[0]!= null,"Entropy.build_nominal_attr_split_dist:currentLevel[0]== null.");
        Schema schema = currentLevel[0].get_schema();
        int numInstLists = currentLevel.length;
        int numAttrValues = schema.num_attr_values(attrNum);
        
        MLJ.ASSERT(numInstLists > 0,"Entropy.build_nominal_attr_split_dist:numInstLists <= 0");
        MLJ.ASSERT(numAttrValues > 0,"Entropy.build_nominal_attr_split_dist:numAttrValues <= 0");
        
        int unaccnt_wght_col = (unaccountedWeight > 0)? 1 : 0;
        double[] splitDist = new double[numInstLists * (numAttrValues + 1) + unaccnt_wght_col];
        int countSplitDist = Globals.UNKNOWN_CATEGORY_VAL;
        
        for (int instListCount = 0; instListCount < numInstLists; instListCount++) {
            MLJ.ASSERT(currentLevel[instListCount] != null,"Entropy.build_nominal_attr_split_dist:currentLevel[instListCount] == null");
            // Copy the unknown-value slot plus each declared value, i.e.
            //   numAttrValues + 1 entries per instance list, matching the
            //   sizing of splitDist above.
            for (int attrCount = Globals.UNKNOWN_CATEGORY_VAL; attrCount <= numAttrValues;
            attrCount++, countSplitDist++) {
                BagCounters bc = currentLevel[instListCount].counters();
                splitDist[countSplitDist] = bc.attr_counts()[attrNum][attrCount];
            }
        }
        if (unaccountedWeight > 0) {
            MLJ.ASSERT(countSplitDist == splitDist.length - 1,"Entropy.build_nominal_attr_split_dist:countSplitDist != splitDist.length - 1");
            splitDist[countSplitDist] = unaccountedWeight;
        }
        return splitDist;
    }
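    
    // Layout sketch (hypothetical numbers) of the splitDist array returned
    //   above for two instance lists, a three-valued attribute, and some
    //   unaccounted weight; assuming Globals.UNKNOWN_CATEGORY_VAL == 0, each
    //   list contributes an unknown-value slot followed by its declared values:
    //
    //     double[] splitDist = {
    //         0.0, 5.0, 2.0, 1.0,   // currentLevel[0]: unknown, v1, v2, v3
    //         1.0, 0.0, 4.0, 3.0,   // currentLevel[1]: unknown, v1, v2, v3
    //         6.0                   // extra split holding the unaccounted weight
    //     };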
    
    
    /** Compute the J-measure. See papers by Goodman and Smyth, such as Data
     * Engineering, v.4, no.4, pp.301-316, 1992. The J-measure summed over all
     * values of x gives info-gain. The J-measure is:                              <BR>
     *   J(x) = sum_y p(x,y) log( p(x,y) / (p(x)p(y)) )                            <BR>
     *        = 1/n * sum_y n(x,y) log( n*n(x,y) / (n(x)n(y)) )                    <BR>
     * Used in t_entropy.java.
     *
     * @return The j-measure value.
     * @param splitAndLabelDist Distributions over each split and label pair.
     * @param splitDist The distribution over splits.
     * @param labelCounts Counts of each label found in the data.
     * @param x The x value for the j-measure equation.
     * @param totalWeight Total weight of all data.
     */
    
    public static double j_measure(double[][] splitAndLabelDist,
    double[] splitDist, double[] labelCounts,
    int x, double totalWeight) {
        MLJ.verify_strictly_greater(totalWeight, 0, "j_measure: totalWeight is "+
        "too small");
        
        DoubleRef j = new DoubleRef();
        for (int y = 0; y < splitAndLabelDist.length; y++) {
            double num_xy = splitAndLabelDist[y][x];
            double num_x  = splitDist[x];
            double num_y  = labelCounts[y];
            if (!MLJ.approx_equal(num_xy, 0.0)) { // beware of log(0)
                if (Globals.DBG) MLJ.ASSERT((num_x > 0 && num_y > 0),"Entropy.j_measure: num_x <= 0 || num_y <= 0");
                j.value += num_xy *
                log_bin(totalWeight*(num_xy)/(num_x * num_y));
            }
        }
        j.value /= totalWeight; // We know this won't be division by zero.
        
        // Allow for possible numerical representation errors.
        MLJ.clamp_above(j, 0, "j_measure: negative j-measure not allowed");
        
        return j.value;
    }
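    
    // Illustrative sketch (hypothetical helper, not part of the original
    //   class): the J-measure formula above written against plain count
    //   arrays, with counts[y][x] = n(X=x, Y=y) and base-2 logs as in log_bin.
    //       J(x) = 1/n * sum_y n(x,y) * log2( n * n(x,y) / (n(x) * n(y)) )
    static double jMeasureFromCounts(double[][] counts, double[] valueCounts,
                                     double[] labelCounts, int x,
                                     double totalWeight) {
        double j = 0.0;
        for (int y = 0; y < counts.length; y++) {
            double nXY = counts[y][x];
            if (nXY > 0.0)                             // beware of log(0)
                j += nXY * (Math.log(totalWeight * nXY
                                     / (valueCounts[x] * labelCounts[y]))
                            / Math.log(2.0));
        }
        return j / totalWeight;
    }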
    
    /** Compute the J-measure. See papers by Goodman and Smyth, such as Data
     * Engineering, v.4, no.4, pp.301-316, 1992. The J-measure summed over all
     * values of x gives info-gain. The J-measure is:                              <BR>
     *   J(x) = sum_y p(x,y) log( p(x,y) / (p(x)p(y)) )                            <BR>
     *        = 1/n * sum_y n(x,y) log( n*n(x,y) / (n(x)n(y)) )                    <BR>
     * Used in t_entropy.java.
     *
     * @param instList The list of Instances over which a j measure is to be
     * calculated.
     * @param attrNumX The number of the attribute for which the j-measure is
     * calculated.
     * @param x The x value for the j-measure equation.
     * @return The j-measure value.
     */
    public static double j_measure(InstanceList instList, int attrNumX, int x) {
        return j_measure(instList.counters().value_counts()[attrNumX],
        instList.counters().attr_counts()[attrNumX],
        instList.counters().label_counts(), x,
        instList.total_weight());
    }
    
    
    /** Builds columns of real values and their associated label values. Invokes
     * InstanceList's transpose function to provide a single column for the passed
     * attribute number, sorts it, and returns the columns to the caller. The second
     * calling argument, if set to an attribute index, results in a single column
     * being transposed and sorted. When set to UNDEFINED_INT, all columns are
     * so treated.
     * @param instList The instance list containing the instance values for the attribute.
     * @param attrNum The number of the attribute for which the real and label column is
     * requested.
     * @return The columns of real values and their associated labels, organized by attribute.
     */
    public static RealAndLabelColumn[] build_real_and_label_columns(
    InstanceList instList, int attrNum) {
        // Mark the attribute(s) to transpose: if a single attribute was
        //   requested, initialize every flag to FALSE and set only that entry
        //   to TRUE; if attrNum is UNDEFINED_INT, mark every attribute.
        boolean initializer = (attrNum == Globals.UNDEFINED_INT);
        boolean[] transp = new boolean[instList.get_schema().num_attr()];
        Arrays.fill(transp, initializer);
        if (attrNum != Globals.UNDEFINED_INT)
            transp[attrNum] = true;
        RealAndLabelColumn[] columns = instList.transpose(transp);
        
        // If a particular column was requested, check that it was transposed.
        if (attrNum != Globals.UNDEFINED_INT) {
            if (columns[attrNum] != null)
                columns[attrNum].sort();
            else
                Error.fatalErr("build_real_and_label_columns: for attribute " +attrNum
                +", no column was built to sort");
        } else {
            // All columns were requested: sort every column that was built.
            for (int x = 0; x < instList.get_schema().num_attr(); x++)
                if (columns[x] != null)
                    columns[x].sort();
        }
        return columns;
    }
    
    /** Builds a column of real values and their associated label values for the given
     * attribute. Invokes InstanceList's transpose function to provide a single column
     * for the passed attribute number, sorts it, and returns that column to the caller.
     * Unlike build_real_and_label_columns, the attribute number is expected to be a
     * valid attribute index rather than UNDEFINED_INT.
     * @param instList The instance list containing the instance values for the attribute.
     * @param attrNum The number of the attribute for which the real and label column is
     * requested.
     * @return The column of real values and their associated labels.
     */
    public static RealAndLabelColumn build_real_and_label_column(InstanceList
    instList, int attrNum) {
        RealAndLabelColumn[] columns = build_real_and_label_columns(instList, attrNum);
        // Return only the sorted column for attrNum to the caller.  Keep a
        //   reference to it, clear its slot in the array, and let the rest of
        //   the array become garbage.
        RealAndLabelColumn sortedColumn = columns[attrNum];
        columns[attrNum] = null;
        return sortedColumn;
    }
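    
    // Usage sketch (hypothetical helper; attribute index 0 is an arbitrary
    //   example and is assumed to be a real-valued attribute): fetch one
    //   sorted column, or transpose and sort every column at once by passing
    //   Globals.UNDEFINED_INT.
    static void exampleColumnUsage(InstanceList instList) {
        // Single sorted column for attribute 0.
        RealAndLabelColumn one = build_real_and_label_column(instList, 0);
        // All columns transposed and sorted; entries may be null for
        //   attributes that produced no column.
        RealAndLabelColumn[] all =
            build_real_and_label_columns(instList, Globals.UNDEFINED_INT);
    }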
    
}
