📄 statistics.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
     *     * If these probabilities are independent, the expected values of     * the matrix cells are:     *     * <blockquote><code>     *  E(both)= totalCount * P(One) * P(Two)     *  <br>     *  E(oneOnly) = totalCount * P(One) * (1-P(Two))     *  <br>     *  E(twoOnly) = totalCount * (1-P(One)) * P(Two)     *  <br>     *  E(neither) = totalCount * (1-P(One)) * (1-P(Two))     * </code></blockquote>     *     * These are used to derive the independence test statistic, which     * is the square differences between observed and expected values     * under the independence assumption, normalized by the expected     * values:     *     * <blockquote><code>     * C<sub><sub>2</sub></sub>     *  = (both - E(both))<sup><sup>2</sup></sup> / E(both)     * <br> &nbsp; &nbsp; &nbsp; &nbsp;     * + (oneOnly - E(oneOnly))<sup><sup>2</sup></sup> / E(oneOnly)     * <br> &nbsp; &nbsp; &nbsp; &nbsp;     * + (twoOnly - E(twoOnly))<sup><sup>2</sup></sup> / E(twoOnly)     * <br> &nbsp; &nbsp; &nbsp; &nbsp;     * + (neither - E(neither))<sup><sup>2</sup></sup> / E(neither)     * </code></blockquote>     *     * Unlike the higher dimensional case, this statistic applies as a     * hypothesis test only in the case when all expected values are     * at least 10.     * @param both Count of samples of both outcomes.     * @param oneOnly Count of samples with the first and not the     * second outcome.     * @param twoOnly Count of samples with the second and not the     * first outcome.     * @param neither Count of samples with with neither outcome.     * @throws IllegalArgumentException If any of the arguments are not     * non-negative finite numbers.     * @return Pearson's C<sub><sub>2</sub></sub> goodness-of-fit     * statistic for independence over the specified sample counts.     
*/    public static double chiSquaredIndependence(double both, double oneOnly,                                                double twoOnly,                                                double neither) {        assertNonNegative("both",both);        assertNonNegative("oneOnly",oneOnly);        assertNonNegative("twoOnly",twoOnly);        assertNonNegative("neither",neither);        double n = both + oneOnly + twoOnly + neither;        double p1 = (both + oneOnly) / n;        double p2 = (both + twoOnly) / n;        double eBoth = n * p1 * p2;        double eOneOnly = n * p1 * (1.0 - p2);        double eTwoOnly = n * (1.0 - p1) * p2;        double eNeither = n * (1.0 - p1) * (1.0 - p2);        return csTerm(both,eBoth)            + csTerm(oneOnly,eOneOnly)            + csTerm(twoOnly,eTwoOnly)            + csTerm(neither,eNeither);    }    /**     * Returns a two-element array of lineary regression coefficients     * for the specified x and y values.  The coefficients returned,     * <code>{ &beta;0, &beta;1 }</code>, define a linear function:     *     * <blockquote><code>     * f(x) = &beta;1 * x + &beta;0     * </code></blockquote>     *     * The coefficients returned produce the linear function <code>f(x)</code>     * with the smallest squared error:     *     * <blockquote><code>     * sqErr(f,xs,ys) =     * <big><big>&Sigma;</big></big><sub><sub>i</sub></sub>     * (f(xs[i]) - ys[i])<sup>2</sup>     * </code></blockquote>     *     * where all sums are for <code>0 &lt;< i &lt xs.length</code>.     
*     * The funciton requires only a single pass through the two     * arrays, with <code>&beta;0</code> and <code>&beta;1</code>     * given by:     *     * <blockquote><pre>     * &beta;1 = n * <big><big>&Sigma;</big></big><sub>i</sub> x[i] * y[i]  -  (<big><big>&Sigma;</big></big><sub>i</sub> x[i])(<big><big>&Sigma;</big></big><sub>i</sub> y[i])     *      ------------------------------------------     *          n * <big><big>&Sigma;</big></big><sub>i</sub> x[i]*x[i]  -  (<big><big>&Sigma;</big></big><sub>i</sub> x[i])<sup>2</sup>     * </pre></blockquote>     *     * <blockquote><pre>     * &beta;0 = <big><big>&Sigma;</big></big><sub>i</sub> y[i] - &beta;1 <big><big>&Sigma;</big></big><sub>i</sub> x[i]     *      ---------------------     *              n     * </pre></blockquote>     *     * where <code>n = xs.length = ys.length</code>.     *     * @param xs Array of x values.     * @param ys Parallel array of y values.     * @return Pair of regression coefficients.     * @throws IllegalArgumentException If the arrays are of length     * less than 2, or if the arrays are not of the same length.     */    public static double[] linearRegression(double[] xs, double[] ys) {        if (xs.length != ys.length) {            String msg = "Require parallel arrays of x and y values."                + " Found xs.length=" + xs.length                + " ys.length=" + ys.length;            throw new IllegalArgumentException(msg);        }        if (xs.length < 2) {            String msg = "Require arrays of length >= 2."                
+ " Found xs.length=" + xs.length;            throw new IllegalArgumentException(msg);        }        double n = xs.length;        double xSum = 0.0;        double ySum = 0.0;        double xySum = 0.0;        double xxSum = 0.0;        for (int i = 0; i < xs.length; ++i) {            double x = xs[i];            double y = ys[i];            xSum += x;            ySum += y;            xxSum += x * x;            xySum += x * y;        }        double denominator = n * xxSum - xSum * xSum;        if (denominator == 0.0) {            String msg = "Ill formed input. Denominator for beta1 is zero."                + " Most likely cause is fewer than 2 distinct inputs.";            throw new IllegalArgumentException(msg);        }        double beta1 = (n * xySum - xSum * ySum) / denominator;        double beta0 = (ySum - beta1 * xSum) / n;        return new double[] { beta0, beta1 };    }    /**     * Returns a two-element array of logistic regression coefficients     * for the specified x and y values.  The coefficients returned,     * <code>{ &beta;0, &beta;1 }</code>, define the logistic function:     *     * <blockquote><pre>     *                L     * f(x) =  ---------------     *         1 + <i>e</i> <sup><sup>&beta;1 * x + &beta;0</sup></sup>     * </pre></blockquote>     *     * with the minimum squared error.  See {@link     * #linearRegression(double[],double[])} for a definition of     * squared error.  This function takes real values in the the open     * interval <code>(0, L)</code>.     *     * <p>Logistic regression coefficients are computed using     * linear regression, after transforming the y values.  This     * is possible because of the following linear relation:     *     * <blockquote><pre>     * log ((L - y) / y) = &beta;1 * x + &beta;0     * </pre></blockquote>     *     * @param xs Array of x values.     * @param ys Array of y values.     * @param maxValue Maximum value of function.     
* @return Two-element array <code>{ &beta;0, &beta;1 }</code> of
     * logistic regression coefficients.  The y values themselves are
     * not range checked here; a y value outside <code>(0, L)</code>
     * produces a non-finite transformed value that is passed on to
     * the linear regression.
     * @throws IllegalArgumentException If the maximum value is not
     * positive and finite, if the arrays are of length less than 2,
     * or if the arrays are not of the same length.
     */
    public static double[] logisticRegression(double[] xs, double[] ys,
                                              double maxValue) {
        // NaN fails the "> 0.0" comparison, so it is rejected along with
        // non-positive and infinite maxima.
        boolean usableMax = maxValue > 0.0 && !Double.isInfinite(maxValue);
        if (!usableMax) {
            throw new IllegalArgumentException(
                "Require finite max value > 0."
                + " Found maxValue=" + maxValue);
        }

        // Map each y to log((L - y) / y), which is linear in x.
        final int numPoints = ys.length;
        double[] transformedYs = new double[numPoints];
        int k = 0;
        while (k < numPoints) {
            double y = ys[k];
            transformedYs[k] = Math.log((maxValue - y) / y);
            ++k;
        }

        // Length checks on xs and ys are performed by linearRegression.
        return linearRegression(xs,transformedYs);
    }

    /**
     * Returns the value of Pearson's C<sub><sub>2</sub></sub>
     * goodness of fit statistic for independence over the specified
     * contingency matrix.  Asymptotically, this statistic has a
     * &chi;<sup>2</sup> distribution with
     * <code>(numRows-1)*(numCols-1)</code> degrees of freedom.  The
     * higher the value, the <i>less</i> likely the two outcomes are
     * independent.
*     * Pearson's C<sub><sub>2</sub></sub> statistic is defined as follows:     *     * <blockquote><code>     * C<sub><sub>2</sub></sub>     * = <big><big><big>&Sigma;</big></big></big><sub><sub>i</sub></sub>     *   <big><big><big>&Sigma;</big></big></big><sub><sub>j</sub></sub>     *   (matrix[i][j] - expected(i,j,matrix))<sup>2</sup> / expectedCount(i,j,matrix)     * </code></blockquote>     *     * where the expected count is the total count times the max     * likelihood estimates of row <code>i</code> probability times     * column <code>j</code> probability:     *     * <blockquote><code>     *  expectedCount(i,j,matrix)     *  <br>     *  = totalCount(matrix)     *  <br> &nbsp; &nbsp;     *       * rowCount(i,matrix)/totalCount(matrix)     *  <br> &nbsp; &nbsp;     *       * colCount(j,matrix)/totalCount(matrix)     * <br>     * = rowCount(i,matrix) * colCount(j,matrix) / totalCount(matrix)     * </code></blockquote>     *     * where     *     * <blockquote><code>     * rowCount(i,matrix)     * = <big><big>&Sigma;</big></big><sub><sub>0&lt;=j&lt;=numCols</sub></sub>     *   matrix[i][j]     *     * <br>     * colCount(j,matrix)     * = <big><big>&Sigma;</big></big><sub><sub>0&lt;=i&lt;=numRows</sub></sub>     *   matrix[i][j]     * <br>     * totalCount(matrix)     * = <big><big>&Sigma;</big></big><sub><sub>0&lt;=i&lt;=numRows</sub></sub>     * = <big><big>&Sigma;</big></big><sub><sub>0&lt;=j&lt;=numCols</sub></sub>     *   matrix[i][j]     * </code></blockquote>     *     * <P>The &chi;<sup>2</sup> test is a large sample test and is only     * valid if all of the expected counts are at least 5.  This restriction     * is often ignored for ranking purposes.     *     * @param contingencyMatrix The specified contingency matrix.     * @return Pearson's C<sub><sub>2</sub></sub> statistic for the independence     * testing over the contingency matrix.     
* @throws Illegal argument exception if the matrix is not rectangular     * or if all values are not non-negative finite numbers.     */    public static double chiSquaredIndependence(double[][] contingencyMatrix) {        int numRows = contingencyMatrix.length;        if (numRows < 2) {            String msg = "Require at least two rows."                + " Found numRows=" + numRows;            throw new IllegalArgumentException(msg);        }        int numCols = contingencyMatrix[0].length;        if (numCols < 2) {            String msg = "Require at least two cols."                + " Found numCols=" + numCols;            throw new IllegalArgumentException(msg);        }        double[] rowSums = new double[numRows];        Arrays.fill(rowSums,0.0);        double[] colSums = new double[numCols];        Arrays.fill(colSums,0.0);        double totalCount = 0.0;        for (int i = 0; i < numRows; ++i) {            if (contingencyMatrix[i].length != numCols) {                String msg = "Matrix must be rectangular."                    + "Row 0 length=" + numCols                    + "Row " + i + " length=" + contingencyMatrix[i].length;                throw new IllegalArgumentException(msg);            }            for (int j = 0; j < numCols; ++j) {                double val = contingencyMatrix[i][j];                if (Double.isInfinite(val) || val < 0.0                    || Double.isNaN(val)) {                    String msg = "Values must be finite non-negative."                        
+ " Found matrix[" + i + "][" + j + "]="                        + val;                    throw new IllegalArgumentException(msg);                }                rowSums[i] += val;                colSums[j] += val;                totalCount += val;            }        }        double result = 0.0;        for (int i = 0; i < numRows; ++i)            for (int j = 0; j < numCols; ++j)                result += csTerm(contingencyMatrix[i][j],                                 rowSums[i] * colSums[j] / totalCount);        return result;    }    /**     * Return an array of probabilities resulting from normalizing the     * specified probability ratios.  The resulting array of     * probabilities is the same length as the input ratio array and     * each probability is simply the input array's value divided by     * the sum of the ratios.     *     * <P><b>Warning:</b> This method is implemented by summing the     * probability ratios and then dividing each element by the sum.     * Because of the limited precision of <code>double</code>-based     * arithmetic, if the largest ratio is much larger than the next     * largest ratio, then the largest normalized probability will be     * one and all others will be zero.  Java double values follow the     * IEEE 754 arithmetic standard and thus use 52 bits for their     * mantissas.  Thus only ratios within     * <code>2<sup><sup>52</sup></sup>~10<sup><sup>16</sup></sup> of     * the maximum ratio will be non-zero.     *     * @param probabilityRatios Ratios of probabilities.     * @return Probabilities resulting from normalizing the ratios.     * @throws IllegalArgumentException If the input contains a value     * that is not a finite non-negative number, or if the input does     * not contain at least one non-zero entry.     
*/    public static double[] normalize(double[] probabilityRatios) {        for (int i = 0; i < probabilityRatios.length; ++i) {            if (probabilityRatios[i] < 0.0                || Double.isInfinite(probabilityRatios[i])                || Double.isNaN(probabilityRatios[i])) {                String msg = "Probabilities must be finite non-negative."                    + " Found probabilityRatios[" + i + "]="                    + probabilityRatios[i];                throw new IllegalArgumentException(msg);            }        }        double sum = com.aliasi.util.Math.sum(probabilityRatios);        if (sum <= 0.0) {            String msg = "Ratios must sum to number greater than zero."                + " Found sum=" + sum;            throw new IllegalArgumentException(msg);        }        double[] result = new double[probabilityRatios.length];        for (int i = 0; i < probabilityRatios.length; ++i)            result[i] = probabilityRatios[i]/sum;        return result;    }    /**     * Returns the value of the kappa statistic for the specified     * observed and expected probabilities.  The kappa statistic     * provides a kind of adjustment for the exptected (random)     * difficulty of a problem.  It is defined by:     *     * <blockquote><code>     * kappa(p,e) = (p - e) / (1 - e)     * </code></blockquote>     *     * <P>The most typical use for kappa is in evaluating     * classification problems of a machine versus a gold standard or     * between two human annotators to asses inter-annotator     * agreement.     *     * @param observedProb Observed probability.     * @param expectedProb Expected probability.     * @return The value of the kappa statistic for the specified     * probability 8 and expected probability.     */    public static double kappa(double observedProb, double expectedProb) {        return (observedProb - expectedProb)/(1 - expectedProb);    }    /**     * Returns the mean of the specified array of input values.  
The mean     * of an array is defined by:
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -