📄 statistics.java
字号:
*
     * If these probabilities are independent, the expected values of
     * the matrix cells are:
     *
     * <blockquote><code>
     * E(both) = totalCount * P(One) * P(Two)
     * <br>
     * E(oneOnly) = totalCount * P(One) * (1-P(Two))
     * <br>
     * E(twoOnly) = totalCount * (1-P(One)) * P(Two)
     * <br>
     * E(neither) = totalCount * (1-P(One)) * (1-P(Two))
     * </code></blockquote>
     *
     * These are used to derive the independence test statistic, which
     * is the sum of the squared differences between observed and
     * expected values under the independence assumption, each
     * normalized by the expected value:
     *
     * <blockquote><code>
     * C<sub><sub>2</sub></sub>
     * = (both - E(both))<sup><sup>2</sup></sup> / E(both)
     * <br>
     * + (oneOnly - E(oneOnly))<sup><sup>2</sup></sup> / E(oneOnly)
     * <br>
     * + (twoOnly - E(twoOnly))<sup><sup>2</sup></sup> / E(twoOnly)
     * <br>
     * + (neither - E(neither))<sup><sup>2</sup></sup> / E(neither)
     * </code></blockquote>
     *
     * Unlike the higher dimensional case, this statistic applies as a
     * hypothesis test only in the case when all expected values are
     * at least 10.
     *
     * @param both Count of samples of both outcomes.
     * @param oneOnly Count of samples with the first and not the
     * second outcome.
     * @param twoOnly Count of samples with the second and not the
     * first outcome.
     * @param neither Count of samples with neither outcome.
     * @throws IllegalArgumentException If any of the arguments are not
     * non-negative finite numbers.
     * @return Pearson's C<sub><sub>2</sub></sub> goodness-of-fit
     * statistic for independence over the specified sample counts.
*/
    public static double chiSquaredIndependence(double both,
                                                double oneOnly,
                                                double twoOnly,
                                                double neither) {
        // Argument checks are delegated to assertNonNegative, which is
        // defined elsewhere in this class.
        assertNonNegative("both",both);
        assertNonNegative("oneOnly",oneOnly);
        assertNonNegative("twoOnly",twoOnly);
        assertNonNegative("neither",neither);

        // Marginal estimates P(One) and P(Two) from the 2x2 counts.
        double total = both + oneOnly + twoOnly + neither;
        double probOne = (both + oneOnly) / total;
        double probTwo = (both + twoOnly) / total;
        double probNotOne = 1.0 - probOne;
        double probNotTwo = 1.0 - probTwo;

        // Sum one term per cell: observed count against the count expected
        // under independence (total * marginal * marginal).
        double statistic = csTerm(both, total * probOne * probTwo);
        statistic += csTerm(oneOnly, total * probOne * probNotTwo);
        statistic += csTerm(twoOnly, total * probNotOne * probTwo);
        statistic += csTerm(neither, total * probNotOne * probNotTwo);
        return statistic;
    }

    /**
     * Returns a two-element array of linear regression coefficients
     * for the specified x and y values.  The coefficients returned,
     * <code>{ β0, β1 }</code>, define a linear function:
     *
     * <blockquote><code>
     * f(x) = β1 * x + β0
     * </code></blockquote>
     *
     * The coefficients returned produce the linear function <code>f(x)</code>
     * with the smallest squared error:
     *
     * <blockquote><code>
     * sqErr(f,xs,ys) =
     * <big><big>Σ</big></big><sub><sub>i</sub></sub>
     * (f(xs[i]) - ys[i])<sup>2</sup>
     * </code></blockquote>
     *
     * where all sums are for <code>0 &lt;= i &lt; xs.length</code>.
     *
     * The function requires only a single pass through the two
     * arrays, with <code>β0</code> and <code>β1</code>
     * given by:
     *
     * <blockquote><pre>
     * β1 = n * <big><big>Σ</big></big><sub>i</sub> x[i] * y[i] - (<big><big>Σ</big></big><sub>i</sub> x[i])(<big><big>Σ</big></big><sub>i</sub> y[i])
     *      ------------------------------------------
     *      n * <big><big>Σ</big></big><sub>i</sub> x[i]*x[i] - (<big><big>Σ</big></big><sub>i</sub> x[i])<sup>2</sup>
     * </pre></blockquote>
     *
     * <blockquote><pre>
     * β0 = <big><big>Σ</big></big><sub>i</sub> y[i] - β1 <big><big>Σ</big></big><sub>i</sub> x[i]
     *      ---------------------
     *      n
     * </pre></blockquote>
     *
     * where <code>n = xs.length = ys.length</code>.
     *
     * @param xs Array of x values.
     * @param ys Parallel array of y values.
* @return Pair of regression coefficients. * @throws IllegalArgumentException If the arrays are of length * less than 2, or if the arrays are not of the same length. */ public static double[] linearRegression(double[] xs, double[] ys) { if (xs.length != ys.length) { String msg = "Require parallel arrays of x and y values." + " Found xs.length=" + xs.length + " ys.length=" + ys.length; throw new IllegalArgumentException(msg); } if (xs.length < 2) { String msg = "Require arrays of length >= 2." + " Found xs.length=" + xs.length; throw new IllegalArgumentException(msg); } double n = xs.length; double xSum = 0.0; double ySum = 0.0; double xySum = 0.0; double xxSum = 0.0; for (int i = 0; i < xs.length; ++i) { double x = xs[i]; double y = ys[i]; xSum += x; ySum += y; xxSum += x * x; xySum += x * y; } double denominator = n * xxSum - xSum * xSum; if (denominator == 0.0) { String msg = "Ill formed input. Denominator for beta1 is zero." + " Most likely cause is fewer than 2 distinct inputs."; throw new IllegalArgumentException(msg); } double beta1 = (n * xySum - xSum * ySum) / denominator; double beta0 = (ySum - beta1 * xSum) / n; return new double[] { beta0, beta1 }; } /** * Returns a two-element array of logistic regression coefficients * for the specified x and y values. The coefficients returned, * <code>{ β0, β1 }</code>, define the logistic function: * * <blockquote><pre> * L * f(x) = --------------- * 1 + <i>e</i> <sup><sup>β1 * x + β0</sup></sup> * </pre></blockquote> * * with the minimum squared error. See {@link * #linearRegression(double[],double[])} for a definition of * squared error. This function takes real values in the the open * interval <code>(0, L)</code>. * * <p>Logistic regression coefficients are computed using * linear regression, after transforming the y values. This * is possible because of the following linear relation: * * <blockquote><pre> * log ((L - y) / y) = β1 * x + β0 * </pre></blockquote> * * @param xs Array of x values. 
* @param ys Array of y values. * @param maxValue Maximum value of function. * @return Binary array of logistic regression coordinates. * @throws IllegalArgumentException If the maximum value is not * positive and finite, if the arrays are of length less than 2, * or if the arrays are not of the same length. */ public static double[] logisticRegression(double[] xs, double[] ys, double maxValue) { if (maxValue <= 0.0 || Double.isInfinite(maxValue) || Double.isNaN(maxValue)) { String msg = "Require finite max value > 0." + " Found maxValue=" + maxValue; throw new IllegalArgumentException(msg); } double[] logisticYs = new double[ys.length]; for (int i = 0; i < ys.length; ++i) logisticYs[i] = Math.log((maxValue - ys[i]) / ys[i]); return linearRegression(xs,logisticYs); } /** * Returns the value of Pearson's C<sub><sub>2</sub></sub> * goodness of fit statistic for independence over the specified * contingency matrix. Asymptotically, this statistic has a * χ<sup>2</sup> distribution with * <code>(numRows-1)*(numCols-1)</code> degrees of freedom. The * higher the value, the <i>less</i> likely the two outcomes are * independent. 
* * Pearson's C<sub><sub>2</sub></sub> statistic is defined as follows: * * <blockquote><code> * C<sub><sub>2</sub></sub> * = <big><big><big>Σ</big></big></big><sub><sub>i</sub></sub> * <big><big><big>Σ</big></big></big><sub><sub>j</sub></sub> * (matrix[i][j] - expected(i,j,matrix))<sup>2</sup> / expectedCount(i,j,matrix) * </code></blockquote> * * where the expected count is the total count times the max * likelihood estimates of row <code>i</code> probability times * column <code>j</code> probability: * * <blockquote><code> * expectedCount(i,j,matrix) * <br> * = totalCount(matrix) * <br> * * rowCount(i,matrix)/totalCount(matrix) * <br> * * colCount(j,matrix)/totalCount(matrix) * <br> * = rowCount(i,matrix) * colCount(j,matrix) / totalCount(matrix) * </code></blockquote> * * where * * <blockquote><code> * rowCount(i,matrix) * = <big><big>Σ</big></big><sub><sub>0<=j<=numCols</sub></sub> * matrix[i][j] * * <br> * colCount(j,matrix) * = <big><big>Σ</big></big><sub><sub>0<=i<=numRows</sub></sub> * matrix[i][j] * <br> * totalCount(matrix) * = <big><big>Σ</big></big><sub><sub>0<=i<=numRows</sub></sub> * = <big><big>Σ</big></big><sub><sub>0<=j<=numCols</sub></sub> * matrix[i][j] * </code></blockquote> * * <P>The χ<sup>2</sup> test is a large sample test and is only * valid if all of the expected counts are at least 5. This restriction * is often ignored for ranking purposes. * * @param contingencyMatrix The specified contingency matrix. * @return Pearson's C<sub><sub>2</sub></sub> statistic for the independence * testing over the contingency matrix. * @throws Illegal argument exception if the matrix is not rectangular * or if all values are not non-negative finite numbers. */ public static double chiSquaredIndependence(double[][] contingencyMatrix) { int numRows = contingencyMatrix.length; if (numRows < 2) { String msg = "Require at least two rows." 
+ " Found numRows=" + numRows; throw new IllegalArgumentException(msg); } int numCols = contingencyMatrix[0].length; if (numCols < 2) { String msg = "Require at least two cols." + " Found numCols=" + numCols; throw new IllegalArgumentException(msg); } double[] rowSums = new double[numRows]; Arrays.fill(rowSums,0.0); double[] colSums = new double[numCols]; Arrays.fill(colSums,0.0); double totalCount = 0.0; for (int i = 0; i < numRows; ++i) { if (contingencyMatrix[i].length != numCols) { String msg = "Matrix must be rectangular." + "Row 0 length=" + numCols + "Row " + i + " length=" + contingencyMatrix[i].length; throw new IllegalArgumentException(msg); } for (int j = 0; j < numCols; ++j) { double val = contingencyMatrix[i][j]; if (Double.isInfinite(val) || val < 0.0 || Double.isNaN(val)) { String msg = "Values must be finite non-negative." + " Found matrix[" + i + "][" + j + "]=" + val; throw new IllegalArgumentException(msg); } rowSums[i] += val; colSums[j] += val; totalCount += val; } } double result = 0.0; for (int i = 0; i < numRows; ++i) for (int j = 0; j < numCols; ++j) result += csTerm(contingencyMatrix[i][j], rowSums[i] * colSums[j] / totalCount); return result; } /** * Return an array of probabilities resulting from normalizing the * specified probability ratios. The resulting array of * probabilities is the same length as the input ratio array and * each probability is simply the input array's value divided by * the sum of the ratios. * * <P><b>Warning:</b> This method is implemented by summing the * probability ratios and then dividing each element by the sum. * Because of the limited precision of <code>double</code>-based * arithmetic, if the largest ratio is much larger than the next * largest ratio, then the largest normalized probability will be * one and all others will be zero. Java double values follow the * IEEE 754 arithmetic standard and thus use 52 bits for their * mantissas. 
Thus only ratios within * <code>2<sup><sup>52</sup></sup>~10<sup><sup>16</sup></sup> of * the maximum ratio will be non-zero. * * @param probabilityRatios Ratios of probabilities. * @return Probabilities resulting from normalizing the ratios. * @throws IllegalArgumentException If the input contains a value * that is not a finite non-negative number, or if the input does * not contain at least one non-zero entry. */ public static double[] normalize(double[] probabilityRatios) { for (int i = 0; i < probabilityRatios.length; ++i) { if (probabilityRatios[i] < 0.0 || Double.isInfinite(probabilityRatios[i]) || Double.isNaN(probabilityRatios[i])) { String msg = "Probabilities must be finite non-negative." + " Found probabilityRatios[" + i + "]=" + probabilityRatios[i]; throw new IllegalArgumentException(msg); } } double sum = com.aliasi.util.Math.sum(probabilityRatios); if (sum <= 0.0) { String msg = "Ratios must sum to number greater than zero." + " Found sum=" + sum; throw new IllegalArgumentException(msg); } double[] result = new double[probabilityRatios.length]; for (int i = 0; i < probabilityRatios.length; ++i) result[i] = probabilityRatios[i]/sum; return result; } /** * Returns the value of the kappa statistic for the specified * observed and expected probabilities. The kappa statistic * provides a kind of adjustment for the exptected (random) * difficulty of a problem. It is defined by: * * <blockquote><code> * kappa(p,e) = (p - e) / (1 - e) * </code></blockquote> * * <P>The most typical use for kappa is in evaluating * classification problems of a machine versus a gold standard or * between two human annotators to asses inter-annotator * agreement. * * @param observedProb Observed probability. * @param expectedProb Expected probability. * @return The value of the kappa statistic for the specified * probability 8 and expected probability. 
*/
    public static double kappa(double observedProb, double expectedProb) {
        // kappa(p,e) = (p - e) / (1 - e), split into its two parts.
        final double numerator = observedProb - expectedProb;
        final double denominator = 1 - expectedProb;
        return numerator / denominator;
    }

    /**
     * Returns the mean of the specified array of input values.  The mean
     * of an array is defined by:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -