📄 Statistics.java
/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.stats;

import java.util.Arrays;
import java.util.Random;

/**
 * The <code>Statistics</code> class provides static utility methods
 * for statistical computations.
 *
 * @author  Bob Carpenter
 * @version 3.5
 * @since   LingPipe2.0
 */
public class Statistics {

    // don't allow instances
    private Statistics() {
        /* do nothing */
    }
    /**
     * Returns the Kullback-Leibler divergence of the second specified
     * Dirichlet distribution relative to the first.  The Dirichlet
     * distributions are specified by their distribution and
     * concentration, which are folded into a single count argument.
     *
     * <p>The KL divergence between two Dirichlet distributions with
     * parameters <code>xs</code> and <code>ys</code> of dimensionality
     * <code>n</code> is:
     *
     * <blockquote><pre>
     * D<sub><sub>KL</sub></sub>(Dirichlet(xs) || Dirichlet(ys))
     * = <big><big>∫</big></big> p(θ|xs) log ( p(θ|xs) / p(θ|ys) ) <i>d</i>θ
     * = log Γ(<big><big>Σ</big></big><sub><sub>i < n</sub></sub> xs[i])
     *   - log Γ(<big><big>Σ</big></big><sub><sub>i < n</sub></sub> ys[i])
     *   - <big><big>Σ</big></big><sub><sub>i < n</sub></sub> log Γ(xs[i])
     *   + <big><big>Σ</big></big><sub><sub>i < n</sub></sub> log Γ(ys[i])
     *   + <big><big>Σ</big></big><sub><sub>i < n</sub></sub> (xs[i] - ys[i]) * (Ψ(xs[i]) - Ψ(<big>Σ</big><sub><sub>k < n</sub></sub> xs[k]))</pre></blockquote>
     *
     * where <code>Γ</code> is the gamma function (see {@link
     * com.aliasi.util.Math#log2Gamma(double)}), and where
     * <code>Ψ</code> is the digamma function defined in {@link
     * com.aliasi.util.Math#digamma(double)}.
     *
     * <p>This method, in keeping with other information-theoretic
     * methods, returns the answer in bits (base 2) rather than nats
     * (base <code><i>e</i></code>).  The return is rescaled from the
     * natural-log based divergence:
     *
     * <blockquote><pre>
     * klDivergenceDirichlet(xs,ys)
     * = D<sub><sub>KL</sub></sub>(Dirichlet(xs) || Dirichlet(ys)) / log<sub><sub><i>e</i></sub></sub> 2</pre></blockquote>
     *
     * <p>Further descriptions of the computation of KL divergence between
     * Dirichlets may be found in:
     *
     * <ul>
     * <li>W.D. Penny. 2001. <a
     * href="http://www.fil.ion.ucl.ac.uk/~wpenny/publications/densities.ps">Kullback-Liebler
     * Divergences of Normal, Gamma, Dirichlet and Wishart
     * Densities.</a> Technical report, Wellcome Department of
     * Cognitive Neurology, 2001.</li>
     * <li>Blei, Franks, Jordan and Mian. 2006. <a href="">Statistical
     * modeling of biomedical corpora</a>. <i>BMC Bioinformatics</i>
     * <b>7</b>:250. (Note: This paper has sign errors; see Penny for
     * the correct version.)</li>
     * </ul>
     *
     * @param xs Dirichlet parameter for the first distribution.
     * @param ys Dirichlet parameter for the second distribution.
     * @return The KL divergence of the second Dirichlet distribution
     * relative to the first.
     */
    public static double klDivergenceDirichlet(double[] xs, double[] ys) {
        verifyDivergenceDirichletArgs(xs,ys);
        // verifyDivergenceArgs(xs,ys);
        double sumXs = sum(xs);
        double sumYs = sum(ys);
        double divergence = (logGamma(sumXs) - logGamma(sumYs));
        double digammaSumXs = com.aliasi.util.Math.digamma(sumXs);
        for (int i = 0; i < xs.length; ++i) {
            divergence += (logGamma(ys[i]) - logGamma(xs[i]))
                + (xs[i] - ys[i]) * (com.aliasi.util.Math.digamma(xs[i]) - digammaSumXs);
        }
        return divergence;
    }

    static void verifyDivergenceDirichletArgs(double[] xs, double[] ys) {
        if (xs.length != ys.length) {
            String msg = "Parameter arrays must be the same length."
                + " Found xs.length=" + xs.length
                + " ys.length=" + ys.length;
            throw new IllegalArgumentException(msg);
        }
        for (int i = 0; i < xs.length; ++i) {
            if (xs[i] <= 0.0 || Double.isInfinite(xs[i]) || Double.isNaN(xs[i])) {
                String msg = "All parameters must be positive and finite."
                    + " Found xs[" + i + "]=" + xs[i];
                throw new IllegalArgumentException(msg);
            }
        }
        for (int i = 0; i < ys.length; ++i) {
            if (ys[i] <= 0.0 || Double.isInfinite(ys[i]) || Double.isNaN(ys[i])) {
                String msg = "All parameters must be positive and finite."
                    + " Found ys[" + i + "]=" + ys[i];
                throw new IllegalArgumentException(msg);
            }
        }
    }

    /**
     * Returns the symmetric version of the Kullback-Leibler divergence
     * between the Dirichlet distributions determined by the specified
     * parameters.
     *
     * <p>The symmetrized KL divergence for Dirichlets is just the
     * average of both relative divergences:
     *
     * <blockquote><pre>
     * D<sub><sub>SKL</sub></sub>(Dirichlet(xs), Dirichlet(ys))
     * = (D<sub><sub>KL</sub></sub>(Dirichlet(xs) || Dirichlet(ys)) + D<sub><sub>KL</sub></sub>(Dirichlet(ys) || Dirichlet(xs))) / 2
     * </pre></blockquote>
     *
     * <p>See the method documentation for {@link
     * #klDivergenceDirichlet(double[],double[])} for a definition of
     * the relative divergence for Dirichlet distributions.
     *
     * @param xs Dirichlet parameter for the first distribution.
     * @param ys Dirichlet parameter for the second distribution.
     * @return The symmetrized KL divergence of the distributions.
     */
    public static double symmetrizedKlDivergenceDirichlet(double[] xs, double[] ys) {
        return (klDivergenceDirichlet(xs,ys) + klDivergenceDirichlet(ys,xs)) / 2.0;
    }

    static double logGamma(double x) {
        return com.aliasi.util.Math.log2Gamma(x)
            / com.aliasi.util.Math.log2(java.lang.Math.E);
    }

    static double sum(double[] xs) {
        double sum = 0.0;
        for (int i = 0; i < xs.length; ++i)
            sum += xs[i];
        return sum;
    }
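    /*
     * Usage sketch for the Dirichlet divergence methods above.  The
     * parameter vectors are made-up illustrative values, not taken
     * from the library or its documentation.
     *
     *   double[] xs = { 1.0, 2.0, 4.0 };   // Dirichlet(1,2,4)
     *   double[] ys = { 2.0, 2.0, 2.0 };   // Dirichlet(2,2,2)
     *   double klXsYs = Statistics.klDivergenceDirichlet(xs, ys);
     *   double klYsXs = Statistics.klDivergenceDirichlet(ys, xs);
     *   // the two relative divergences generally differ; the
     *   // symmetrized value is simply their average:
     *   double skl = Statistics.symmetrizedKlDivergenceDirichlet(xs, ys);
     *   // skl == (klXsYs + klYsXs) / 2.0
     */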
    /**
     * Returns the Kullback-Leibler divergence of the second
     * specified multinomial relative to the first.
     *
     * <p>The KL divergence of a multinomial <code>q</code> relative
     * to a multinomial <code>p</code>, both with <code>n</code>
     * outcomes, is:
     *
     * <blockquote><pre>
     * D<sub><sub>KL</sub></sub>(p||q)
     * = <big><big><big>Σ</big></big></big><sub><sub>i < n</sub></sub> p(i) log<sub>2</sub> (p(i) / q(i))</pre></blockquote>
     *
     * The value is guaranteed to be non-negative, and will be 0.0
     * only if the two distributions are identical.  If any outcome
     * has zero probability in <code>q</code> and non-zero probability
     * in <code>p</code>, the result is infinite.
     *
     * <p>KL divergence is not symmetric.  That is, there are
     * <code>p</code> and <code>q</code> such that
     * <code>D<sub><sub>KL</sub></sub>(p||q) !=
     * D<sub><sub>KL</sub></sub>(q||p)</code>.  See {@link
     * #symmetrizedKlDivergence(double[],double[])} and {@link
     * #jsDivergence(double[],double[])} for symmetric variants.
     *
     * <p>KL divergence may also be expressed in terms of cross
     * entropy.  If <code>H(p,q)</code> is the cross entropy of
     * <code>q</code> relative to <code>p</code>, and
     * <code>H(p)</code> is the entropy of <code>p</code>, then:
     *
     * <blockquote><pre>
     * D<sub><sub>KL</sub></sub>(p||q) = H(p,q) - H(p)</pre></blockquote>
     *
     * @param p First multinomial distribution.
     * @param q Second multinomial distribution.
     * @return The KL divergence of the second multinomial relative
     * to the first.
     * @throws IllegalArgumentException If the distributions are not
     * the same length or have entries less than zero or greater than
     * 1.
     */
    public static double klDivergence(double[] p, double[] q) {
        verifyDivergenceArgs(p,q);
        double divergence = 0.0;
        int len = p.length;
        for (int i = 0; i < len; ++i) {
            if (p[i] > 0.0 && p[i] != q[i])
                divergence += p[i] * com.aliasi.util.Math.log2(p[i] / q[i]);
        }
        return divergence;
    }

    static void verifyDivergenceArgs(double[] p, double[] q) {
        if (p.length != q.length) {
            String msg = "Input distributions must have same length."
                + " Found p.length=" + p.length
                + " q.length=" + q.length;
            throw new IllegalArgumentException(msg);
        }
        int len = p.length;
        for (int i = 0; i < len; ++i) {
            if (p[i] < 0.0 || p[i] > 1.0 || Double.isNaN(p[i]) || Double.isInfinite(p[i])) {
                String msg = "p[i] must be between 0.0 and 1.0 inclusive."
                    + " Found p[" + i + "]=" + p[i];
                throw new IllegalArgumentException(msg);
            }
            if (q[i] < 0.0 || q[i] > 1.0 || Double.isNaN(q[i]) || Double.isInfinite(q[i])) {
                String msg = "q[i] must be between 0.0 and 1.0 inclusive."
                    + " Found q[" + i + "]=" + q[i];
                throw new IllegalArgumentException(msg);
            }
        }
    }

    /**
     * Returns the symmetrized KL divergence between the specified
     * distributions.  The symmetrization is carried out by averaging
     * their relative divergences:
     *
     * <blockquote><pre>
     * D<sub><sub>SKL</sub></sub>(p,q)
     * = ( D<sub><sub>KL</sub></sub>(p,q) + D<sub><sub>KL</sub></sub>(q,p) ) / 2
     * </pre></blockquote>
     *
     * @param p First multinomial.
     * @param q Second multinomial.
     * @return The symmetrized KL divergence between the multinomials.
     * @throws IllegalArgumentException If the distributions are not
     * the same length or have entries less than zero or greater than
     * 1.
     */
    public static double symmetrizedKlDivergence(double[] p, double[] q) {
        verifyDivergenceArgs(p,q);
        return (klDivergence(p,q) + klDivergence(q,p)) / 2.0;
    }

    /**
     * Returns the Jensen-Shannon divergence between the specified
     * multinomial distributions.  The JS divergence is defined by
     *
     * <blockquote><pre>
     * D<sub><sub>JS</sub></sub>(p,q)
     * = ( D<sub><sub>KL</sub></sub>(p,m) + D<sub><sub>KL</sub></sub>(q,m) ) / 2</pre></blockquote>
     *
     * where <code>m</code> is defined as the balanced linear
     * interpolation (that is, the average) of <code>p</code> and
     * <code>q</code>:
     *
     * <blockquote><pre>
     * m[i] = (p[i] + q[i]) / 2</pre></blockquote>
     *
     * The JS divergence is non-negative, equal to zero only if
     * <code>p</code> and <code>q</code> are the same distribution,
     * and symmetric.
     *
     * @param p First multinomial.
     * @param q Second multinomial.
     * @return The JS divergence between the multinomials.
     * @throws IllegalArgumentException If the distributions are not
     * the same length or have entries less than zero or greater than
     * 1.
     */
    public static double jsDivergence(double[] p, double[] q) {
        verifyDivergenceArgs(p,q);
        double[] m = new double[p.length];
        for (int i = 0; i < p.length; ++i)
            m[i] = (p[i] + q[i]) / 2.0;
        return (klDivergence(p,m) + klDivergence(q,m)) / 2.0;
    }
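    /*
     * Usage sketch for the multinomial divergence methods above, with
     * made-up three-outcome distributions; the numbers are
     * illustrative only.
     *
     *   double[] p = { 0.5, 0.3, 0.2 };
     *   double[] q = { 0.4, 0.4, 0.2 };
     *   double kl  = Statistics.klDivergence(p, q);          // >= 0.0, in bits, asymmetric
     *   double skl = Statistics.symmetrizedKlDivergence(p, q);
     *   double js  = Statistics.jsDivergence(p, q);          // symmetric and always finite
     */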
    /**
     * Returns a permutation of the integers between 0 and
     * the specified length minus one.
     *
     * @param length Size of permutation to return.
     * @return Permutation of the specified length.
     */
    public static int[] permutation(int length) {
        return permutation(length, new Random());
    }

    /**
     * Returns a permutation of the integers between 0 and
     * the specified length minus one using the specified
     * randomizer.
     *
     * @param length Size of permutation to return.
     * @param random Randomizer to use for permutation.
     * @return Permutation of the specified length.
     */
    public static int[] permutation(int length, Random random) {
        int[] xs = new int[length];
        for (int i = 0; i < xs.length; ++i)
            xs[i] = i;
        // Fisher-Yates shuffle; drawing from [0,i] rather than [0,i)
        // lets position i keep its value, so every permutation is reachable.
        for (int i = xs.length; --i > 0; ) {
            int pos = random.nextInt(i + 1);
            int temp = xs[pos];
            xs[pos] = xs[i];
            xs[i] = temp;
        }
        return xs;
    }

    /**
     * Returns Pearson's C<sub><sub>2</sub></sub> goodness-of-fit
     * statistic for independence over the specified binary
     * contingency matrix.  Asymptotically, this statistic has a
     * χ<sup>2</sup> distribution with one degree of freedom.  The
     * higher the value, the <i>less</i> likely the two outcomes are
     * independent.  Note that this method is just a special case of
     * the general chi-squared independence test:
     *
     * <pre>
     * chiSquaredIndependence(both,oneOnly,twoOnly,neither)
     * = chiSquaredIndependence(new double[][] { {both,    oneOnly},
     *                                           {twoOnly, neither} });
     * </pre>
     *
     * The specified values make up the following contingency matrix
     * for boolean outcomes labeled <code>One</code> and
     * <code>Two</code>:
     *
     * <blockquote>
     * <table border='1' cellpadding='5'>
     * <tr><td> </td><td>+Two</td><td>-Two</td></tr>
     * <tr><td>+One</td><td><code>both</code></td><td><code>oneOnly</code></td></tr>
     * <tr><td>-One</td><td><code>twoOnly</code></td><td><code>neither</code></td></tr>
     * </table>
     * </blockquote>
     *
     * <p>The value <code>both</code> is the count of events where both
     * outcomes occurred, <code>oneOnly</code> for where only the
     * first outcome occurred, <code>twoOnly</code> for where only
     * the second outcome occurred, and <code>neither</code> for
     * when neither occurred.  Let <code>totalCount</code> be
     * the sum of all of the cells in the matrix.
     *
     * <p>From the contingency matrix, marginal probabilities for
     * the two outcomes may be computed:
     *
     * <blockquote><code>
     * P(One) = (both + oneOnly) / totalCount
     * <br>
     * P(Two) = (both + twoOnly) / totalCount
     * </code></blockquote>
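    /*
     * Usage sketch for permutation() and for the 2x2 contingency
     * counts described in the javadoc above.  The counts are invented
     * for illustration, and chiSquaredIndependence() is not shown in
     * this excerpt, so the call below assumes the four-argument
     * signature named in its javadoc.
     *
     *   int[] order = Statistics.permutation(5, new Random(42L));
     *   // order contains each of 0,1,2,3,4 exactly once, in shuffled order
     *
     *   double both = 20, oneOnly = 5, twoOnly = 10, neither = 65;
     *   double totalCount = both + oneOnly + twoOnly + neither;   // 100
     *   double pOne = (both + oneOnly) / totalCount;              // 0.25
     *   double pTwo = (both + twoOnly) / totalCount;              // 0.30
     *   double x2 = Statistics.chiSquaredIndependence(both, oneOnly, twoOnly, neither);
     */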