llr.c
来自「EM算法的改进」· C语言 代码 · 共 747 行 · 第 1/2 页
C
747 行
/* * $Id: llr.c 1339 2006-09-21 19:46:28Z tbailey $ * * $Log$ * Revision 1.2 2006/03/08 20:50:11 nadya * merge chamges from v3_5_2 branch * * Revision 1.1.1.1.4.2 2006/01/26 09:16:27 tbailey * Rename local function getline() to getline2() to avoid conflict with * function defined in stdio.h. * * Revision 1.1.1.1.4.1 2006/01/24 20:44:08 nadya * update copyright * * Revision 1.1.1.1 2005/07/29 00:22:18 nadya * Importing from meme-3.0.14, and adding configure/make * *//************************************************************************* Copyright ** (1999-2006) The Regents of the University of California. ** All Rights Reserved. ** Author: Timothy L. Bailey* ** Permission to use, copy, modify, and distribute any part of ** this software for educational, research and non-profit purposes,** without fee, and without a written agreement is hereby granted, ** provided that the above copyright notice, this paragraph and ** the following three paragraphs appear in all copies. ** ** Those desiring to incorporate this software into commercial ** products or use for commercial purposes should contact the ** Technology Transfer Office, University of California, San Diego,** 9500 Gilman Drive, La Jolla, California, 92093-0910, ** Ph: (619) 534 5815. ** ** IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ** ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR ** CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF ** THE USE OF THIS SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA ** HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ** ** THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE ** UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE ** MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. ** THE UNIVERSITY OF CALIFORNIA MAKES NO REPRESENTATIONS AND ** EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, ** INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ** MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT ** THE USE OF THE MATERIAL WILL NOT INFRINGE ANY PATENT, ** TRADEMARK OR OTHER RIGHTS. *************************************************************************//***********************************************************************//* Information Content Statistics Routines*//***********************************************************************/#ifdef SO#define DEFINE_GLOBALS#endif#include <llr.h>#include <macros.h>#include <general.h>#include <io.h>#include <stdio.h>#include <clock.h>#include <logs.h>#include <message.h>/* probability distribution */ typedef struct distr { int w; /* maximum width of alignment */ double alpha; /* scale factor for distribution */ int *range; /* range of distributions w = [1..w] */ int *offset; /* offset of distributions w = [1..w] */ double **d; /* array of (log) distributions w = [1..w] */ double **cdf; /* array of llr (log) 1 - CDFs w = [1..w] */ double mean; /* mean of llr for w = 1 */} DISTR;static int ndistrs = -1; /* largest N for which distr known */static DISTR *distrs = NULL; /* array of distributions for N = [1..ndistrs]*//************************************************************************//* init_llr_pv_tables Initialize the single-column log likelihood ratio p-value tables. Note: Palindromes have effectively double the apparent sites but half the width.*//************************************************************************/extern void init_llr_pv_tables( int min, /* minimum number of sites */ int max, /* maximum number of sites */ int alength, /* alphabet length */ double *back, /* background frequencies */ BOOLEAN pal /* sites are palindromes */){ int nsites; /* number of sites */ /* set effective number of sites to double if pal */ if (pal) { min *= 2; max *= 2; } if (!NO_STATUS) fprintf(stderr, "Initializing the motif probability tables for %d to %d sites...\n", min, max); /* make sure the distr table gets initialized on all nodes */ (void) get_llr_pv(0, 1, 1, LLR_RANGE, 1.0, alength, back); for (nsites=min; nsites<=max; nsites += pal ? 2 : 1) { /* nsites */ /* allocate space for table */ (void) get_llr_pv(0, nsites, 0, LLR_RANGE, 1.0, alength, back); if (!load_balance_llr(nsites, pal)) { continue; /* for parallel */ } /* create table */ if (!NO_STATUS) { fprintf(stderr, "nsites = %d\r", nsites); } (void) get_llr_pv(0, nsites, 1, LLR_RANGE, 1.0, alength, back); } /* nsites */ broadcast_llr(min, max, pal); /* for parallel; collect the tables */#ifdef DEBUG /* print results */ for (n=min; n<=max; n++) { int I; int w = 1; printf("# N I llr 1-cdf\n"); for (I=0; I<=distrs[n].range[w]; I++) { /* LLR */ double m2, e2; if (distrs[n].cdf[w][I] == LOGZERO) { m2 = e2 = 0; } else { exp10_logx(distrs[n].cdf[w][I]/log(10.0), m2, e2, 1); } printf("%3d %3d %5.1f %3.1fe%+05.0f\n", n, I, (distrs[n].offset[w]+I)/distrs[n].alpha, m2, e2); } /* LLR */ }#endif if (!NO_STATUS)fprintf(stderr, "\nDone initializing\n");} /* init_llr_pv_tables *//******************************************************************************//* llr_distr Compute the probability distribution of the log-likelihood ratio (llr) statistic for a discrete distribution. Because of roundoff, LLR may be as small as -1 (instead of 0). Returns an array p where p[i] = log(Pr(LLR=i))*//******************************************************************************/double *llr_distr( int A, /* dimension of discrete distribution */ double *dd, /* discrete distribution */ int N, /* number of samples */ int desired_range, /* desired range for scaled LLR */ double frac, /* fraction of scores to use */ double *alpha, /* scale factor for scaled LLR */ int *offset, /* prob[0] = prob(offset) */ int *range /* range for scaled LLR */){ int i; /* index over alphabet */ int n; /* index over samples */ int k; /* other index */ int I; /* LLR */ double dd_sum; /* sum of dd */ int **IP; /* I'_i[n] */ int *minI=NULL; /* minimum intermediate value of I */ int *maxI=NULL; /* maximum intermediate value of I */ int Irange; /* maxI-minI+1 */ double logNfact; /* log N! */ double **logP; /* log P_i[n] */ double **logSP; /* log script_P[# samples][LLR] */ double *prob=NULL; /* final probability distribution */ double min, max, min_dd; /* create space for IP, P, minI and maxI */ create_2array(IP, int, A, N+1); create_2array(logP, double, A, N+1); Resize(minI, N+1, int); Resize(maxI, N+1, int); /* make sure distribution sums to 1.0 and has no 0's */ for (i=dd_sum=0; i<A; i++) dd_sum += dd[i] + EPSILON; for (i=0; i<A; i++) dd[i] = (dd[i]+EPSILON)/dd_sum; /* compute N! */ logNfact = 0; for (i=2; i<=N; i++) logNfact += log(i); /* get estimates of minimum and miximum values of llr */ for (i=0, min_dd=1; i<A; i++) min_dd = MIN(min_dd, dd[i]); max = NINT(-N * log(min_dd)); for (i=min=0; i<A; i++) min += dd[i]*N*(log(dd[i]) - log(dd[i])); min = NINT(min); /*printf("min = %f max = %f\n", min, max);*/ /* set alpha to achieve the desired range */ *alpha = desired_range/((max-min)); /* *alpha = NINT(((int)desired_range)/((max-min))); if (*alpha < 1) *alpha = 1;*/ /*fprintf(stderr, "range %d max %f min %f alpha = %f\n",desired_range, max, min, *alpha);*/ /* compute I', P, minI and maxI */ for (n=0; n<=N; n++) minI[n] = maxI[n] = 0; for (i=0; i<A; i++) { /* index over alphabet */ double logdd = LOG(dd[i]); /* log(dd[i]) */ IP[i][0] = 0; /* I'_i(0) */ logP[i][0] = 0; /* log P_i(0) */ for (n=1; n<=N; n++) { /* index over samples */ IP[i][n] = NINT(*alpha*n*log(n/(N*dd[i]))); /* I'_i(n) */ logP[i][n] = logP[i][n-1] + logdd - log(n); /* log P_i(n) */ for (k=1; k<=n; k++) { /* index over samples of new */ minI[n] = MIN(minI[n], minI[n-k] + IP[i][k]); maxI[n] = MAX(maxI[n], maxI[n-k] + IP[i][k]); } } } /* get overall minI and maxI */ for (n=1; n<=N; n++) { /*printf("minI[%d] %d maxI[%d] %d\n", n, minI[n], n, maxI[n]);*/ minI[0] = MIN(minI[0], minI[n]); /* min for intermediates */ maxI[0] = MAX(maxI[0], maxI[n]); /* max for intermediates */ minI[n] = LOGZEROI; maxI[n] = 0; } Irange = maxI[0] - minI[0] + 2; *offset = minI[0] - 1; /* I offset: I=-1 is array 0 */ /*printf("minI %d maxI %d Irange %d\n", minI[0], maxI[0], Irange);*/ minI[0] = LOGZEROI; maxI[0] = 0; /* create script_P arrays with enough space for intermediate calculations */ create_2array(logSP, double, N+1, Irange+1); /* clear intermediate probability array */ for (n=0; n<=N; n++) for(I=0; I<Irange; I++) logSP[n][I] = LOGZERO; /* init probability array for first letter in alphabet */ for (n=0; n<=N; n++) { I = IP[0][n] - *offset; /* offset I */ logSP[n][I] = logNfact + logP[0][n]; /* init */ minI[n] = maxI[n] = I; } /* compute probabilities recursively */ for (i=1; i<A; i++) { /* index over (rest of) alphabet */ for (n=N; n>=0; n--) { /* index over samples */ for (k=1; k<=n; k++) { /* index over samples of new letter */ int min = minI[n-k]; int max = MAX(min, maxI[n-k] - (1-frac)*(maxI[n-k]-minI[n-k]+1)); /*printf("min %d maxI %d max %d\n", min, maxI[n-k], max);*/ for (I=min; I<=max; I++) { /* index over I */ if (logSP[n-k][I] > LOGZERO) { /*printf("i %d old: %d %d new: %d %d\n", i, n-k, I, n,I+IP[i][k]);*/ logSP[n][I+IP[i][k]] = LOGL_SUM(logSP[n][I+IP[i][k]], logP[i][k] + logSP[n-k][I]); } } /* get current minimum and maximum I in intermediate arrays */ minI[n] = MIN(minI[n], minI[n-k]+IP[i][k]); maxI[n] = MAX(maxI[n], maxI[n-k]+IP[i][k]); } if (n==N && i==A-1) break; /* all done */ } } /* compute range */ /*printf("minI[N] %d maxI[N] %d\n", minI[N], maxI[N]);*/ *range = maxI[N] - minI[N]; /* move to probability array with prob(offset) in position 0 */ *offset += minI[N]; /* prob[0] = prob(offset) */ Resize(prob, *range+2, double); for (I=minI[N]; I<=maxI[N]; I++) prob[I-minI[N]] = logSP[N][I]; /*fprintf(stderr, "N= %d range= %d offset= %d alpha= %f\n", N, *range, *offset, *alpha);*/ /* free up space */ free_2array(IP, A); free_2array(logP, A); free_2array(logSP, N+1); myfree(minI); myfree(maxI); return prob;} /* llr_distr *//******************************************************************************//* sum_distr Compute the distribution of the sum of two integer-valued random variables whose ranges are [0..r1] and [0..r2], respectively, given the log of their distributions.*//******************************************************************************/double *sum_distr( double *d1, /* (log) distribution of RV1 */ int r1, /* range of RV1 */ double *d2, /* (log) distribution of RV2 */ int r2, /* range of RV2 */ int *r_sum /* range of sum of RV1 and RV2 */){ int i, j, k; int range = r1 + r2; /* potential range of sum */ double *d_sum = NULL; /* distribution of sum */ Resize(d_sum, range+1, double); /* space for distribution */ for (i=0; i<=range; i++) { /* value of sum */ d_sum[i] = LOGZERO; } for (i=0; i<=r1; i++) { /* range of RV1 */ if (d1[i]==LOGZERO) continue; for (j=0, k=i; j<=r2; j++, k++) { /* range of RV2 */ if (d2[j]==LOGZERO) continue; d_sum[k] = LOGL_SUM(d_sum[k], d1[i]+d2[j]); } /* RV2 */ } /* RV1 */ for (i=range; i>=0; i--) { /* value of sum */ if (d_sum[i] > LOGZERO) break; } *r_sum = i; /* non-zero range */ return d_sum;} /* sum_distr *//******************************************************************************//* cdf Compute (log) 1-CDF of an integer-valued (log) distribution. Smooths the CDF by linear interpolation so that adjacent positions in the table will have different values if possible.*//******************************************************************************/static double *cdf( double *d, /* integer valued distribution */ int r /* range [0..r] */){ double *cdf=NULL, slope=0; int I, i, j, k; Resize(cdf, r+1, double); cdf[r] = d[r]; for (I=r-1; I>=0; I--) { cdf[I] = LOGL_SUM(cdf[I+1], d[I]); }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?