llr.c

来自「EM算法的改进」· C语言 代码 · 共 747 行 · 第 1/2 页

C
747
字号
/* * $Id: llr.c 1339 2006-09-21 19:46:28Z tbailey $ *  * $Log$ * Revision 1.2  2006/03/08 20:50:11  nadya * merge chamges from v3_5_2 branch * * Revision 1.1.1.1.4.2  2006/01/26 09:16:27  tbailey * Rename local function getline() to getline2() to avoid conflict with * function defined in stdio.h. * * Revision 1.1.1.1.4.1  2006/01/24 20:44:08  nadya * update copyright * * Revision 1.1.1.1  2005/07/29 00:22:18  nadya * Importing from meme-3.0.14, and adding configure/make * *//*************************************************************************	Copyright							**	(1999-2006) The Regents of the University of California.	**	All Rights Reserved.						**	Author: Timothy L. Bailey*									**	Permission to use, copy, modify, and distribute any part of 	**	this software for educational, research and non-profit purposes,**	without fee, and without a written agreement is hereby granted, **	provided that the above copyright notice, this paragraph and 	**	the following three paragraphs appear in all copies.		**									**	Those desiring to incorporate this software into commercial 	**	products or use for commercial purposes should contact the 	**	Technology Transfer Office, University of California, San Diego,**	9500 Gilman Drive, La Jolla, California, 92093-0910, 		**	Ph: (619) 534 5815.						**									**	IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO 	**	ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR 	**	CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF 	**	THE USE OF THIS SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA 	**	HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 		**									**	THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE **	UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE 		**	MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.  **	THE UNIVERSITY OF CALIFORNIA MAKES NO REPRESENTATIONS AND 	**	EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, **	INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 	**	MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT 	**	THE USE OF THE MATERIAL WILL NOT INFRINGE ANY PATENT, 		**	TRADEMARK OR OTHER RIGHTS.  					*************************************************************************//***********************************************************************//*	Information Content Statistics Routines*//***********************************************************************/#ifdef SO#define DEFINE_GLOBALS#endif#include <llr.h>#include <macros.h>#include <general.h>#include <io.h>#include <stdio.h>#include <clock.h>#include <logs.h>#include <message.h>/* probability distribution */ typedef struct distr {  int w;			/* maximum width of alignment */  double alpha;			/* scale factor for distribution */  int *range;			/* range of distributions w = [1..w] */  int *offset;			/* offset of distributions w = [1..w] */  double **d;			/* array of (log) distributions w = [1..w] */  double **cdf;			/* array of llr (log) 1 - CDFs w = [1..w] */  double mean;			/* mean of llr for w = 1 */} DISTR;static int ndistrs = -1;	/* largest N for which distr known */static DISTR *distrs = NULL;	/* array of distributions for N = [1..ndistrs]*//************************************************************************//*	init_llr_pv_tables	Initialize the single-column log likelihood ratio p-value tables.        Note: Palindromes have effectively double the apparent sites but half         the width.*//************************************************************************/extern void init_llr_pv_tables(  int min,				/* minimum number of sites */  int max,				/* maximum number of sites */  int alength,				/* alphabet length */  double *back,				/* background frequencies */  BOOLEAN pal				/* sites are palindromes */){  int nsites;				/* number of sites */  /* set effective number of sites to double if pal */  if (pal) { min *= 2; max *= 2; }  if (!NO_STATUS)  fprintf(stderr,    "Initializing the motif probability tables for %d to %d sites...\n",      min, max);  /* make sure the distr table gets initialized on all nodes */  (void) get_llr_pv(0, 1, 1, LLR_RANGE, 1.0, alength, back);  for (nsites=min; nsites<=max; nsites += pal ? 2 : 1) {   /* nsites */    /* allocate space for table */    (void) get_llr_pv(0, nsites, 0, LLR_RANGE, 1.0, alength, back);    if (!load_balance_llr(nsites, pal)) {      continue;	/* for parallel */    }     /* create table */    if (!NO_STATUS) { fprintf(stderr, "nsites = %d\r", nsites); }    (void) get_llr_pv(0, nsites, 1, LLR_RANGE, 1.0, alength, back);  } /* nsites */  broadcast_llr(min, max, pal);		/* for parallel; collect the tables */#ifdef DEBUG  /* print results */  for (n=min; n<=max; n++) {    int I;    int w = 1;    printf("# N    I    llr         1-cdf\n");    for (I=0; I<=distrs[n].range[w]; I++) {		/* LLR */      double m2, e2;      if (distrs[n].cdf[w][I] == LOGZERO) {        m2 = e2 = 0;      } else {        exp10_logx(distrs[n].cdf[w][I]/log(10.0), m2, e2, 1);      }      printf("%3d %3d %5.1f %3.1fe%+05.0f\n",        n, I, (distrs[n].offset[w]+I)/distrs[n].alpha, m2, e2);    } /* LLR */  }#endif  if (!NO_STATUS)fprintf(stderr, "\nDone initializing\n");} /* init_llr_pv_tables *//******************************************************************************//*	llr_distr	Compute the probability distribution of the log-likelihood	ratio (llr) statistic for a discrete distribution.	Because of roundoff, LLR may be as small as -1 (instead of 0).	Returns an array p where		p[i] = log(Pr(LLR=i))*//******************************************************************************/double *llr_distr(  int A,				/* dimension of discrete distribution */  double *dd,				/* discrete distribution */  int N,				/* number of samples */  int desired_range,			/* desired range for scaled LLR */  double frac,				/* fraction of scores to use */  double *alpha,			/* scale factor for scaled LLR */  int *offset,				/* prob[0] = prob(offset) */  int *range				/* range for scaled LLR */){  int i; 				/* index over alphabet */  int n;				/* index over samples */  int k;				/* other index */  int I;				/* LLR */  double dd_sum;			/* sum of dd */  int **IP;				/* I'_i[n] */  int *minI=NULL;			/* minimum intermediate value of I */  int *maxI=NULL;			/* maximum intermediate value of I */  int Irange;				/* maxI-minI+1 */  double logNfact;			/* log N! */  double **logP;			/* log P_i[n] */  double **logSP;			/* log script_P[# samples][LLR] */  double *prob=NULL;			/* final probability distribution */  double min, max, min_dd;  /* create space for IP, P, minI and maxI */  create_2array(IP, int, A, N+1);  create_2array(logP, double, A, N+1);  Resize(minI, N+1, int);  Resize(maxI, N+1, int);  /* make sure distribution sums to 1.0 and has no 0's */  for (i=dd_sum=0; i<A; i++) dd_sum += dd[i] + EPSILON;  for (i=0; i<A; i++) dd[i] = (dd[i]+EPSILON)/dd_sum;  /* compute N! */  logNfact = 0;  for (i=2; i<=N; i++) logNfact += log(i);   /* get estimates of minimum and miximum values of llr */  for (i=0, min_dd=1; i<A; i++) min_dd = MIN(min_dd, dd[i]);  max = NINT(-N * log(min_dd));  for (i=min=0; i<A; i++) min += dd[i]*N*(log(dd[i]) - log(dd[i]));  min = NINT(min);  /*printf("min = %f max = %f\n", min, max);*/  /* set alpha to achieve the desired range */  *alpha = desired_range/((max-min));  /* *alpha = NINT(((int)desired_range)/((max-min)));  if (*alpha < 1) *alpha = 1;*/  /*fprintf(stderr, "range %d max %f min %f alpha = %f\n",desired_range, max, min, *alpha);*/  /* compute I', P, minI and maxI */   for (n=0; n<=N; n++) minI[n] = maxI[n] = 0;  for (i=0; i<A; i++) {				/* index over alphabet */    double logdd = LOG(dd[i]);			/* log(dd[i]) */    IP[i][0] = 0; 				/* I'_i(0) */    logP[i][0] = 0;				/* log P_i(0) */    for (n=1; n<=N; n++) {			/* index over samples */      IP[i][n] = NINT(*alpha*n*log(n/(N*dd[i]))); 	/* I'_i(n) */      logP[i][n] = logP[i][n-1] + logdd - log(n);	/* log P_i(n) */      for (k=1; k<=n; k++) {			/* index over samples of new */	minI[n] = MIN(minI[n], minI[n-k] + IP[i][k]);	maxI[n] = MAX(maxI[n], maxI[n-k] + IP[i][k]);      }    }  }  /* get overall minI and maxI */  for (n=1; n<=N; n++) {    /*printf("minI[%d] %d maxI[%d] %d\n", n, minI[n], n, maxI[n]);*/    minI[0] = MIN(minI[0], minI[n]);		/* min for intermediates */    maxI[0] = MAX(maxI[0], maxI[n]);		/* max for intermediates */    minI[n] = LOGZEROI;    maxI[n] = 0;  }  Irange = maxI[0] - minI[0] + 2;  *offset = minI[0] - 1;			/* I offset: I=-1 is array 0 */  /*printf("minI %d maxI %d Irange %d\n", minI[0], maxI[0], Irange);*/  minI[0] = LOGZEROI;  maxI[0] = 0;  /* create script_P arrays with enough space for intermediate calculations */  create_2array(logSP, double, N+1, Irange+1);    /* clear intermediate probability array */  for (n=0; n<=N; n++) for(I=0; I<Irange; I++) logSP[n][I] = LOGZERO;  /* init probability array for first letter in alphabet */  for (n=0; n<=N; n++) {    I = IP[0][n] - *offset;			/* offset I */    logSP[n][I] = logNfact + logP[0][n];	/* init */    minI[n] = maxI[n] = I;  }  /* compute probabilities recursively */  for (i=1; i<A; i++) {			/* index over (rest of) alphabet */    for (n=N; n>=0; n--) {		/* index over samples */      for (k=1; k<=n; k++) {		/* index over samples of new letter */        int min = minI[n-k];        int max = MAX(min, maxI[n-k] - (1-frac)*(maxI[n-k]-minI[n-k]+1));        /*printf("min %d maxI %d max %d\n", min, maxI[n-k], max);*/        for (I=min; I<=max; I++) {	/* index over I */          if (logSP[n-k][I] > LOGZERO) {	    /*printf("i %d old: %d %d new: %d %d\n", i, n-k, I, n,I+IP[i][k]);*/            logSP[n][I+IP[i][k]] =               LOGL_SUM(logSP[n][I+IP[i][k]], logP[i][k] + logSP[n-k][I]);          }	}	/* get current minimum and maximum I in intermediate arrays */	minI[n] = MIN(minI[n], minI[n-k]+IP[i][k]);	maxI[n] = MAX(maxI[n], maxI[n-k]+IP[i][k]);      }      if (n==N && i==A-1) break;	/* all done */    }  }  /* compute range */  /*printf("minI[N] %d maxI[N] %d\n", minI[N], maxI[N]);*/  *range = maxI[N] - minI[N];   /* move to probability array with prob(offset) in position 0 */  *offset += minI[N];			/* prob[0] = prob(offset) */  Resize(prob, *range+2, double);  for (I=minI[N]; I<=maxI[N]; I++) prob[I-minI[N]] = logSP[N][I];  /*fprintf(stderr, "N= %d range= %d offset= %d alpha= %f\n", N, *range,     *offset, *alpha);*/        /* free up space */  free_2array(IP, A);  free_2array(logP, A);  free_2array(logSP, N+1);  myfree(minI);  myfree(maxI);  return prob;} /* llr_distr *//******************************************************************************//*	sum_distr	Compute the distribution of the sum of two integer-valued	random variables whose ranges are [0..r1] and [0..r2], 	respectively, given the log of their distributions.*//******************************************************************************/double *sum_distr(  double *d1,				/* (log) distribution of RV1 */  int r1,				/* range of RV1 */  double *d2,				/* (log) distribution of RV2 */  int r2, 				/* range of RV2 */  int *r_sum				/* range of sum of RV1 and RV2 */){  int i, j, k;  int range = r1 + r2;			/* potential range of sum */  double *d_sum = NULL;			/* distribution of sum */  Resize(d_sum, range+1, double);	/* space for distribution */  for (i=0; i<=range; i++) {		/* value of sum */    d_sum[i] = LOGZERO;  }  for (i=0; i<=r1; i++) {		/* range of RV1 */    if (d1[i]==LOGZERO) continue;    for (j=0, k=i; j<=r2; j++, k++) {	/* range of RV2 */      if (d2[j]==LOGZERO) continue;      d_sum[k] = LOGL_SUM(d_sum[k], d1[i]+d2[j]);    } /* RV2 */  } /* RV1 */  for (i=range; i>=0; i--) {		/* value of sum */    if (d_sum[i] > LOGZERO) break;  }  *r_sum = i;				/* non-zero range */  return d_sum;} /* sum_distr *//******************************************************************************//*	cdf	Compute (log) 1-CDF of an integer-valued (log) distribution.	Smooths the CDF by linear interpolation so that adjacent positions	in the table will have different values if possible.*//******************************************************************************/static double *cdf(  double *d,				/* integer valued distribution */  int r					/* range [0..r] */){  double *cdf=NULL, slope=0;  int I, i, j, k;  Resize(cdf, r+1, double);  cdf[r] = d[r];  for (I=r-1; I>=0; I--) {    cdf[I] = LOGL_SUM(cdf[I+1], d[I]);  }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?