llr.c

来自「EM算法的改进」· C语言 代码 · 共 747 行 · 第 1/2 页

C
747
字号
  /* smooth cdf by linear interpolation in logs */  for (i=r; i>0; i=j) {    for (j=i-1; j>0 && d[j]==LOGZERO; j--) ;	/* find next non-zero p */     if (i!=j) slope = (cdf[i]-cdf[j])/(i-j);	/* slope */    for (k=j+1; k<i; k++) cdf[k] = cdf[j] + (k-j)*slope;  }  return cdf;} /* cdf *//******************************************************************************//*	get_scaled_llr	Given a set of observed counts (o_i) and expected counts (e_i)	and a scale factor alpha, return the scaled information content: 		Sum_i=1..A NINT(alpha * o_i log(o_i/e_i))*//******************************************************************************/int get_scaled_llr(  double alpha,			/* scale factor */  int A,			/* size of alphabet */  double *o,			/* observed counts */  double *e 			/* expected counts */) {  int i;  int llr = 0;  for (i=0; i<A; i++) {    if (o[i]) {      llr += NINT(alpha * o[i] * log(o[i]/e[i]));    }  }  return llr >= 0 ? llr : 0;} /* get_scaled_llr *//******************************************************************************//*	brute	Calculate the distribution of scaled LLR using a brute force	algorithm (for small values of n only).  Used for checking	get_llr.*//******************************************************************************/void brute1(  int A,				/* dimension of discrete distribution */  double *dd,				/* discrete distribution */  int N,				/* number of samples of distribution */  double alpha,				/* scale factor for llr */  double *o,  double *e,  double *p,  double prob){  int i;  int llr;  if (N==0) {    llr = get_scaled_llr(alpha, A, o, e);    p[llr] += prob;  } else {    for (i=0; i<A; i++) {      o[i]++;      brute1(A, dd, N-1, alpha, o, e, p, prob*dd[i]);      o[i]--;    }  } } /* brute1 */double *brute(  int A,				/* dimension of discrete distribution */  double *dd,				/* discrete distribution */  int N,				/* number of samples of distribution */  int range,				/* desired range for scaled LLR */  double alpha  			/* scale factor for llr */){  int i;  double *o=NULL;  double *e=NULL;  double *p=NULL;  Resize(o, A, double);  Resize(e, A, double);  Resize(p, range+2, double);  for (i=0; i<A; i++) {     o[i] = 0;    e[i] = dd[i] * N;  }  for (i=0; i<range+2; i++) {     p[i] = 0;  }  brute1(A, dd, N, alpha, o, e, p, 1.0);   return p;} /* brute *//******************************************************************************//*	get_llr_pv	Get the log p-value of a weighted log-likelihood ratio given:		llr	log-likelihood ratio		n	(weighted) number of sequences in the alignment		w	width of the alignment		range 	desired number of values in range		frac	speedup factor		alength	number of letters in alphabet		dd[]	alphabet frequency distribution	Computes the distributions on the fly only as needed.	Range should be the same each time get_llr_pv is called.	Call with integral n and w=0 to allocate table in parallel mode.	Returns the p-value.*//******************************************************************************/extern double get_llr_pv(  double llr,				/* log likelihood ratio */  double n,				/* wgtd number sequences in alignment */  int w,				/* width of alignment */  int range,				/* desired range for resolution */  double frac,				/* speedup factor */  int alength,				/* length of alphabet */  double *dd 				/* alphabet frequency distribution */){  int i, N;  double I;				/* weighted log likelihood ratio */  int I0, I1;				/* position of llr in table */  double logpv;				/* log pvalue */  double n0, n1;			/* floor and ceil of n */  double alpha;				/* scale factor used */  if (n<=1) return 0.0;			/* only one site p-value = 1.0 */  /* return geometric mean if N is not integral */  if ( (n0=floor(n)) != (n1=ceil(n)) )    return (       (n1-n)*get_llr_pv(llr, n0, w, range, frac, alength, dd) +      (n-n0)*get_llr_pv(llr, n1, w, range, frac, alength, dd)     );  N = (int) n;				/* make n an integer */  /* N larger than any previous N? */  if (ndistrs < N) {			/* first call */    Resize(distrs, N+1, DISTR);		/* create array of distributions */    for (i=ndistrs+1; i<=N; i++) {      distrs[i].w = 0;      distrs[i].offset = NULL;      distrs[i].range = NULL;      distrs[i].d = NULL;      distrs[i].cdf = NULL;      distrs[i].mean = 0;    }    ndistrs = N;				/* set maximum N */  }  /* done if w == 0 */  if (w == 0) return 0.0;  /* w larger than any previous w for this N? */  if (distrs[N].w < w) {			/* larger w */    Resize(distrs[N].d, w+1, double *);    Resize(distrs[N].cdf, w+1, double *);    Resize(distrs[N].offset, w+1, int);    Resize(distrs[N].range, w+1, int);    /* first time? */     if (distrs[N].w == 0) {			/* get the w=1 distribution */      distrs[N].d[1] = llr_distr(alength, dd, N, range, frac,        &distrs[N].alpha, &distrs[N].offset[1], &distrs[N].range[1]);      /* get mean of LLR for w = 1 */      for (i=0; i<=distrs[N].range[1]; i++) {        double llr = (i + distrs[N].offset[1]) / distrs[N].alpha;        distrs[N].mean += exp(distrs[N].d[1][i])*llr;      }      distrs[N].cdf[1] = cdf(distrs[N].d[1], distrs[N].range[1]);      distrs[N].w = 1;    } /* first time */    /* get the distributions for widths oldw .. maxw */    /*fprintf(stderr, "enter cdf N= %d w= %d oldw= %d\n", N, w, distrs[N].w);*/    for (i=distrs[N].w+1; i<=w; i++) {		/* width */      distrs[N].d[i] = sum_distr(        distrs[N].d[i-1],         distrs[N].range[i-1],         distrs[N].d[1],         distrs[N].range[1],         &distrs[N].range[i]      );      distrs[N].offset[i] = distrs[N].offset[i-1] + distrs[N].offset[1];      distrs[N].cdf[i] = cdf(distrs[N].d[i], distrs[N].range[i]);    } /* width */    /*fprintf(stderr, "leave cdf\n");*/    distrs[N].w = w; 				/* set maximum w */  } /* new w */    /* get position in table */  alpha = distrs[N].alpha;  I = (alpha * llr) - distrs[N].offset[w];		/* position in table */  I0 = (int) I;						/* floor of position */  I1 = I0 + 1;						/* ceil. of position */  if (I < 0) {						/* lower bound I */    logpv = distrs[N].cdf[w][0];  } else if (I0 >= distrs[N].range[w]) {		/* upper bound I */    logpv = distrs[N].cdf[w][distrs[N].range[w]];  } else {						/* lin. interpolate */    logpv =      distrs[N].cdf[w][I0] + (I-I0)*(distrs[N].cdf[w][I1]-distrs[N].cdf[w][I0]);  }  /* return log p-value */  return logpv;} /* get_llr_pv *//******************************************************************************//*	get_llr_mean	Returns the mean of the LLR distribution for w=1.	Assumes get_llr_pv has already been called.*//******************************************************************************/extern double get_llr_mean(  double n 				/* wgt. number sequences in alignment */){  return(distrs[NINT(n)].mean);} /* get_llr_mean */#ifdef SO/************************************************************************//*	llr <alength> <freq_file> <N>	Compute the probability distribution for the scaled information	content (LLR) of N letters.  Only the most probable <frac> LLR values	are used to speed the calculation.*//************************************************************************/extern int main(  int argc,  char** argv){  int i, j, I, n;  int A=0;				/* length of alphabet */  char *ffreq=NULL;			/* name of frequency file */  FILE *ffile;				/* frequency file */  double *dd = NULL;			/* discrete distribution */  char *alphabet = NULL;		/* alphabet */  int N=2, minN=0, maxN=0;		/* number of observations */  int w=1;				/* maximum width of alignment */  double frac = 1;			/* fraction of scores to use */  int range = 100;			/* desired range per N */  double *p2=NULL;			/* LLR statistic distribution */  double *pv2=NULL;  int brute_maxN = 0;			/* maximum N for checking llr_distr */  char *line;				/* input buffer line */  double alpha;				/* scale factor used */  i = 1;  argv[0] = "llr";  DO_STANDARD_COMMAND_LINE(2,    USAGE(<alength> <freq_file> [options]);    USAGE(\n\t<alength>\tnumber of letters in alphabet);     USAGE(\t<freq_file>\tfile containing alphabet and letter frequencies);    USAGE(\t\t\twhere each line is: <letter> <freq>);    NON_SWITCH(1,\r,      switch (i++) {        case 1: A = atoi(_OPTION_); break;        case 2: ffreq = _OPTION_; break;        default: COMMAND_LINE_ERROR;      }    );    DATA_OPTN(1, N, <N>, \tnumber of observations; default=2,      N = atof(_OPTION_));    DATA_OPTN(1, minN, <minN>, \tminimum number of observations; default=2,      minN = atoi(_OPTION_));    DATA_OPTN(1, maxN, <maxN>, \tmaximum number of observations; default=2,      maxN = atoi(_OPTION_));    DATA_OPTN(1, w, <w>, \tmaximum width of alignment; default=1,      w = atoi(_OPTION_));    DATA_OPTN(1, frac, <frac>, \tfraction of possible scores to use; default=1,      frac = atoi(_OPTION_));    DATA_OPTN(1, range, <range>, scale llr to have <range> values; default=100,      range = atoi(_OPTION_));    USAGE(\n\tCompute the probability distribution for the log-likelihood);    USAGE(\tratio (LLR) of N letters.  If a range for N is given);    USAGE(\tthe distributions for each value in the range are computed.);    USAGE(\tIf <frac> is given only the most probable <frac> fraction of);    USAGE(\tvalues are used to speed the calculation.);    USAGE(\n\tCopyright);    USAGE(\t(2000-2006) The Regents of the University of California);    USAGE(\tAll Rights Reserved.);    USAGE(\tAuthor: Timothy L. Bailey);  );  init_log();  init_exp();  /* set up range for N */  if (minN==0) minN = N;  if (maxN==0) maxN = N;  if (minN > maxN) {    fprintf(stderr, "You must specify -maxN larger than -minN.\n");    exit(1);  }  /* get alphabet and frequencies */  ffile = fopen(ffreq, "r");   Resize(dd, A, double);  Resize(alphabet, A, char);  i = 0;  while ((line = getline2(ffile))) {    char c;    double f;    if (line[0] == '#') continue;		/* skip comments */    if (sscanf(line, "%c %lf", &c, &f) != 2) break;    alphabet[i] = c;    dd[i++] = f;    myfree(line);  }  if (i != A) {    fprintf(stderr, "%d frequencies were found; %d were expected.\n", i,A);    for (j=0; j<i; j++) printf("%c %f\n", alphabet[j], dd[j]);    exit(1);  }  printf("# Alphabet frequency distribution:\n");  for (j=0; j<A; j++) printf("# %c %f\n", alphabet[j], dd[j]);  /* get distribution and print it */  for (n=minN; n<=maxN; n++) { 			/* number of observations */    printf("# desired range %d\n", range);    /* get the distribution set */    get_llr_pv(0, (double) n, w, range, frac, A, d);    printf("# alpha %f\n", alpha);    /* get the check distribution */    if (n <= brute_maxN) {       p2 = brute(A, dd, n, range, alpha);       pv2 = cdf(p2, range);    }    /* print results */    printf("# N    I    llr         p     1-cdf\n");    for (I=0; I<=distrs[n].range[w]; I++) {		/* LLR */      double m1, e1, m2, e2;      if (distrs[n].d[w][I] == LOGZERO) {        m1 = e1 = 0;      } else {        exp10_logx(distrs[n].d[w][I]/log(10.0), m1, e1, 1);      }      if (distrs[n].cdf[w][I] == LOGZERO) {        m2 = e2 = 0;      } else {        exp10_logx(distrs[n].cdf[w][I]/log(10.0), m2, e2, 1);      }      printf("%3d %3d %5.1f %3.1fe%+05.0f %3.1fe%+05.0f\n",        n, I, (distrs[n].offset[w]+I)/distrs[n].alpha, m1, e1, m2, e2);    } /* LLR */  } /* n */  return 0;}#endif /* SO */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?