llr.c
来自「EM算法的改进」· C语言 代码 · 共 747 行 · 第 1/2 页
C
747 行
/* smooth cdf by linear interpolation in logs */ for (i=r; i>0; i=j) { for (j=i-1; j>0 && d[j]==LOGZERO; j--) ; /* find next non-zero p */ if (i!=j) slope = (cdf[i]-cdf[j])/(i-j); /* slope */ for (k=j+1; k<i; k++) cdf[k] = cdf[j] + (k-j)*slope; } return cdf;} /* cdf *//******************************************************************************//* get_scaled_llr Given a set of observed counts (o_i) and expected counts (e_i) and a scale factor alpha, return the scaled information content: Sum_i=1..A NINT(alpha * o_i log(o_i/e_i))*//******************************************************************************/int get_scaled_llr( double alpha, /* scale factor */ int A, /* size of alphabet */ double *o, /* observed counts */ double *e /* expected counts */) { int i; int llr = 0; for (i=0; i<A; i++) { if (o[i]) { llr += NINT(alpha * o[i] * log(o[i]/e[i])); } } return llr >= 0 ? llr : 0;} /* get_scaled_llr *//******************************************************************************//* brute Calculate the distribution of scaled LLR using a brute force algorithm (for small values of n only). Used for checking get_llr.*//******************************************************************************/void brute1( int A, /* dimension of discrete distribution */ double *dd, /* discrete distribution */ int N, /* number of samples of distribution */ double alpha, /* scale factor for llr */ double *o, double *e, double *p, double prob){ int i; int llr; if (N==0) { llr = get_scaled_llr(alpha, A, o, e); p[llr] += prob; } else { for (i=0; i<A; i++) { o[i]++; brute1(A, dd, N-1, alpha, o, e, p, prob*dd[i]); o[i]--; } } } /* brute1 */double *brute( int A, /* dimension of discrete distribution */ double *dd, /* discrete distribution */ int N, /* number of samples of distribution */ int range, /* desired range for scaled LLR */ double alpha /* scale factor for llr */){ int i; double *o=NULL; double *e=NULL; double *p=NULL; Resize(o, A, double); Resize(e, A, double); Resize(p, range+2, double); for (i=0; i<A; i++) { o[i] = 0; e[i] = dd[i] * N; } for (i=0; i<range+2; i++) { p[i] = 0; } brute1(A, dd, N, alpha, o, e, p, 1.0); return p;} /* brute *//******************************************************************************//* get_llr_pv Get the log p-value of a weighted log-likelihood ratio given: llr log-likelihood ratio n (weighted) number of sequences in the alignment w width of the alignment range desired number of values in range frac speedup factor alength number of letters in alphabet dd[] alphabet frequency distribution Computes the distributions on the fly only as needed. Range should be the same each time get_llr_pv is called. Call with integral n and w=0 to allocate table in parallel mode. Returns the p-value.*//******************************************************************************/extern double get_llr_pv( double llr, /* log likelihood ratio */ double n, /* wgtd number sequences in alignment */ int w, /* width of alignment */ int range, /* desired range for resolution */ double frac, /* speedup factor */ int alength, /* length of alphabet */ double *dd /* alphabet frequency distribution */){ int i, N; double I; /* weighted log likelihood ratio */ int I0, I1; /* position of llr in table */ double logpv; /* log pvalue */ double n0, n1; /* floor and ceil of n */ double alpha; /* scale factor used */ if (n<=1) return 0.0; /* only one site p-value = 1.0 */ /* return geometric mean if N is not integral */ if ( (n0=floor(n)) != (n1=ceil(n)) ) return ( (n1-n)*get_llr_pv(llr, n0, w, range, frac, alength, dd) + (n-n0)*get_llr_pv(llr, n1, w, range, frac, alength, dd) ); N = (int) n; /* make n an integer */ /* N larger than any previous N? */ if (ndistrs < N) { /* first call */ Resize(distrs, N+1, DISTR); /* create array of distributions */ for (i=ndistrs+1; i<=N; i++) { distrs[i].w = 0; distrs[i].offset = NULL; distrs[i].range = NULL; distrs[i].d = NULL; distrs[i].cdf = NULL; distrs[i].mean = 0; } ndistrs = N; /* set maximum N */ } /* done if w == 0 */ if (w == 0) return 0.0; /* w larger than any previous w for this N? */ if (distrs[N].w < w) { /* larger w */ Resize(distrs[N].d, w+1, double *); Resize(distrs[N].cdf, w+1, double *); Resize(distrs[N].offset, w+1, int); Resize(distrs[N].range, w+1, int); /* first time? */ if (distrs[N].w == 0) { /* get the w=1 distribution */ distrs[N].d[1] = llr_distr(alength, dd, N, range, frac, &distrs[N].alpha, &distrs[N].offset[1], &distrs[N].range[1]); /* get mean of LLR for w = 1 */ for (i=0; i<=distrs[N].range[1]; i++) { double llr = (i + distrs[N].offset[1]) / distrs[N].alpha; distrs[N].mean += exp(distrs[N].d[1][i])*llr; } distrs[N].cdf[1] = cdf(distrs[N].d[1], distrs[N].range[1]); distrs[N].w = 1; } /* first time */ /* get the distributions for widths oldw .. maxw */ /*fprintf(stderr, "enter cdf N= %d w= %d oldw= %d\n", N, w, distrs[N].w);*/ for (i=distrs[N].w+1; i<=w; i++) { /* width */ distrs[N].d[i] = sum_distr( distrs[N].d[i-1], distrs[N].range[i-1], distrs[N].d[1], distrs[N].range[1], &distrs[N].range[i] ); distrs[N].offset[i] = distrs[N].offset[i-1] + distrs[N].offset[1]; distrs[N].cdf[i] = cdf(distrs[N].d[i], distrs[N].range[i]); } /* width */ /*fprintf(stderr, "leave cdf\n");*/ distrs[N].w = w; /* set maximum w */ } /* new w */ /* get position in table */ alpha = distrs[N].alpha; I = (alpha * llr) - distrs[N].offset[w]; /* position in table */ I0 = (int) I; /* floor of position */ I1 = I0 + 1; /* ceil. of position */ if (I < 0) { /* lower bound I */ logpv = distrs[N].cdf[w][0]; } else if (I0 >= distrs[N].range[w]) { /* upper bound I */ logpv = distrs[N].cdf[w][distrs[N].range[w]]; } else { /* lin. interpolate */ logpv = distrs[N].cdf[w][I0] + (I-I0)*(distrs[N].cdf[w][I1]-distrs[N].cdf[w][I0]); } /* return log p-value */ return logpv;} /* get_llr_pv *//******************************************************************************//* get_llr_mean Returns the mean of the LLR distribution for w=1. Assumes get_llr_pv has already been called.*//******************************************************************************/extern double get_llr_mean( double n /* wgt. number sequences in alignment */){ return(distrs[NINT(n)].mean);} /* get_llr_mean */#ifdef SO/************************************************************************//* llr <alength> <freq_file> <N> Compute the probability distribution for the scaled information content (LLR) of N letters. Only the most probable <frac> LLR values are used to speed the calculation.*//************************************************************************/extern int main( int argc, char** argv){ int i, j, I, n; int A=0; /* length of alphabet */ char *ffreq=NULL; /* name of frequency file */ FILE *ffile; /* frequency file */ double *dd = NULL; /* discrete distribution */ char *alphabet = NULL; /* alphabet */ int N=2, minN=0, maxN=0; /* number of observations */ int w=1; /* maximum width of alignment */ double frac = 1; /* fraction of scores to use */ int range = 100; /* desired range per N */ double *p2=NULL; /* LLR statistic distribution */ double *pv2=NULL; int brute_maxN = 0; /* maximum N for checking llr_distr */ char *line; /* input buffer line */ double alpha; /* scale factor used */ i = 1; argv[0] = "llr"; DO_STANDARD_COMMAND_LINE(2, USAGE(<alength> <freq_file> [options]); USAGE(\n\t<alength>\tnumber of letters in alphabet); USAGE(\t<freq_file>\tfile containing alphabet and letter frequencies); USAGE(\t\t\twhere each line is: <letter> <freq>); NON_SWITCH(1,\r, switch (i++) { case 1: A = atoi(_OPTION_); break; case 2: ffreq = _OPTION_; break; default: COMMAND_LINE_ERROR; } ); DATA_OPTN(1, N, <N>, \tnumber of observations; default=2, N = atof(_OPTION_)); DATA_OPTN(1, minN, <minN>, \tminimum number of observations; default=2, minN = atoi(_OPTION_)); DATA_OPTN(1, maxN, <maxN>, \tmaximum number of observations; default=2, maxN = atoi(_OPTION_)); DATA_OPTN(1, w, <w>, \tmaximum width of alignment; default=1, w = atoi(_OPTION_)); DATA_OPTN(1, frac, <frac>, \tfraction of possible scores to use; default=1, frac = atoi(_OPTION_)); DATA_OPTN(1, range, <range>, scale llr to have <range> values; default=100, range = atoi(_OPTION_)); USAGE(\n\tCompute the probability distribution for the log-likelihood); USAGE(\tratio (LLR) of N letters. If a range for N is given); USAGE(\tthe distributions for each value in the range are computed.); USAGE(\tIf <frac> is given only the most probable <frac> fraction of); USAGE(\tvalues are used to speed the calculation.); USAGE(\n\tCopyright); USAGE(\t(2000-2006) The Regents of the University of California); USAGE(\tAll Rights Reserved.); USAGE(\tAuthor: Timothy L. Bailey); ); init_log(); init_exp(); /* set up range for N */ if (minN==0) minN = N; if (maxN==0) maxN = N; if (minN > maxN) { fprintf(stderr, "You must specify -maxN larger than -minN.\n"); exit(1); } /* get alphabet and frequencies */ ffile = fopen(ffreq, "r"); Resize(dd, A, double); Resize(alphabet, A, char); i = 0; while ((line = getline2(ffile))) { char c; double f; if (line[0] == '#') continue; /* skip comments */ if (sscanf(line, "%c %lf", &c, &f) != 2) break; alphabet[i] = c; dd[i++] = f; myfree(line); } if (i != A) { fprintf(stderr, "%d frequencies were found; %d were expected.\n", i,A); for (j=0; j<i; j++) printf("%c %f\n", alphabet[j], dd[j]); exit(1); } printf("# Alphabet frequency distribution:\n"); for (j=0; j<A; j++) printf("# %c %f\n", alphabet[j], dd[j]); /* get distribution and print it */ for (n=minN; n<=maxN; n++) { /* number of observations */ printf("# desired range %d\n", range); /* get the distribution set */ get_llr_pv(0, (double) n, w, range, frac, A, d); printf("# alpha %f\n", alpha); /* get the check distribution */ if (n <= brute_maxN) { p2 = brute(A, dd, n, range, alpha); pv2 = cdf(p2, range); } /* print results */ printf("# N I llr p 1-cdf\n"); for (I=0; I<=distrs[n].range[w]; I++) { /* LLR */ double m1, e1, m2, e2; if (distrs[n].d[w][I] == LOGZERO) { m1 = e1 = 0; } else { exp10_logx(distrs[n].d[w][I]/log(10.0), m1, e1, 1); } if (distrs[n].cdf[w][I] == LOGZERO) { m2 = e2 = 0; } else { exp10_logx(distrs[n].cdf[w][I]/log(10.0), m2, e2, 1); } printf("%3d %3d %5.1f %3.1fe%+05.0f %3.1fe%+05.0f\n", n, I, (distrs[n].offset[w]+I)/distrs[n].alpha, m1, e1, m2, e2); } /* LLR */ } /* n */ return 0;}#endif /* SO */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?