📄 scaleswt.c
字号:
/* NOTE(review): fragment below is the tail of an initializer function whose
   opening (signature and first statements) lies before this chunk — presumably
   it resets the length/score histogram to sentinel extremes before scores are
   accumulated.  Left exactly as found; confirm against the full file. */
llen->max_score = -1; llen->min_score=10000; alloc_hist(llen);
  llen->zero_s = 0;
  llen->min_length = 10000; llen->max_length = 0;
}

/* addhist - record one (score, length) observation in the log-length histogram.
   Scores <= 0 or lengths below LENGTH_CUTOFF are not binned; they only bump the
   zero-score counter and force min_score to 0.  Scores are clipped at
   max_hscore before accumulation.  The bin index is round(LN_FACT*ln(length)),
   clamped to [0, llen->max]; per-bin count, sum of scores, and sum of squared
   scores are maintained for later regression. */
void
addhist(struct llen_str *llen, int score, int length, int max_hscore)
{
  int llength; double dscore;

  if ( score<=0 || length < LENGTH_CUTOFF) {
    llen->min_score = 0;
    llen->zero_s++;
    return ;
  }
  /* track observed score and length extremes */
  if (score < llen->min_score) llen->min_score = score;
  if (score > llen->max_score) llen->max_score = score;
  if (length > llen->max_length) llen->max_length = length;
  if (length < llen->min_length) llen->min_length = length;
  if (score > max_hscore) score = max_hscore;

  /* bin by scaled natural log of length, clamped to the histogram range */
  llength = (int)(LN_FACT*log((double)length)+0.5);
  if (llength < 0 ) llength = 0;
  if (llength > llen->max) llength = llen->max;
  llen->hist[llength]++;
  dscore = (double)score;
  llen->score_sums[llength] += dscore;
  llen->score2_sums[llength] += dscore * dscore;

  /* db->entries++; db->length += length; if (db->length > LONG_MAX) {db->carry++;db->length -= LONG_MAX;} */
}

/* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
/* inithistz - size and (re)initialize the z-score display histogram.
   mh is the requested number of display bins; histint is the z-score width of
   each bin and maxh the resulting bin count.  Allocates hist_a on first use
   (histflg records whether allocation succeeded); on later calls just zeroes
   the existing array. */
void
inithistz(int mh, struct hist_str *histp )
{
  int i;

  histp->min_hist = 20;
  histp->max_hist = 120;

  histp->histint = (int) ((double)(histp->max_hist - histp->min_hist + 2)/(double)mh+0.5);
  histp->maxh = (int) ((double)(histp->max_hist - histp->min_hist + 2)/(double)histp->histint+0.5);

  if (histp->hist_a==NULL) {
    if ((histp->hist_a=(int *)calloc((size_t)histp->maxh,sizeof(int)))== NULL) {
      fprintf(stderr," cannot allocate %d for histogram\n",histp->maxh);
      histp->histflg = 0;
    }
    else histp->histflg = 1;
  }
  else {
    for (i=0; i<histp->maxh; i++) histp->hist_a[i]=0;
  }
}

/* fasts/f will not show any histogram */
/* addhistz - intentionally empty: this scaling variant displays no histogram,
   but the entry point must exist for the shared statistics interface. */
void
addhistz(double zs, struct hist_str *histp)
{
}

/* prune_hist - exact inverse of addhist for a binned observation: remove one
   (score, length) entry from its log-length bin and decrement *entries.
   Uses the same clipping and binning as addhist so the same bin is targeted.
   Out-of-range observations (score <= 0 or length < LENGTH_CUTOFF) were never
   binned, so they are ignored here. */
void
prune_hist(struct llen_str *llen, int score, int length, int max_hscore, long *entries)
{
  int llength;
  double dscore;

  if (score <= 0 || length < LENGTH_CUTOFF) return;

  if (score > max_hscore) score = max_hscore;

  llength = (int)(LN_FACT*log((double)length)+0.5);
  if (llength < 0 ) llength = 0;
  if (llength > llen->max) llength = llen->max;
  llen->hist[llength]--;
  dscore = (double)score;
  llen->score_sums[llength] -= dscore;
  llen->score2_sums[llength] -= dscore * dscore;
  (*entries)--;
  /* if (length < db->length) db->length -= length; else {db->carry--; db->length += (LONG_MAX - (unsigned long)length);} */
}

/* fit_llen: no trimming
   (1) regress scores vs log(n) using weighted variance
   (2) calculate mean variance after length regression
*/
/* Results are written into *pr: rho/mu (slope/intercept of score vs scaled
   log-length, with standard errors rho_e/mu_e), mean_var and var_e (residual
   variance and its error), rho2/mu2 (regression of residual variance on
   log-length), and var_cutoff (minimum allowed predicted variance).
   Sets llen->fit_flag = 0 when too few bins (< 5), a degenerate system
   (delta <= 0.001), or a non-positive mean variance makes the fit unusable. */
void
fit_llen(struct llen_str *llen, struct rstat_str *pr)
{
  int j;
  int n;
  int n_size;
  double x, y2, u, z;
  double mean_x, mean_y, var_x, var_y, covar_xy;
  double mean_y2, covar_xy2, var_y2, dllj;

  double sum_x, sum_y, sum_x2, sum_xy, sum_v, delta, n_w;

/* now fit scores to best linear function of log(n), using
   simple linear regression */

  /* find the first occupied bin, then back up one.
     NOTE(review): if bin 0 is occupied this leaves llen->min == -1 and the
     loops below start at j = -1; presumably hist[] has a guard entry or bin 0
     is never occupied in practice — confirm against the full file. */
  for (llen->min=0; llen->min < llen->max; llen->min++)
    if (llen->hist[llen->min]) break;
  llen->min--;

  /* per-bin unbiased variance of scores, floored at 0.1; count usable bins */
  for (n_size=0,j = llen->min; j < llen->max; j++) {
    if (llen->hist[j] > 1) {
      dllj = (double)llen->hist[j];
      llen->score_var[j] = llen->score2_sums[j]/dllj
	- (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
      llen->score_var[j] /= (double)(llen->hist[j]-1);
      if (llen->score_var[j] <= 0.1 ) llen->score_var[j] = 0.1;
      n_size++;
    }
  }

  pr->nb_tot = n_size;

  /* weighted least squares: each bin weighted by count / variance */
  n_w = 0.0;
  sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
  for (j = llen->min; j < llen->max; j++)
    if (llen->hist[j] > 1) {
      x = j + 0.5;			/* bin center in log-length units */
      dllj = (double)llen->hist[j];
      n_w += dllj/llen->score_var[j];
      sum_x +=  dllj * x / llen->score_var[j] ;
      sum_y += llen->score_sums[j] / llen->score_var[j];
      sum_x2 +=  dllj * x * x /llen->score_var[j];
      sum_xy +=  x * llen->score_sums[j]/llen->score_var[j];
    }

  /* too few bins for a regression: fall back to a flat (rho = 0) fit */
  if (n_size < 5 ) {
    llen->fit_flag=0;
    pr->rho = 0;
    pr->mu = sum_y/n_w;
    return;
  }
  else {
    delta = n_w * sum_x2 - sum_x * sum_x;
    if (delta > 0.001) {
      pr->rho = (n_w * sum_xy - sum_x * sum_y)/delta;
      pr->rho_e = n_w/delta;
      pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
      pr->mu_e = sum_x2/delta;
    }
    else {	/* near-singular normal equations: flat fit */
      llen->fit_flag = 0;
      pr->rho = 0;
      pr->mu = sum_y/n_w;
      return;
    }
  }

  /* NOTE(review): this recomputation duplicates the delta/rho/mu assignments
     just above (without the error terms); kept as found. */
  delta = n_w * sum_x2 - sum_x * sum_x;
  pr->rho = (n_w * sum_xy - sum_x * sum_y)/delta;
  pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;

  /* unweighted moments over occupied bins, for the variance regression */
  n = 0;
  mean_x = mean_y = mean_y2 = 0.0;
  var_x = var_y = 0.0;
  covar_xy = covar_xy2 = 0.0;

  for (j = llen->min; j <= llen->max; j++)
    if (llen->hist[j] > 1 ) {
      n += llen->hist[j];
      x = (double)j + 0.5;
      mean_x += (double)llen->hist[j] * x;
      mean_y += llen->score_sums[j];
      var_x += (double)llen->hist[j] * x * x;
      var_y += llen->score2_sums[j];
      covar_xy += x * llen->score_sums[j];
    }
  mean_x /= n; mean_y /= n;
  var_x = var_x / n - mean_x * mean_x;
  var_y = var_y / n - mean_y * mean_y;
  covar_xy = covar_xy / n - mean_x * mean_y;
/* pr->rho = covar_xy / var_x;
   pr->mu = mean_y - pr->rho * mean_x;
*/
  /* per-bin sum of squared residuals about the fitted line u = rho*x + mu */
  mean_y2 = covar_xy2 = var_y2 = 0.0;
  for (j = llen->min; j <= llen->max; j++)
    if (llen->hist[j] > 1) {
      x = (double)j + 0.5;
      u = pr->rho * x + pr->mu;
      y2 = llen->score2_sums[j] - 2.0 * llen->score_sums[j] * u + llen->hist[j] * u * u;
/*    dllj = (double)llen->hist[j];
      fprintf(stderr,"%.2f\t%d\t%g\t%g\n",x/LN_FACT,llen->hist[j],
	llen->score_sums[j]/dllj,y2/dllj);
*/
      mean_y2 += y2;
      var_y2 += y2 * y2;
      covar_xy2 += x * y2;
      /* fprintf(stderr,"%6.1f %4d %8d %8d %7.2f %8.2f\n",
	 x,llen->hist[j],llen->score_sums[j],llen->score2_sums[j],u,y2); */
    }

  pr->mean_var = mean_y2 /= (double)n;
  covar_xy2 = covar_xy2 / (double)n - mean_x * mean_y2;

  /* degenerate variance: disable fit and fall back to max(mu, 1.0) */
  if (pr->mean_var <= 0.01) {
    llen->fit_flag = 0;
    pr->mean_var = (pr->mu > 1.0) ? pr->mu: 1.0;
  }

  /* fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var %.4f\n",
     pr->rho*LN_FACT,pr->mu,pr->mean_var); */

  if (n > 1) pr->var_e = (var_y2/n - mean_y2 * mean_y2)/(n-1);
  else pr->var_e = 0.0;

  /* second regression: residual variance vs log-length */
  if (llen->fit_flag) {
    pr->rho2 = covar_xy2 / var_x;
    pr->mu2 = pr->mean_var - pr->rho2 * mean_x;
  }
  else {
    pr->rho2 = 0;
    pr->mu2 = pr->mean_var;
  }

  /* choose the length z at which the predicted variance bottoms out, then
     floor the variance prediction at that point (var_cutoff) */
  if (pr->rho2 < 0.0 )
    z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ?
      llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
  else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
  if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;

  pr->var_cutoff = pr->rho2 * LN_FACT*log(z) + pr->mu2;
}

/* fit_llens: trim high variance bins
   (1) regress scores vs log(n) using weighted variance
   (2) regress residuals vs log(n)
   (3) remove high variance bins
   (4) calculate mean variance after length regression
*/
/* After zeroing every bin whose residual s.d. exceeds 3x the mean s.d.
   (counting trimmed bins/entries in pr->nb_trimmed / pr->n1_trimmed),
   delegates the final fit to fit_llen() on the trimmed histogram. */
void
fit_llens(struct llen_str *llen, struct rstat_str *pr)
{
  int j;
  int n, n_u2;
  double x, y, y2, u, u2, v, z;
  double mean_x, mean_y, var_x, var_y, covar_xy;
  double mean_y2, covar_xy2;
  double mean_u2, mean_3u2, dllj;
  double sum_x, sum_y, sum_x2, sum_xy, sum_v, delta, n_w;

/* now fit scores to best linear function of log(n), using
   simple linear regression */

  /* locate first occupied bin (same -1 caveat as in fit_llen) */
  for (llen->min=0; llen->min < llen->max; llen->min++)
    if (llen->hist[llen->min]) break;
  llen->min--;

  /* per-bin unbiased variance, floored at 1.0 here (0.1 in fit_llen) */
  for (j = llen->min; j < llen->max; j++) {
    if (llen->hist[j] > 1) {
      dllj = (double)llen->hist[j];
      llen->score_var[j] = (double)llen->score2_sums[j]/dllj
	- (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
      llen->score_var[j] /= (double)(llen->hist[j]-1);
      if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
    }
  }

  /* weighted least squares of score vs log-length */
  n_w = 0.0;
  sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
  for (j = llen->min; j < llen->max; j++)
    if (llen->hist[j] > 1) {
      x = j + 0.5;
      dllj = (double)llen->hist[j];
      n_w += dllj/llen->score_var[j];
      sum_x +=  dllj * x / llen->score_var[j] ;
      sum_y += llen->score_sums[j] / llen->score_var[j];
      sum_x2 +=  dllj * x * x /llen->score_var[j];
      sum_xy +=  x * llen->score_sums[j]/llen->score_var[j];
    }

  delta = n_w * sum_x2 - sum_x * sum_x;
  pr->rho = (n_w * sum_xy - sum_x * sum_y)/delta;
  pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;

/* printf(" rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */

  /* unweighted moments over occupied bins */
  n = 0;
  mean_x = mean_y = mean_y2 = 0.0;
  var_x = var_y = 0.0;
  covar_xy = covar_xy2 = 0.0;

  for (j = llen->min; j <= llen->max; j++)
    if (llen->hist[j] > 1 ) {
      n += llen->hist[j];
      x = (double)j + 0.5;
      dllj = (double)llen->hist[j];
      mean_x += dllj * x;
      mean_y += llen->score_sums[j];
      var_x += dllj * x * x;
      var_y += llen->score2_sums[j];
      covar_xy += x * llen->score_sums[j];
    }
  mean_x /= n; mean_y /= n;
  var_x = var_x / n - mean_x * mean_x;
  var_y = var_y / n - mean_y * mean_y;
  covar_xy = covar_xy / n - mean_x * mean_y;
/* pr->rho = covar_xy / var_x;
   pr->mu = mean_y - pr->rho * mean_x;
*/
  /* regress squared residuals on log-length -> rho2/mu2 */
  mean_y2 = covar_xy2 = 0.0;
  for (j = llen->min; j <= llen->max; j++)
    if (llen->hist[j] > 1) {
      x = (double)j + 0.5;
      u = pr->rho * x + pr->mu;
      y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u;
      mean_y2 += y2;
      covar_xy2 += x * y2;
    }
  mean_y2 /= n;
  covar_xy2 = covar_xy2 / n - mean_x * mean_y2;
  pr->rho2 = covar_xy2 / var_x;
  pr->mu2 = mean_y2 - pr->rho2 * mean_x;

  /* floor the variance prediction, as in fit_llen */
  if (pr->rho2 < 0.0 )
    z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ?
      llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
  else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
  if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;

  pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;

/* fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n",
   pr->var_cutoff,z);
*/
  /* measure, per bin, how far the observed residual variance departs from
     the predicted v; score_var[j] becomes that squared departure */
  mean_u2 = 0.0; n_u2 = 0;
  for ( j = llen->min; j < llen->max; j++) {
    y = j+0.5;
    dllj = (double)llen->hist[j];
    x = pr->rho * y + pr->mu;
    v = pr->rho2 * y + pr->mu2;
    if (v < pr->var_cutoff) v = pr->var_cutoff;
    if (llen->hist[j]> 1) {
      u2 = (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj;
      mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1);
      n_u2++;
      /* fprintf(stderr," %d (%d) u2: %.2f v*ll: %.2f %.2f\n",
	 j,llen->hist[j],u2,v*dllj,sqrt(llen->score_var[j])); */
    }
    else llen->score_var[j] = -1.0;	/* mark unusable bins */
  }

  mean_u2 = sqrt(mean_u2/(double)n_u2);

  /* fprintf(stderr," mean s.d.: %.2f\n",mean_u2); */

  /* trim bins whose departure exceeds 3 sigma, then refit without them */
  mean_3u2 = mean_u2*3.0;

  for (j = llen->min; j < llen->max; j++) {
    if (llen->hist[j] <= 1) continue;
    if (sqrt(llen->score_var[j]) > mean_3u2) {
      /* fprintf(stderr," removing %d %d %.2f\n",
	 j, (int)(exp((double)j/LN_FACT)-0.5),
	 sqrt(llen->score_var[j])); */
      pr->nb_trimmed++;
      pr->n1_trimmed += llen->hist[j];
      llen->hist[j] = 0;
    }
  }
  fit_llen(llen, pr);
}

/* REG_STATS - Z() from rho/mu/mean_var */
/* find_zr - length-regression z-score: residual of score about the fitted
   rho*log(len)+mu line, in units of the fitted s.d., rescaled so the output
   has mean 50 and s.d. 10.  Returns 0.0 for non-positive scores or lengths
   below LENGTH_CUTOFF.  escore and comp are unused here (shared signature
   across the find_z* family). */
double find_zr(int score, double escore, int length, double comp, struct rstat_str *rs)
{
  double log_len, z;

  if (score <= 0) return 0.0;
  if ( length < LENGTH_CUTOFF) return 0.0;

  log_len = LN_FACT*log((double)(length));
/* var = rs->rho2 * log_len + rs->mu2;
   if (var < rs->var_cutoff) var = rs->var_cutoff;
*/
  z = ((double)score - rs->rho * log_len - rs->mu) / sqrt(rs->mean_var);

  return (50.0 + z*10.0);
}

/* find_zt - "z" from an externally supplied E-value: -log2(escore); when
   escore is non-positive, returns log2 of the largest double-representable
   exponent (744.440071 ~= -ln(DBL_MIN * eps)) as a saturating maximum. */
double find_zt(int score, double escore, int length, double comp, struct rstat_str *rs)
{
  if (escore > 0.0) return -log(escore)/M_LN2;
  else return 744.440071/M_LN2;
}

/* find_zn - naive z-score with no length correction: (score - mu)/s.d.,
   rescaled to mean 50 / s.d. 10. */
double find_zn(int score, double escore, int length, double comp, struct rstat_str *rs)
{
  double z;

  z = ((double)score - rs->mu) / sqrt(rs->mean_var);

  return (50.0 + z*10.0);
}

/* computes E value for a given z value, assuming extreme value distribution */
/* z_to_E - expected number of scores >= zs in a search of n sequences, using
   the EVD tail p = exp(-PI_SQRT6*zs - EULER_G); for p > .01 the exact
   1 - exp(-p) form is used instead of the small-p approximation.  n comes
   from `entries` when positive, otherwise from db.entries. */
double
z_to_E(double zs, long entries, struct db_str db)
{
  double e, n;

  /* if (db->entries < 5) return (double)db.entries; */
  if (entries < 1) { n = db.entries;}
  else {n = entries;}

  if (zs > ZS_MAX) return 0.0;

  e = exp(-PI_SQRT6 * zs - EULER_G);
  return n * (e > .01 ? 1.0 - exp(-e) : e);
}

/* zs_to_p - identity in this scaling variant: the "z-score" slot already
   holds a probability (see zs_to_bit comment below). */
double
zs_to_p(double zs)
{
  return zs;
}

/* this version assumes the probability is in the ->zscore variable,
   which is provided by this file after last_scale()
*/
/* zs_to_bit - convert that probability to a bit score, adding log2 of the
   n0*n1 search-space size.  NOTE(review): n0*n1 is computed in int and could
   overflow for long sequences — behavior kept as found. */
double
zs_to_bit(double zs, int n0, int n1)
{
  return zs+log((double)(n0*n1))/M_LN2 ;
}

/* computes E-value for a given z value, assuming extreme value distribution */
/* zs_to_E - E-value from a bit-score-like zs: E = k * 2^(-zs), where k is the
   effective search-space count — residues/n1 (with LONG_MAX carry correction)
   for DNA/RNA, the entry count otherwise, floored at 1.  zs above ZS_MAX or
   an exponent above 100 yields 0.
   NOTE(review): the closing brace of this function lies beyond the visible
   chunk; body reproduced as found. */
double
zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db)
{
  double e, z, k;

  /* if (db->entries < 5) return 0.0; */

  if (zs > ZS_MAX ) return 0.0;

  if (entries < 1) entries = db.entries;

  if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) {
    k = (double)db.length /(double)n1;
    /* db.length overflowed db.carry times; scale k back up accordingly */
    if (db.carry > 0) { k *= (double)db.carry * (double)LONG_MAX;}
  }
  else k = (double)entries;

  if (k < 1.0) k = 1.0;

  zs *= M_LN2;
  if ( zs > 100.0) e = 0.0;
  else e = exp(-zs);
  return k * e;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -