📄 scaleswn.c
字号:
covar_xy = covar_xy / n - mean_x * mean_y;/* pr->rho = covar_xy / var_x; pr->mu = mean_y - pr->rho * mean_x;*/ mean_y2 = covar_xy2 = 0.0; for (j = llen->min; j <= llen->max; j++) if (llen->hist[j] > 1) { x = (double)j + 0.5; u = pr->rho * x + pr->mu; y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u; mean_y2 += y2; covar_xy2 += x * y2; } mean_y2 /= n; covar_xy2 = covar_xy2 / n - mean_x * mean_y2; pr->rho2 = covar_xy2 / var_x; pr->mu2 = mean_y2 - pr->rho2 * mean_x; if (pr->rho2 < 0.0 ) z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT); else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF; if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF; pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;/* fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n", pr->var_cutoff,z);*/ mean_u2 = 0.0; n_u2 = 0; for ( j = llen->min; j < llen->max; j++) { y = j+0.5; dllj = (double)llen->hist[j]; x = pr->rho * y + pr->mu; v = pr->rho2 * y + pr->mu2; if (v < pr->var_cutoff) v = pr->var_cutoff; if (llen->hist[j]> 1) { u2 = (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj; mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1); n_u2++; /* fprintf(stderr," %d (%d) u2: %.2f v*ll: %.2f %.2f\n", j,llen->hist[j],u2,v*dllj,sqrt(llen->score_var[j])); */ } else llen->score_var[j] = -1.0; } mean_u2 = sqrt(mean_u2/(double)n_u2); /* fprintf(stderr," mean s.d.: %.2f\n",mean_u2); */ mean_3u2 = mean_u2*3.0; for (j = llen->min; j < llen->max; j++) { if (llen->hist[j] <= 1) continue; if (sqrt(llen->score_var[j]) > mean_3u2) { /* fprintf(stderr," removing %d %d %.2f\n", j, (int)(exp((double)j/LN_FACT)-0.5), sqrt(llen->score_var[j])); */ pr->nb_trimmed++; pr->n1_trimmed += llen->hist[j]; llen->hist[j] = 0; } } fit_llen(llen, pr);}struct s2str {double s; int n;};void s2_sort ( struct s2str *sptr, int n);voidfit_llen2(struct llen_str *llen, struct rstat_str *pr){ int j; int n, n_y2, llen_delta, llen_del05; int n_size; double x, y2, u; double mean_x, mean_y, var_x, var_y, covar_xy; double mean_y2, covar_xy2; struct s2str *ss2; double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w; /* now fit scores to best linear function of log(n), using simple linear regression */ for (llen->min=0; llen->min < llen->max; llen->min++) if (llen->hist[llen->min]) break; for ( ; llen->max > llen->min; llen->max--) if (llen->hist[llen->max]) break; for (n_size=0,j = llen->min; j < llen->max; j++) { if (llen->hist[j] > 1) { llen->score_var[j] = llen->score2_sums[j]/(double)llen->hist[j] - (llen->score_sums[j]/(double)llen->hist[j]) * (llen->score_sums[j]/(double)llen->hist[j]); llen->score_var[j] /= (double)(llen->hist[j]-1); if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0; n_size++; } } n_w = 0.0; sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0; for (j = llen->min; j < llen->max; j++) if (llen->hist[j] > 1) { x = j + 0.5; n_w += (double)llen->hist[j]/llen->score_var[j]; sum_x += (double)llen->hist[j] * x / llen->score_var[j] ; sum_y += llen->score_sums[j] / llen->score_var[j]; sum_x2 += (double)llen->hist[j] * x * x /llen->score_var[j]; sum_xy += x * llen->score_sums[j]/llen->score_var[j]; } if (n_size < 5 ) { llen->fit_flag=0; pr->rho = 0; pr->mu = sum_y/n_w; } else { det = n_w * sum_x2 - sum_x * sum_x; if (det > 0.001) { pr->rho = (n_w * sum_xy - sum_x * sum_y)/det; pr->rho_e = n_w/det; pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det; pr->mu_e = sum_x2/det; } else { llen->fit_flag = 0; pr->rho = 0; pr->mu = sum_y/n_w; } } det = n_w * sum_x2 - sum_x * sum_x; pr->rho = (n_w * sum_xy - sum_x * sum_y)/det; pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;/* fprintf(stderr," rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */ n = 0; mean_x = mean_y = mean_y2 = 0.0; var_x = var_y = 0.0; covar_xy = covar_xy2 = 0.0; for (j = llen->min; j <= llen->max; j++) if (llen->hist[j] > 1 ) { n += llen->hist[j]; x = (double)j + 0.5; mean_x += (double)llen->hist[j] * x; mean_y += llen->score_sums[j]; var_x += (double)llen->hist[j] * x * x; var_y += llen->score2_sums[j]; covar_xy += x * llen->score_sums[j]; } mean_x /= n; mean_y /= n; var_x = var_x / n - mean_x * mean_x; var_y = var_y / n - mean_y * mean_y; covar_xy = covar_xy / n - mean_x * mean_y;/* pr->rho = covar_xy / var_x; pr->mu = mean_y - pr->rho * mean_x;*/ if ((ss2=(struct s2str *)calloc(llen->max+1,sizeof(struct s2str)))==NULL) { fprintf(stderr," cannot allocate ss2\n"); return; } mean_y2 = 0.0; n_y2 = n = 0; for (j = llen->min; j <= llen->max; j++) if (llen->hist[j] > VHISTC) { n++; n_y2 += ss2[j].n = llen->hist[j]; x = (double)j + 0.5; u = pr->rho * x + pr->mu; ss2[j].s = y2 = llen->score2_sums[j] - 2*llen->score_sums[j]*u + llen->hist[j]*u*u; mean_y2 += y2; } pr->mean_var = mean_y2/(double)n_y2; pr->mean_var_sqrt = sqrt(pr->mean_var); s2_sort(ss2+llen->min,llen->max-llen->min+1); /* fprintf(stderr,"llen->min: %d, max: %d\n",llen->min,llen->max); */ llen_delta = 0; for (j=llen->min; j<=llen->max; j++) { if (ss2[j].n > 1) { llen_delta++;/* fprintf(stderr,"%d\t%d\t%.2f\t%.4f\n", j,ss2[j].n,ss2[j].s,ss2[j].s/ss2[j].n);*/ } } llen_del05 = llen_delta/20; mean_y2 = 0.0; n_y2 = 0; for (j = llen->min; j<llen->min+llen_del05; j++) { pr->n1_trimmed += ss2[j].n; pr->nb_trimmed++; } for (j = llen->min+llen_del05; j <= llen->min+llen_delta-llen_del05; j++) if (ss2[j].n > 1) { mean_y2 += ss2[j].s; n_y2 += ss2[j].n; } for (j = llen->min+llen_delta-llen_del05+1; j< llen->max; j++) { pr->n1_trimmed += ss2[j].n; pr->nb_trimmed++; } free(ss2); if (n_y2 > 1) { pr->mean_var = mean_y2/(double)n_y2; pr->mean_var_sqrt = sqrt(pr->mean_var); } /* fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var: %.4f/%d\n", pr->rho*LN_FACT,pr->mu,pr->mean_var,n); */ pr->var_e = 0.0;}/* REG_STATS - Z() from rho/mu/mean_var */double find_zr(int score, double escore, int length, double comp, struct pstat_str *pu){ double log_len, z; #ifdef LOCAL_SCORE if (score <= 0) return 0;#endif if ( length < LENGTH_CUTOFF) return 0;#ifdef LOCAL_SCORE log_len = LN_FACT*log((double)(length));#else /* log_len = length; */ log_len = LN_FACT*log((double)(length));#endif/* var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2; if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.var_cutoff;*/ z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / pu->r_u.rg.mean_var_sqrt; return (50.0 + z*10.0);}/* REG2_STATS Z() from rho/mu, rho2/mu2 */double find_zr2(int score, double escore, int length, double comp, struct pstat_str *pu){ double log_len, var; double z; if ( length < LENGTH_CUTOFF) return 0;#ifdef LOCAL_SCORE log_len = LN_FACT*log((double)(length));#else /* log_len = length; */ log_len = LN_FACT*log((double)(length));#endif var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2; if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.mean_var; z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / sqrt(var); return (50.0 + z*10.0);}#ifdef USE_LNSTATS/* LN_STATS - ln()-scaled mu, mean_var */double find_zl(int score, int length, double comp, struct pstat_str *pu){ double ls, z; ls = (double)score*LN200/log((double)length); z = (ls - pu->r_u.rg.mu) / pu->r_u.rg.mean_var_sqrt; return (50.0 + z*10.0);}#endif/* MLE_STATS - Z() from MLE for lambda, K */doublefind_ze(int score, double escore, int length, double comp, struct pstat_str *pu){ double z, mp, np, a_n1; a_n1 = (double)length; mp = pu->r_u.ag.a_n0; np = a_n1; if (np < 1.0) np = 1.0; if (mp < 1.0) mp = 1.0; z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * np * mp); z = -z + EULER_G; z /= - PI_SQRT6; return (50.0 + z*10.0);}/* MLE2_STATS - Z() from MLE for mle_a0..2, mle_b1, length, comp */doublefind_ze2(int score, double escore, int length, double comp, struct pstat_str *pu){ double z, mp, np, a_n1; a_n1 = (double)length; if (comp <= 0.0) comp = pu->r_u.m2.ave_comp; /* avoid very biased comp estimates */ /* comp = exp((4.0*log(comp)+log(pu->r_u.m2.ave_comp))/5.0); */ mp = pu->r_u.m2.a_n0; np = a_n1; if (np < 1.0) np = 1.0; if (mp < 1.0) mp = 1.0; z = (-(pu->r_u.m2.mle2_a0 + pu->r_u.m2.mle2_a1 * comp + pu->r_u.m2.mle2_a2 * comp * log(np * mp)) + score) / (pu->r_u.m2.mle2_b1 * comp); z = -z + EULER_G; z /= - PI_SQRT6; return (50.0 + z*10.0);}/* AG_STATS - Altschul-Gish Lamdba, K */doublefind_za(int score, double escore, int length, double comp, struct pstat_str *pu){ double z, mp, np, a_n1, a_n1f; a_n1 = (double)length; a_n1f = log(a_n1)/pu->r_u.ag.H; mp = pu->r_u.ag.a_n0 - pu->r_u.ag.a_n0f - a_n1f; np = a_n1 - pu->r_u.ag.a_n0f - a_n1f; if (np < 1.0) np = 1.0; if (mp < 1.0) mp = 1.0; z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * np * mp); z = -z + EULER_G; z /= - PI_SQRT6; return (50.0 + z*10.0);}double find_zn(int score, double escore, int length, double comp, struct pstat_str *pu){ double z; z = ((double)score - pu->r_u.rg.mu) / pu->r_u.rg.mean_var_sqrt; return (50.0 + z*10.0);}/* computes E value for a given z value, assuming extreme value distribution */doublez_to_E(double zs, long entries, struct db_str db){ double e, n; /* if (db->entries < 5) return (double)db.entries; */ if (entries < 1) { n = db.entries;} else {n = entries;} if (zs > ZS_MAX) return 0.0;#ifndef NORMAL_DIST e = exp(- PI_SQRT6 * zs - .577216); return n * (e > .01 ? 1.0 - exp(-e) : e);#else return n * erfc(zs/M_SQRT2)/2.0; #endif}doublezs_to_p(double zs){ double e, z; /* if (db.entries < 5) return 0.0; */ z = (zs - 50.0)/10.0; if (z > ZS_MAX) return 0.0;#ifndef NORMAL_DIST e = exp(- PI_SQRT6 * z - EULER_G); return (e > .01 ? 1.0 - exp(-e) : e);#else return erfc(zs/M_SQRT2)/2.0; #endif}doublezs_to_bit(double zs, int n0, int n1){ double z, a_n0, a_n1; z = (zs - 50.0)/10.0; a_n0 = (double)n0; a_n1 = (double)n1; return (PI_SQRT6 * z + EULER_G + log(a_n0*a_n1))/M_LN2 ;}/* computes E-value for a given z value, assuming extreme value distribution */doublezs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db){ double e, z, k; /* if (db->entries < 5) return 0.0; */ z = (zs - 50.0)/10.0; if (z > ZS_MAX ) return 0.0; if (entries < 1) entries = db.entries; if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) { k = (double)db.length /(double)n1; if (db.carry > 0) { k += ((double)db.carry * (double)LONG_MAX)/(double)n1; } } else k = (double)entries; if (k < 1.0) k = 1.0;#ifndef NORMAL_DIST z *= PI_SQRT6; z += EULER_G; e = exp(-z); return k * (e > .01 ? 1.0 - exp(-e) : e);#else return k * erfc(z/M_SQRT2)/2.0; #endif}#ifdef NORMAL_DISTdouble np_to_z(double, int *);#endif/* compute z-score for given E()-value, assuming normal or extreme-value dist */doubleE_to_zs(double E, long entries){ double e, z; int error; e = E/(double)entries;#ifndef NORMAL_DIST z = (log(e)+EULER_G)/(- PI_SQRT6); return z*10.0 + 50.0;#else /* this formula does not work for E() >= 1 */ if (e >= 1.0) e = 0.99; z = np_to_z(1.0-e,&error); if (!error) return z*10.0 + 50.0; else return 0.0;#endif}/* computes 1.0 - E value for a given z value, assuming extreme value distribution
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -