📄 scaleswt.c
字号:
}/* computes E-value for a given z value, assuming extreme value distribution */doubleE_to_zs(double E, long entries){ double e, z; int error; e = E/(double)entries;#ifndef NORMAL_DIST z = (log(e)+EULER_G)/(-PI_SQRT6); return z*10.0+50.0;#else z = np_to_z(1.0-e,&error); if (!error) return z*10.0+50.0; else return 0.0;#endif}/* computes 1.0 - E value for a given z value, assuming extreme value distribution */doublezs_to_Ec(double zs, long entries){ double e, z; if (entries < 5) return 0.0; z = (zs - 50.0)/10.0; if (z > ZS_MAX) return 1.0; e = exp(-PI_SQRT6 * z - EULER_G); return (double)entries * (e > .01 ? exp(-e) : 1.0 - e);}voidsort_escore(double *v, int n){ int gap, i, j; double dtmp; for (gap=n/2; gap>0; gap/=2) { for (i=gap; i<n; i++) { for (j=i-gap; j>=0; j -= gap) { if (v[j] <= v[j+gap]) break; dtmp = v[j]; v[j] = v[j+gap]; v[j+gap] = dtmp; } } }}/* scale_tat - compute 'a', 'b', 'c' coefficients for scaling fasts/f escores 5-May-2003 - also calculate index for high ties*/voidscale_tat(double *escore, int nstats, long db_entries, int do_trim, struct rstat_str *rs){ int i, j, k, start; double *x, *lnx, *lny; /* sort_escore(escore, nstats); */ while (*escore<0.0) {escore++; nstats--; } x = (double *) calloc(nstats, sizeof(double)); if(x == NULL) { fprintf(stderr, "Couldn't calloc tatE/x\n"); exit(1); } lnx = (double *) calloc(nstats,sizeof(double)); if(lnx == NULL) { fprintf(stderr, "Couldn't calloc tatE/lnx\n"); exit(1); } lny = (double *) calloc(nstats,sizeof(double)); if(lny == NULL) { fprintf(stderr, "Couldn't calloc tatE/lny\n"); exit(1); } for(i = 0 ; i < nstats ; ) { lny[i] = log(escore[i]); for(j = i+1 ; j < nstats ; j++) { if(escore[j] != escore[i]) break; } x[i] = ((((double)i + (double)(j - i - 1)/2.0)*(double)nstats/(double)db_entries)+1.0)/(double)nstats; lnx[i] = log(x[i]); for(k = i+1 ; k < j ; k++) { lny[k]=lny[i]; x[k] = x[i]; lnx[k]=lnx[i]; } i = k; } if (!do_trim) { start = 0; } else { start = 0.05 * (double) nstats; start = start > 500 ? 500 : start; } linreg(lny, x, lnx, nstats, &rs->tat_a, &rs->tat_b, &rs->tat_c, start); /* I have the coefficients I need - a, b, c; free arrays */ free(lny); free(lnx); free(x); /* calculate tie_j - the index below which all scores are considered positional ties */ rs->tie_j = 0.005 * db_entries;}voidlinreg(double *lny, double *x, double *lnx, int n, double *a, double *b, double *c, int start) { double yf1, yf2, yf3; double f1f1, f1f2, f1f3; double f2f2, f2f3; double f3f3, delta; int i; yf1 = yf2 = yf3 = 0.0; f1f1 = f1f2 = f1f3 = f2f2 = f2f3 = f3f3 = 0.0; for(i = start; i < n; i++) { yf1 += lny[i] * lnx[i]; yf2 += lny[i] * x[i]; yf3 += lny[i]; f1f1 += lnx[i] * lnx[i]; f1f2 += lnx[i] * x[i]; f1f3 += lnx[i]; f2f2 += x[i] * x[i]; f2f3 += x[i]; f3f3 += 1.0; } delta = det(f1f1, f1f2, f1f3, f1f2, f2f2, f2f3, f1f3, f2f3, f3f3); *a = det(yf1, f1f2, f1f3, yf2, f2f2, f2f3, yf3, f2f3, f3f3) / delta; *b = det(f1f1, yf1, f1f3, f1f2, yf2, f2f3, f1f3, yf3, f3f3) / delta; *c = det(f1f1, f1f2, yf1, f1f2, f2f2, yf2, f1f3, f2f3, yf3) / delta;}double det(double a11, double a12, double a13, double a21, double a22, double a23, double a31, double a32, double a33){ double result; result = a11 * (a22 * a33 - a32 * a23); result -= a12 * (a21 * a33 - a31 * a23); result += a13 * (a21 * a32 - a31 * a22); return result;}voidlast_stats(const unsigned char *aa0, int n0, struct stat_str *sptr, int nstats, struct beststr **bestp_arr, int nbest, struct mngmsg m_msg, struct pstruct *ppst, struct hist_str *histp, struct rstat_str **rs_sp){ double *obs_escore; int i, nobs, nobs_t, do_trim; long db_entries; struct rstat_str *rs_s; if (*rs_sp == NULL) { if ((rs_s=(struct rstat_str *)calloc(1,sizeof(struct rstat_str)))==NULL) { fprintf(stderr," cannot allocate rs_s: %ld\n",sizeof(struct rstat_str)); exit(1); } else *rs_sp = rs_s; } else rs_s = *rs_sp; histp->entries = 0; sortbeste(bestp_arr,nbest); rs_s->spacefactor = calc_spacefactor(aa0, n0, m_msg.nm0,ppst->nsq); if (ppst->zsflag >= 1 && ppst->zsflag <= 4) { if (m_msg.escore_flg) { nobs = nbest; do_trim = 1; } else { nobs = nstats; do_trim = 0; } if ((obs_escore = (double *)calloc(nobs,sizeof(double)))==NULL) { fprintf(stderr," cannot allocate obs_escore[%d]\n",nbest); exit(1); } if (m_msg.escore_flg) { for (i=nobs=0; i<nbest; i++) { if (bestp_arr[i]->rst.escore<= 1.00) obs_escore[nobs++]=bestp_arr[i]->rst.escore; } /* nobs_t = nobs; for (i=0; i<nbest; i++) { if (bestp_arr[i]->rst.escore >= 0.99 && bestp_arr[i]->rst.escore <= 1.00) obs_escore[nobs++]=bestp_arr[i]->rst.escore; } */ db_entries = m_msg.db.entries; } else { for (i=nobs=0; i<nstats; i++) { if (sptr[i].escore <= 1.00 ) obs_escore[nobs++]=sptr[i].escore; } /* nobs_t = nobs; for (i=0; i<nstats; i++) { if (sptr[i].escore >= 0.99 && sptr[i].escore <= 1.0) obs_escore[nobs++]=sptr[i].escore; } */ db_entries = nobs;/* db_entries = m_msg.db.entries;*/ } sortbesto(obs_escore,nobs); if (nobs > 100) { scale_tat(obs_escore,nobs,db_entries,do_trim,rs_s); rs_s->have_tat=1; sprintf(histp->stat_info,"scaled Tatusov statistics (%d): tat_a: %6.4f tat_b: %6.4f tat_c: %6.4f", nobs,rs_s->tat_a, rs_s->tat_b, rs_s->tat_c); } else { rs_s->have_tat=0; sprintf(histp->stat_info,"Space_factor %.4g scaled statistics", rs_s->spacefactor); } free(obs_escore); } else { rs_s->have_tat=0; histp->stat_info[0] = '\0'; }}/* scale_scores() takes the best (real) scores and re-scales them; beststr bptr[] must be sorted */voidscale_scores(struct beststr **bptr, int nbest, struct db_str db, struct pstruct *ppst, struct rstat_str *rs){ int i, j, k; double obs, r_a, r_b, r_c; /* this scale function absolutely requires that the results be sorted before it is used */ sortbeste(bptr,nbest); if (!rs->have_tat) { for (i=0; i<nbest; i++) { bptr[i]->rst.escore *= rs->spacefactor; } } else { /* here if more than 1000 scores */ r_a = rs->tat_a; r_b = rs->tat_b; r_c = rs->tat_c; /* the problem with scaletat is that the E() value is related to ones position in the list of top scores - thus, knowing the score is not enough - one must know the rank */ for(i = 0 ; i < nbest ; ) { /* take the bottom 0.5%, and the ties, and treat them all the same */ j = i + 1; while (j< nbest && (j <= (0.005 * db.entries) || bptr[j]->rst.escore == bptr[i]->rst.escore) ) { j++; } /* observed frequency */ obs = ((double)i + ((double)(j - i - 1)/ 2.0) + 1.0)/(double)db.entries; /* make certain ties all have the same correction */ for (k = i ; k < j ; k++) { bptr[k]->rst.escore *= obs/exp(r_a*log(obs) + r_b*obs + r_c); } i = k; } } for (i=0; i<nbest; i++) { if(bptr[i]->rst.escore > 0.01) bptr[i]->rst.escore = 1.0 - exp(-bptr[i]->rst.escore); if (bptr[i]->rst.escore > 0.0) bptr[i]->zscore = -log(bptr[i]->rst.escore)/M_LN2; else bptr[i]->zscore = 744.440071/M_LN2; bptr[i]->rst.escore *= ppst->zdb_size; }}double scale_one_score (int ipos, double escore, struct db_str db, struct rstat_str *rs) { double obs; double a, b, c; if (!rs->have_tat) return escore * rs->spacefactor; if (ipos < rs->tie_j) ipos = rs->tie_j/2; a = rs->tat_a; b = rs->tat_b; c = rs->tat_c; obs = ((double)ipos + 1.0)/(double)db.entries; escore *= obs/exp(a*log(obs) + b*obs + c); return escore;}double calc_spacefactor(const unsigned char *aa0, int n0, int nm0, int nsq) {#if !defined(FASTF) return pow(2.0, (double) nm0) - 1.0;#else int i, j, n, l, nr, bin, k; int nmoff; int **counts; int **factors; double tmp, result = 0.0; nmoff = (n0 - nm0 + 1)/nm0+1; counts = (int **) calloc(nsq, sizeof(int *)); if(counts == NULL) { fprintf(stderr, "couldn't calloc counts array!\n"); exit(1); } counts[0] = (int *) calloc(nsq * (nmoff - 1), sizeof(int)); if(counts[0] == NULL) { fprintf(stderr, "couldn't calloc counts array!\n"); exit(1); } for(i = 0 ; i < nsq ; i++) { counts[i] = counts[0] + (i * (nmoff - 1)); } for(i = 0 ; i < nm0 ; i++) { for(j = 0 ; j < (nmoff - 1) ; j++) { counts[ aa0[nmoff * i + j] ] [ j ] ++; } } factors = (int **) calloc(nm0 + 1, sizeof(int *)); if(factors == NULL) { fprintf(stderr, "Couldn't calloc factors array!\n"); exit(1); } factors[0] = (int *) calloc((nm0 + 1) * (nmoff - 1), sizeof(int)); if(factors[0] == NULL) { fprintf(stderr, "Couldn't calloc factors array!\n"); exit(1); } for(i = 0 ; i <= nm0 ; i++) { factors[i] = factors[0] + (i * (nmoff - 1)); } /* this algorithm was adapted from the GAP4 library's NrArrangement function: The GAP Group, GAP --- Groups, Algorithms, and Programming, Version 4.1; Aachen, St Andrews, 1999. (http://www-gap.dcs.st-and.ac.uk/ gap) */ /* calculate K factors for each column in query: */ for(j = 0 ; j < (nmoff - 1) ; j++) { /* only one way to select 0 elements */ factors[0][j] = 1; /* for each of the possible elements in this column */ for(n = 0 ; n < nsq ; n++) { /* if there aren't any of these, skip it */ if(counts[n][j] == 0) { continue; } /* loop over the possible lengths of the arrangement: K..0 */ for(l = nm0 ; l >= 0 ; l--) { nr = 0; bin = 1; /* compute the number of arrangements of length <l> using only the first <n> elements of <mset> */ for(i = 0, k = min(counts[n][j], l); i <= k ; i++) { /* add the number of arrangements of length <l> that consist of <l>-<i> of the first <n>-1 elements and <i> copies of the <n>th element */ nr += bin * factors[l-i][j]; bin = (int) ((float) bin * (float) (l - i) / (float) (i + 1)); } factors[l][j] = nr; } } } result = 0.0; for(i = 1 ; i <= nm0 ; i++) { tmp = 1.0; for(j = 0 ; j < (nmoff - 1) ; j++) { tmp *= (double) factors[i][j]; } tmp /= factorial(i, 1); result += tmp; } free(counts[0]); free(counts); free(factors[0]); free(factors); return result;#endif}void sortbesto (double *obs, int nobs){ int gap, i, j, k; double v; int incs[16] = { 1391376, 463792, 198768, 86961, 33936, 13776, 4592, 1968, 861, 336, 112, 48, 21, 7, 3, 1 }; for ( k = 0; k < 16; k++) for (gap = incs[k], i=gap; i < nobs; i++) { v = obs[i]; j = i; while ( j >= gap && obs[j-gap] > v) { obs[j] = obs[j - gap]; j -= gap; } obs[j] = v; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -