📄 scaleswt.c

📁 序列对齐 Compare a protein sequence to a protein sequence database or a DNA sequence to a DNA sequenc
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
}/* computes E-value for a given z value, assuming extreme value distribution */doubleE_to_zs(double E, long entries){  double e, z;  int error;  e = E/(double)entries;#ifndef NORMAL_DIST  z = (log(e)+EULER_G)/(-PI_SQRT6);  return z*10.0+50.0;#else  z = np_to_z(1.0-e,&error);  if (!error) return z*10.0+50.0;  else return 0.0;#endif}/* computes 1.0 - E value for a given z value, assuming extreme value   distribution */doublezs_to_Ec(double zs, long entries){  double e, z;  if (entries < 5) return 0.0;  z = (zs - 50.0)/10.0;  if (z > ZS_MAX) return 1.0;  e =  exp(-PI_SQRT6 * z - EULER_G);  return (double)entries * (e > .01 ?  exp(-e) : 1.0 - e);}voidsort_escore(double *v, int n){  int gap, i, j;  double dtmp;	  for (gap=n/2; gap>0; gap/=2) {    for (i=gap; i<n; i++) {      for (j=i-gap; j>=0; j -= gap) {	if (v[j] <= v[j+gap]) break;	dtmp = v[j];	v[j] = v[j+gap];	v[j+gap] = dtmp;      }    }  }}/* scale_tat - compute 'a', 'b', 'c' coefficients for scaling fasts/f   escores    5-May-2003 - also calculate index for high ties*/voidscale_tat(double *escore, int nstats,	  long db_entries, int do_trim,	  struct rstat_str *rs){  int i, j, k, start;  double *x, *lnx, *lny;  /*   sort_escore(escore, nstats); */  while (*escore<0.0) {escore++; nstats--; }  x = (double *) calloc(nstats, sizeof(double));  if(x == NULL) {    fprintf(stderr, "Couldn't calloc tatE/x\n");    exit(1);  }  lnx = (double *) calloc(nstats,sizeof(double));  if(lnx == NULL) {    fprintf(stderr, "Couldn't calloc tatE/lnx\n");    exit(1);  }    lny = (double *) calloc(nstats,sizeof(double));  if(lny == NULL) {    fprintf(stderr, "Couldn't calloc tatE/lny\n");    exit(1);  }    for(i = 0 ; i < nstats ; ) {    lny[i] = log(escore[i]);    for(j = i+1 ; j < nstats ; j++) {	if(escore[j] != escore[i]) break;    }    x[i] = ((((double)i + (double)(j - i - 1)/2.0)*(double)nstats/(double)db_entries)+1.0)/(double)nstats;    lnx[i] = log(x[i]);    for(k = i+1 ; k < j ; k++) {      lny[k]=lny[i];      x[k] = x[i];      lnx[k]=lnx[i];    }    i = k;  }  if (!do_trim) {    start = 0;  } else {    start = 0.05 * (double) nstats;    start = start > 500 ? 500 : start;  }  linreg(lny, x, lnx, nstats, &rs->tat_a, &rs->tat_b, &rs->tat_c, start);  /* I have the coefficients I need - a, b, c; free arrays */  free(lny);  free(lnx);  free(x);  /* calculate tie_j - the index below which all scores are considered     positional ties */  rs->tie_j = 0.005 * db_entries;}voidlinreg(double *lny, double *x, double *lnx, int n,       double *a, double *b, double *c, int start) {  double yf1, yf2, yf3;  double f1f1, f1f2, f1f3;  double f2f2, f2f3;  double f3f3, delta;  int i;  yf1 = yf2 = yf3 = 0.0;  f1f1 = f1f2 = f1f3 = f2f2 = f2f3 = f3f3 = 0.0;  for(i = start; i < n; i++) {    yf1 += lny[i] * lnx[i];    yf2 += lny[i] * x[i];    yf3 += lny[i];    f1f1 += lnx[i] * lnx[i];    f1f2 += lnx[i] * x[i];    f1f3 += lnx[i];    f2f2 += x[i] * x[i];    f2f3 += x[i];    f3f3 += 1.0;  }  delta = det(f1f1, f1f2, f1f3, f1f2, f2f2, f2f3, f1f3, f2f3, f3f3);  *a = det(yf1, f1f2, f1f3, yf2, f2f2, f2f3, yf3, f2f3, f3f3) / delta;  *b = det(f1f1, yf1, f1f3, f1f2, yf2, f2f3, f1f3, yf3, f3f3) / delta;  *c = det(f1f1, f1f2, yf1, f1f2, f2f2, yf2, f1f3, f2f3, yf3) / delta;}double det(double a11, double a12, double a13,	   double a21, double a22, double a23,	   double a31, double a32, double a33){  double result;  result = a11 * (a22 * a33 - a32 * a23);  result -= a12 * (a21 * a33 - a31 * a23);  result += a13 * (a21 * a32 - a31 * a22);      return result;}voidlast_stats(const unsigned char *aa0, int n0,	   struct stat_str *sptr, int nstats,	   struct beststr **bestp_arr, int nbest,	   struct mngmsg m_msg, struct pstruct *ppst,	   struct hist_str *histp, struct rstat_str **rs_sp){  double *obs_escore;  int i, nobs, nobs_t, do_trim;  long db_entries;  struct rstat_str *rs_s;  if (*rs_sp == NULL) {    if ((rs_s=(struct rstat_str *)calloc(1,sizeof(struct rstat_str)))==NULL) {      fprintf(stderr," cannot allocate rs_s: %ld\n",sizeof(struct rstat_str));      exit(1);    }    else *rs_sp = rs_s;  }  else rs_s = *rs_sp;      histp->entries = 0;  sortbeste(bestp_arr,nbest);  rs_s->spacefactor =     calc_spacefactor(aa0, n0, m_msg.nm0,ppst->nsq);  if (ppst->zsflag >= 1 && ppst->zsflag <= 4) {    if (m_msg.escore_flg) {      nobs = nbest;      do_trim = 1;    }    else {      nobs = nstats;      do_trim = 0;    }    if ((obs_escore = (double *)calloc(nobs,sizeof(double)))==NULL) {      fprintf(stderr," cannot allocate obs_escore[%d]\n",nbest);      exit(1);    }    if (m_msg.escore_flg) {      for (i=nobs=0; i<nbest; i++) {	if (bestp_arr[i]->rst.escore<= 1.00)	  obs_escore[nobs++]=bestp_arr[i]->rst.escore;      }      /*      nobs_t = nobs;      for (i=0; i<nbest; i++) {	if (bestp_arr[i]->rst.escore >= 0.99 &&	    bestp_arr[i]->rst.escore <= 1.00)	  obs_escore[nobs++]=bestp_arr[i]->rst.escore;      }      */      db_entries = m_msg.db.entries;    }    else {      for (i=nobs=0; i<nstats; i++) {	if (sptr[i].escore <= 1.00 ) obs_escore[nobs++]=sptr[i].escore;      }      /*      nobs_t = nobs;      for (i=0; i<nstats; i++) {	if (sptr[i].escore >= 0.99 &&	    sptr[i].escore <= 1.0) obs_escore[nobs++]=sptr[i].escore;      }      */      db_entries = nobs;/*    db_entries = m_msg.db.entries;*/    }    sortbesto(obs_escore,nobs);    if (nobs > 100) {      scale_tat(obs_escore,nobs,db_entries,do_trim,rs_s);      rs_s->have_tat=1;      sprintf(histp->stat_info,"scaled Tatusov statistics (%d): tat_a: %6.4f tat_b: %6.4f tat_c: %6.4f",	      nobs,rs_s->tat_a, rs_s->tat_b, rs_s->tat_c);    }    else {      rs_s->have_tat=0;      sprintf(histp->stat_info,"Space_factor %.4g scaled statistics",	    rs_s->spacefactor);    }    free(obs_escore);  }  else {    rs_s->have_tat=0;    histp->stat_info[0] = '\0';  }}/* scale_scores() takes the best (real) scores and re-scales them;   beststr bptr[] must be sorted */voidscale_scores(struct beststr **bptr, int nbest, struct db_str db,	     struct pstruct *ppst, struct rstat_str *rs){  int i, j, k;  double obs, r_a, r_b, r_c;  /* this scale function absolutely requires that the results be sorted     before it is used */  sortbeste(bptr,nbest);  if (!rs->have_tat) {    for (i=0; i<nbest; i++) {      bptr[i]->rst.escore *= rs->spacefactor;    }  }  else {    /* here if more than 1000 scores */    r_a = rs->tat_a; r_b = rs->tat_b; r_c = rs->tat_c;  /* the problem with scaletat is that the E() value is related to     ones position in the list of top scores - thus, knowing the score     is not enough - one must know the rank */    for(i = 0 ; i < nbest ; ) {      /* take the bottom 0.5%, and the ties, and treat them all the same */      j = i + 1;      while (j< nbest && 	     (j <= (0.005 * db.entries) || bptr[j]->rst.escore == bptr[i]->rst.escore)	     ) {	j++;      }      /* observed frequency */      obs = ((double)i + ((double)(j - i - 1)/ 2.0) + 1.0)/(double)db.entries;      /* make certain ties all have the same correction */      for (k = i ; k < j ; k++) {	bptr[k]->rst.escore *= obs/exp(r_a*log(obs) + r_b*obs + r_c);      }      i = k;    }  }  for (i=0; i<nbest; i++) {    if(bptr[i]->rst.escore > 0.01)      bptr[i]->rst.escore = 1.0 - exp(-bptr[i]->rst.escore);    if (bptr[i]->rst.escore > 0.0)      bptr[i]->zscore = -log(bptr[i]->rst.escore)/M_LN2;    else       bptr[i]->zscore = 744.440071/M_LN2;    bptr[i]->rst.escore *= ppst->zdb_size;  }}double scale_one_score (int ipos, double escore,                        struct db_str db,                        struct rstat_str *rs) {  double obs;  double a, b, c;  if (!rs->have_tat)    return escore * rs->spacefactor;  if (ipos < rs->tie_j) ipos = rs->tie_j/2;  a = rs->tat_a; b = rs->tat_b; c = rs->tat_c;  obs = ((double)ipos + 1.0)/(double)db.entries;  escore *= obs/exp(a*log(obs) + b*obs + c);  return escore;}double calc_spacefactor(const unsigned char *aa0, int n0,			int nm0, int nsq) {#if !defined(FASTF)  return pow(2.0, (double) nm0) - 1.0;#else  int i, j, n, l, nr, bin, k;  int nmoff;  int **counts;  int **factors;  double tmp, result = 0.0;  nmoff = (n0 - nm0 + 1)/nm0+1;  counts = (int **) calloc(nsq, sizeof(int *));  if(counts == NULL) {    fprintf(stderr, "couldn't calloc counts array!\n");    exit(1);  }  counts[0] = (int *) calloc(nsq * (nmoff - 1), sizeof(int));  if(counts[0] == NULL) {    fprintf(stderr, "couldn't calloc counts array!\n");    exit(1);  }  for(i = 0 ; i < nsq ; i++) {    counts[i] = counts[0] + (i * (nmoff - 1));  }  for(i = 0 ; i < nm0 ; i++) {    for(j = 0 ; j < (nmoff - 1) ; j++) {      counts[ aa0[nmoff * i + j] ] [ j ] ++;    }  }  factors = (int **) calloc(nm0 + 1, sizeof(int *));  if(factors == NULL) {    fprintf(stderr, "Couldn't calloc factors array!\n");    exit(1);  }  factors[0] = (int *) calloc((nm0 + 1) * (nmoff - 1), sizeof(int));  if(factors[0] == NULL) {    fprintf(stderr, "Couldn't calloc factors array!\n");    exit(1);  }  for(i = 0 ; i <= nm0 ; i++) {    factors[i] = factors[0] + (i * (nmoff - 1));  }  /*    this algorithm was adapted from the GAP4 library's NrArrangement function:    The GAP Group, GAP --- Groups, Algorithms, and Programming,    Version 4.1; Aachen, St Andrews, 1999.    (http://www-gap.dcs.st-and.ac.uk/ gap)  */  /* calculate K factors for each column in query: */  for(j = 0 ; j < (nmoff - 1) ; j++) {    /* only one way to select 0 elements */    factors[0][j] = 1;    /* for each of the possible elements in this column */    for(n = 0 ; n < nsq ; n++) {      /* if there aren't any of these, skip it */      if(counts[n][j] == 0) { continue; }      /* loop over the possible lengths of the arrangement: K..0 */      for(l = nm0 ; l >= 0 ; l--) {	nr = 0;	bin = 1;	/*	  compute the number of arrangements of length <l>	  using only the first <n> elements of <mset>	*/	for(i = 0, k = min(counts[n][j], l); i <= k ; i++) {	  /* 	     add the number of arrangements of length <l>	     that consist of <l>-<i> of the first <n>-1 elements	     and <i> copies of the <n>th element	  */	  nr += bin * factors[l-i][j];	  bin = (int) ((float) bin * (float) (l - i) / (float) (i + 1));	}	factors[l][j] = nr;      }    }  }  result = 0.0;  for(i = 1 ; i <= nm0 ; i++) {    tmp = 1.0;    for(j = 0 ; j < (nmoff - 1) ; j++) {      tmp *= (double) factors[i][j];    }    tmp /= factorial(i, 1);    result += tmp;  }    free(counts[0]);  free(counts);  free(factors[0]);  free(factors);  return result;#endif}void sortbesto (double *obs, int nobs){  int gap, i, j, k;  double v;  int incs[16] = { 1391376, 463792, 198768, 86961, 33936,		   13776, 4592, 1968, 861, 336, 		   112, 48, 21, 7, 3, 1 };  for ( k = 0; k < 16; k++)    for (gap = incs[k], i=gap; i < nobs; i++) {      v = obs[i];      j = i;      while ( j >= gap && obs[j-gap] > v) {	obs[j] = obs[j - gap];	j -= gap;      }      obs[j] = v;    }}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -