⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 scaleswn.c

📁 序列对齐 Compare a protein sequence to a protein sequence database or a DNA sequence to a DNA sequenc
💻 C
📖 第 1 页 / 共 5 页
字号:
  covar_xy = covar_xy / n - mean_x * mean_y;/*  pr->rho = covar_xy / var_x;  pr->mu = mean_y - pr->rho * mean_x;*/  mean_y2 = covar_xy2 = 0.0;  for (j = llen->min; j <= llen->max; j++)     if (llen->hist[j] > 1) {      x = (double)j + 0.5;      u = pr->rho * x + pr->mu;      y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u;      mean_y2 += y2;      covar_xy2 += x * y2;    }    mean_y2 /= n;  covar_xy2 = covar_xy2 / n - mean_x * mean_y2;  pr->rho2 = covar_xy2 / var_x;  pr->mu2 = mean_y2 - pr->rho2 * mean_x;  if (pr->rho2 < 0.0 )    z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);  else z =  pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;  if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;  pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;/*  fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n",	 pr->var_cutoff,z);*/  mean_u2 = 0.0;  n_u2 = 0;  for ( j = llen->min; j < llen->max; j++) {    y = j+0.5;    dllj = (double)llen->hist[j];    x = pr->rho * y + pr->mu;    v = pr->rho2 * y + pr->mu2;    if (v < pr->var_cutoff) v = pr->var_cutoff;    if (llen->hist[j]> 1) {      u2 =  (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj;      mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1);      n_u2++;      /*      fprintf(stderr," %d (%d) u2: %.2f v*ll: %.2f %.2f\n",	      j,llen->hist[j],u2,v*dllj,sqrt(llen->score_var[j])); */    }    else llen->score_var[j] = -1.0;  }  mean_u2 = sqrt(mean_u2/(double)n_u2);  /* fprintf(stderr," mean s.d.: %.2f\n",mean_u2); */  mean_3u2 = mean_u2*3.0;  for (j = llen->min; j < llen->max; j++) {    if (llen->hist[j] <= 1) continue;    if (sqrt(llen->score_var[j]) > mean_3u2) {      /*      fprintf(stderr," removing %d %d %.2f\n",	     j, (int)(exp((double)j/LN_FACT)-0.5),	     sqrt(llen->score_var[j]));	     */      pr->nb_trimmed++;      pr->n1_trimmed += llen->hist[j];      llen->hist[j] = 0;    }  }  fit_llen(llen, pr);}struct s2str {double s; int n;};void s2_sort ( struct s2str *sptr, int n);voidfit_llen2(struct llen_str *llen, struct rstat_str *pr){  int j;  int n, n_y2, llen_delta, llen_del05;  int n_size;  double x, y2, u;  double mean_x, mean_y, var_x, var_y, covar_xy;  double mean_y2, covar_xy2;  struct s2str *ss2;  double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;  /* now fit scores to best linear function of log(n), using   simple linear regression */    for (llen->min=0; llen->min < llen->max; llen->min++)    if (llen->hist[llen->min]) break;  for ( ; llen->max > llen->min; llen->max--)    if (llen->hist[llen->max]) break;  for (n_size=0,j = llen->min; j < llen->max; j++) {    if (llen->hist[j] > 1) {      llen->score_var[j] = llen->score2_sums[j]/(double)llen->hist[j]	- (llen->score_sums[j]/(double)llen->hist[j])	* (llen->score_sums[j]/(double)llen->hist[j]);      llen->score_var[j] /= (double)(llen->hist[j]-1);      if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;      n_size++;    }  }	    n_w = 0.0;  sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;  for (j = llen->min; j < llen->max; j++)    if (llen->hist[j] > 1) {      x = j + 0.5;      n_w += (double)llen->hist[j]/llen->score_var[j];      sum_x +=   (double)llen->hist[j] * x / llen->score_var[j] ;      sum_y += llen->score_sums[j] / llen->score_var[j];      sum_x2 +=  (double)llen->hist[j] * x * x /llen->score_var[j];      sum_xy +=  x * llen->score_sums[j]/llen->score_var[j];    }  if (n_size < 5 ) {    llen->fit_flag=0;    pr->rho = 0;    pr->mu = sum_y/n_w;  }  else {    det = n_w * sum_x2 - sum_x * sum_x;    if (det > 0.001) {      pr->rho = (n_w * sum_xy  - sum_x * sum_y)/det;      pr->rho_e = n_w/det;      pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;      pr->mu_e = sum_x2/det;    }    else {      llen->fit_flag = 0;      pr->rho = 0;      pr->mu = sum_y/n_w;    }  }  det = n_w * sum_x2 - sum_x * sum_x;  pr->rho = (n_w * sum_xy  - sum_x * sum_y)/det;  pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;/*   fprintf(stderr," rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */  n = 0;  mean_x = mean_y = mean_y2 = 0.0;  var_x = var_y = 0.0;  covar_xy = covar_xy2 = 0.0;  for (j = llen->min; j <= llen->max; j++)     if (llen->hist[j] > 1 ) {      n += llen->hist[j];      x = (double)j + 0.5;      mean_x += (double)llen->hist[j] * x;      mean_y += llen->score_sums[j];      var_x += (double)llen->hist[j] * x * x;      var_y += llen->score2_sums[j];      covar_xy += x * llen->score_sums[j];    }  mean_x /= n; mean_y /= n;  var_x = var_x / n - mean_x * mean_x;  var_y = var_y / n - mean_y * mean_y;    covar_xy = covar_xy / n - mean_x * mean_y;/*  pr->rho = covar_xy / var_x;  pr->mu = mean_y - pr->rho * mean_x;*/  if ((ss2=(struct s2str *)calloc(llen->max+1,sizeof(struct s2str)))==NULL) {    fprintf(stderr," cannot allocate ss2\n");    return;  }  mean_y2 = 0.0;  n_y2 = n = 0;  for (j = llen->min; j <= llen->max; j++)     if (llen->hist[j] > VHISTC) {      n++;      n_y2 += ss2[j].n = llen->hist[j];      x = (double)j + 0.5;      u = pr->rho * x + pr->mu;      ss2[j].s = y2 = llen->score2_sums[j] - 2*llen->score_sums[j]*u + llen->hist[j]*u*u;      mean_y2 += y2;    }  pr->mean_var = mean_y2/(double)n_y2;  pr->mean_var_sqrt = sqrt(pr->mean_var);  s2_sort(ss2+llen->min,llen->max-llen->min+1);    /*  fprintf(stderr,"llen->min: %d, max: %d\n",llen->min,llen->max); */  llen_delta = 0;  for (j=llen->min; j<=llen->max; j++) {    if (ss2[j].n > 1) {      llen_delta++;/*      fprintf(stderr,"%d\t%d\t%.2f\t%.4f\n",	      j,ss2[j].n,ss2[j].s,ss2[j].s/ss2[j].n);*/    }  }  llen_del05 = llen_delta/20;  mean_y2 = 0.0;  n_y2 = 0;  for (j = llen->min; j<llen->min+llen_del05; j++) {    pr->n1_trimmed += ss2[j].n;    pr->nb_trimmed++;  }  for (j = llen->min+llen_del05; j <= llen->min+llen_delta-llen_del05; j++)     if (ss2[j].n > 1) {      mean_y2 += ss2[j].s;      n_y2 += ss2[j].n;    }  for (j = llen->min+llen_delta-llen_del05+1; j< llen->max; j++) {    pr->n1_trimmed += ss2[j].n;    pr->nb_trimmed++;  }    free(ss2);  if (n_y2 > 1) {    pr->mean_var = mean_y2/(double)n_y2;    pr->mean_var_sqrt = sqrt(pr->mean_var);  }  /*    fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var: %.4f/%d\n",	  pr->rho*LN_FACT,pr->mu,pr->mean_var,n); */    pr->var_e = 0.0;}/* REG_STATS - Z() from rho/mu/mean_var */double find_zr(int score, double escore, int length, double comp, struct pstat_str *pu){  double log_len, z;  #ifdef LOCAL_SCORE  if (score <= 0) return 0;#endif  if ( length < LENGTH_CUTOFF) return 0;#ifdef LOCAL_SCORE  log_len = LN_FACT*log((double)(length));#else  /*   log_len = length; */  log_len = LN_FACT*log((double)(length));#endif/*  var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2;  if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.var_cutoff;*/  z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / pu->r_u.rg.mean_var_sqrt;  return (50.0 + z*10.0);}/* REG2_STATS Z() from rho/mu, rho2/mu2 */double find_zr2(int score, double escore, int length, double comp, struct pstat_str *pu){  double log_len, var;  double z;    if ( length < LENGTH_CUTOFF) return 0;#ifdef LOCAL_SCORE  log_len = LN_FACT*log((double)(length));#else  /*   log_len = length; */  log_len = LN_FACT*log((double)(length));#endif  var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2;  if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.mean_var;  z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / sqrt(var);  return (50.0 + z*10.0);}#ifdef USE_LNSTATS/* LN_STATS - ln()-scaled mu, mean_var */double find_zl(int score, int length, double comp, struct pstat_str *pu){  double ls, z;    ls = (double)score*LN200/log((double)length);  z = (ls - pu->r_u.rg.mu) / pu->r_u.rg.mean_var_sqrt;  return (50.0 + z*10.0);}#endif/* MLE_STATS - Z() from MLE for lambda, K */doublefind_ze(int score, double escore, int length, double comp, struct pstat_str *pu){  double z, mp, np, a_n1;    a_n1 = (double)length;   mp = pu->r_u.ag.a_n0;  np = a_n1;  if (np < 1.0) np = 1.0;  if (mp < 1.0) mp = 1.0;  z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * np * mp);  z = -z + EULER_G;  z /= - PI_SQRT6;  return (50.0 + z*10.0);}/* MLE2_STATS - Z() from MLE for mle_a0..2, mle_b1, length, comp */doublefind_ze2(int score, double escore, int length, double comp, struct pstat_str *pu){  double z, mp, np, a_n1;    a_n1 = (double)length;   if (comp <= 0.0) comp = pu->r_u.m2.ave_comp;  /* avoid very biased comp estimates */  /* comp = exp((4.0*log(comp)+log(pu->r_u.m2.ave_comp))/5.0); */  mp = pu->r_u.m2.a_n0;  np = a_n1;  if (np < 1.0) np = 1.0;  if (mp < 1.0) mp = 1.0;  z = (-(pu->r_u.m2.mle2_a0 + pu->r_u.m2.mle2_a1 * comp + pu->r_u.m2.mle2_a2 * comp * log(np * mp)) + score) / (pu->r_u.m2.mle2_b1 * comp);  z = -z + EULER_G;  z /= - PI_SQRT6;  return (50.0 + z*10.0);}/* AG_STATS - Altschul-Gish Lamdba, K */doublefind_za(int score, double escore, int length, double comp, struct pstat_str *pu){  double z, mp, np, a_n1, a_n1f;    a_n1 = (double)length;   a_n1f = log(a_n1)/pu->r_u.ag.H;  mp = pu->r_u.ag.a_n0 - pu->r_u.ag.a_n0f - a_n1f;  np = a_n1 - pu->r_u.ag.a_n0f - a_n1f;  if (np < 1.0) np = 1.0;  if (mp < 1.0) mp = 1.0;  z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * np * mp);  z = -z + EULER_G;  z /= - PI_SQRT6;  return (50.0 + z*10.0);}double find_zn(int score, double escore, int length, double comp, struct pstat_str *pu){  double z;    z = ((double)score - pu->r_u.rg.mu) / pu->r_u.rg.mean_var_sqrt;  return (50.0 + z*10.0);}/* computes E value for a given z value, assuming extreme value distribution */doublez_to_E(double zs, long entries, struct db_str db){  double e, n;  /*  if (db->entries < 5) return (double)db.entries; */  if (entries < 1) { n = db.entries;}  else {n = entries;}  if (zs > ZS_MAX) return 0.0;#ifndef NORMAL_DIST  e = exp(- PI_SQRT6 * zs - .577216);  return n * (e > .01 ? 1.0 - exp(-e) : e);#else  return n * erfc(zs/M_SQRT2)/2.0; #endif}doublezs_to_p(double zs){  double e, z;  /*  if (db.entries < 5) return 0.0; */  z = (zs - 50.0)/10.0;  if (z > ZS_MAX) return 0.0;#ifndef NORMAL_DIST  e = exp(- PI_SQRT6 * z - EULER_G);  return (e > .01 ? 1.0 - exp(-e) : e);#else  return erfc(zs/M_SQRT2)/2.0; #endif}doublezs_to_bit(double zs, int n0, int n1){  double z, a_n0, a_n1;  z = (zs - 50.0)/10.0;  a_n0 = (double)n0;  a_n1 = (double)n1;  return (PI_SQRT6 * z + EULER_G + log(a_n0*a_n1))/M_LN2 ;}/* computes E-value for a given z value, assuming extreme value distribution */doublezs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db){  double e, z, k;  /*  if (db->entries < 5) return 0.0; */  z = (zs - 50.0)/10.0;  if (z > ZS_MAX ) return 0.0;  if (entries < 1) entries = db.entries;  if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) {    k = (double)db.length /(double)n1;    if (db.carry > 0) {      k += ((double)db.carry * (double)LONG_MAX)/(double)n1;    }  }  else k = (double)entries;  if (k < 1.0) k = 1.0;#ifndef NORMAL_DIST  z *= PI_SQRT6;  z += EULER_G;  e = exp(-z);  return k * (e > .01 ? 1.0 - exp(-e) : e);#else  return k * erfc(z/M_SQRT2)/2.0; #endif}#ifdef NORMAL_DISTdouble np_to_z(double, int *);#endif/* compute z-score for given E()-value, assuming normal or   extreme-value dist */doubleE_to_zs(double E, long entries){  double e, z;  int error;  e = E/(double)entries;#ifndef NORMAL_DIST  z = (log(e)+EULER_G)/(- PI_SQRT6);  return z*10.0 + 50.0;#else  /* this formula does not work for E() >= 1 */  if (e >= 1.0) e = 0.99;  z = np_to_z(1.0-e,&error);  if (!error) return z*10.0 + 50.0;  else return 0.0;#endif}/* computes 1.0 - E value for a given z value, assuming extreme value   distribution

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -