📄 scaleswt.c

📁 序列对齐 Compare a protein sequence to a protein sequence database or a DNA sequence to a DNA sequenc
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* scaleswt.c *//* $Name: fa35_03_06 $ - $Id: scaleswt.c,v 1.22 2007/06/29 17:11:42 wrp Exp $ *//* as of 24 Sept, 2000 - scaleswn uses no global variables *//*  copyright (c) 1995, 1996, 2000, 2002 William R. Pearson  This version is designed for fasts/f, which used Tatusov  probabilities for statistical estimates, but still needs a  quick-and-dirty linear regression fit to rank things  For comparisons that obey tatusov statistics, we try whenever  possible to provide accurate e_scores, rather than raw scores.  As a  result, no lambda/K fitting is required; and process_hist() can be  called atthe very beginning of the search to initialize some of the  statistics structures and find_zp().  find_zp() must still return a valid z_score surrogate, as  comp_lib.c/p2_complib.c continue to use z_score's to rank hits, save  the best, etc.  If e_score's cannot be calculated, the process_hist() provides  linear regression fitting for conventional z_score estimates.*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <limits.h>#include <float.h>#include <math.h>#include <limits.h>#include "defs.h"#include "param.h"#include "structs.h"#ifndef PCOMPLIB#include "mw.h"#else#include "p_mw.h"#endif#define MAXHIST 50#define MAX_LLEN 200#define LHISTC 5#define VHISTC 5#define MAX_SSCORE 300#define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */#define LN_FACT 10.0#ifndef M_LN2#define M_LN2 0.69314718055994530942#endif#define EULER_G 0.57721566490153286060#define PI_SQRT6 1.28254983016186409554#ifndef M_SQRT2#define M_SQRT2 1.41421356237#endif#define LN200 5.2983173666#define ZS_MAX 400.0	/* used to prevent underflow on some machines */#define TOLERANCE 1.0e-12#define TINY 1.0e-6/* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/struct rstat_str {  double ngLambda, ngK, ngH;  double rho, rho_e, mu, mu_e, mean_var, var_e;  /* ?_e:std. error of ? *//* used by REG2_STATS */  double rho2, mu2, var_cutoff;  int n_trimmed; /* excluded because of high z-score */  int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin */  double tat_a, tat_b, tat_c, spacefactor;  int have_tat;  int tie_j;};#define AVE_STATS 0	/* no length effect, only mean/variance */double find_zt(int score, double escore, int len, double comp, struct rstat_str *);double find_zn(int score, double escore, int len, double comp, struct rstat_str *);double power(double, int);void sortbesto(double *, int );extern void sortbeste(struct beststr **bptr, int nbest);int proc_hist_n(struct stat_str *sptr, int n, 		 struct pstruct *ppst, struct hist_str *histp, int do_trim,		 struct rstat_str *);#define REG_STATS 1	/* length-regression scaled */double find_zr(int score, double escore, int len, double comp, struct rstat_str *);int proc_hist_r(struct stat_str *sptr, int n,		 struct pstruct *ppst, struct hist_str *histp,		 int do_trim, struct rstat_str *rs);double (*find_zp)(int score, double escore, int len, double comp,		 struct rstat_str *) = &find_zr;struct llen_str {  int min, max;  int max_score, min_score;  int *hist;  double *score_sums, *score2_sums;  double *score_var;  int max_length, min_length, zero_s;  int fit_flag;};static void inithist(struct llen_str *, struct pstruct *, int);static void free_hist( struct llen_str *);static void addhist(struct llen_str *, int, int, int);static void prune_hist(struct llen_str *, int, int, int, long *);void inithistz(int, struct hist_str *histp);void addhistz(double zs, struct hist_str *histp);static void fit_llen(struct llen_str *, struct rstat_str *);static void fit_llens(struct llen_str *, struct rstat_str *);void linreg(double *lny, double *x, double *lnx, int n,	    double *a, double *b, double *c, int start);double calc_spacefactor(const unsigned char *, int, int, int);double det(double a11, double a12, double a13,	   double a21, double a22, double a23,	   double a31, double a32, double a33);double factorial (int a, int b);/* void set_db_size(int, struct db_str *, struct hist_str *); */#ifdef DEBUGFILE *tmpf;#endifintprocess_hist(struct stat_str *sptr, int nstats, 	     struct mngmsg m_msg,	     struct pstruct *ppst,	     struct hist_str *histp,	     struct rstat_str **rs_sp,	     int do_hist	     ){  int zsflag, do_trim;  struct rstat_str *rs_s;  if (ppst->zsflag < 0) {    *rs_sp = NULL;    return ppst->zsflag;  }  if (*rs_sp == NULL) {    if ((rs_s=(struct rstat_str *)calloc(1,sizeof(struct rstat_str)))==NULL) {      fprintf(stderr," cannot allocate rs_snion: %ld\n",sizeof(struct rstat_str));      exit(1);    }    else *rs_sp = rs_s;  }  else {    rs_s = *rs_sp;    memset(rs_s,0,sizeof(struct rstat_str));  }  if (m_msg.escore_flg) {    find_zp = &find_zt;    inithistz(MAXHIST,histp);    return 1;  }  if (nstats < 20) {    fprintf(stderr," too few sequences for sampling: %d\n",nstats);    free(rs_s);    *rs_sp = NULL;    return -1;  }  rs_s->ngLambda = m_msg.Lambda;  rs_s->ngK = m_msg.K;  rs_s->ngH = m_msg.H;  zsflag = ppst->zsflag;  if (zsflag >= 10) {    zsflag -= 10;    do_trim = 0;  }  else do_trim = 1;  find_zp = &find_zr;  return proc_hist_r(sptr, nstats, ppst, histp, do_trim,  rs_s);}intcalc_thresh(struct pstruct *ppst, int nstats, 	    double Lambda, double K, double H, double *zstrim){  int max_hscore;  double ave_n1, tmp_score, z, l_fact;  if (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA) {    ave_n1 = 5000.0;    l_fact = 1.0;  }  else {    ave_n1 = 400.0;    l_fact = 0.7;  }/*  max_hscore = MAX_SSCORE; *//*  mean expected for ppst->n0 * 400 for protein, 5000 for DNA *//*  we want a number of offsets that is appropriate for the database size so    far (nstats)*//*  the calculation below sets a high-score threshold using an  ungapped lambda, but errs towards the high-score side by using  E()=0.001 and calculating with 0.70*lambda, which is the correct for  going from ungapped to -12/-2 gapped lambda with BLOSUM50*/#ifndef NORMAL_DIST  tmp_score = 0.01/((double)nstats*K*(double)ppst->n0*ave_n1);  tmp_score = -log(tmp_score)/(Lambda*l_fact);  max_hscore = (int)(tmp_score+0.5);  z = 1.0/(double)nstats;  z = (log(z)+EULER_G)/(-PI_SQRT6);#else  max_hscore = 100;  z = 5.0;#endif  *zstrim = 10.0*z+50.0;  return max_hscore;}intproc_hist_r(struct stat_str *sptr, int nstats,	    struct pstruct *ppst, struct hist_str *histp,	    int do_trim, struct rstat_str *rs){  int i, max_hscore;  double zs, ztrim;  char s_string[128];  struct llen_str llen;  char *f_string;  llen.fit_flag=1;  llen.hist=NULL;  max_hscore = calc_thresh(ppst, nstats, rs->ngLambda,			   rs->ngK, rs->ngH, &ztrim);  inithist(&llen, ppst,max_hscore);  f_string = &(histp->stat_info[0]);  for (i = 0; i<nstats; i++)    addhist(&llen,sptr[i].score,sptr[i].n1, max_hscore);  histp->entries = nstats - llen.zero_s;  if ((llen.max_score - llen.min_score) < 10) {    free_hist(&llen);    llen.fit_flag = 0;    find_zp = &find_zn;    return proc_hist_n(sptr, nstats, ppst, histp, do_trim, rs);  }  fit_llen(&llen, rs); /* now we have rho, mu, rho2, mu2, mean_var			  to set the parameters for the histogram */  if (!llen.fit_flag) {	/* the fit failed, fall back to proc_hist_n */    free_hist(&llen);    find_zp = &find_zn;    return proc_hist_n(sptr,nstats, ppst, histp, do_trim, rs);  }  rs->n_trimmed= rs->n1_trimmed = rs->nb_trimmed = 0;  if (do_trim) {    if (llen.fit_flag) {      for (i = 0; i < nstats; i++) {	zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs);	if (zs < 20.0 || zs > ztrim) {	  rs->n_trimmed++;	  prune_hist(&llen,sptr[i].score,sptr[i].n1, max_hscore,		     &(histp->entries));	}      }    }  /*  fprintf(stderr,"Z-trimmed %d entries with z > 5.0\n", rs->n_trimmed); */    if (llen.fit_flag) fit_llens(&llen, rs);  /*   fprintf(stderr,"Bin-trimmed %d entries in %d bins\n", rs->n1_trimmed,rs->nb_trimmed); */  }  free_hist(&llen);  /* rst all the scores in the histogram */  if (ppst->zsflag < 10) s_string[0]='\0';  else if (ppst->zs_win > 0)    sprintf(s_string,"(shuffled, win: %d)",ppst->zs_win);  else strncpy(s_string,"(shuffled)",sizeof(s_string));  inithistz(MAXHIST, histp);  sprintf(f_string,"%s Expectation_n fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim: %d  B-trim: %d in %d/%d\n Lambda= %6.4f",	  s_string,	  rs->rho*LN_FACT,sqrt(rs->rho_e),rs->mu,sqrt(rs->mu_e), rs->mean_var,sqrt(rs->var_e),	  llen.zero_s, rs->n_trimmed, rs->n1_trimmed, rs->nb_trimmed, rs->nb_tot,	  PI_SQRT6/sqrt(rs->mean_var));    return REG_STATS;}intproc_hist_n(struct stat_str *sptr, int nstats,	    struct pstruct *ppst, struct hist_str *histp,	    int do_trim, struct rstat_str *rs){  int i, j;  double s_score, s2_score, ssd;  double ztrim;  int nit, max_hscore;  char s_string[128];  char *f_string;  f_string = &(histp->stat_info[0]);  /*   db->entries = db->length = db->carry = 0; */  max_hscore = calc_thresh(ppst, nstats, rs->ngLambda,			   rs->ngK, rs->ngH, &ztrim);  s_score = s2_score = 0.0;  histp->entries = 0;  for ( j = 0, i = 0; i < nstats; i++) {    if (sptr[i].score > 0 && sptr[i].score <= max_hscore) {      s_score += (ssd=(double)sptr[i].score);      s2_score += ssd * ssd;      histp->entries++;      /*       db->length += sptr[i].n1;      if (db->length > LONG_MAX) {	db->carry++;	db->length -= LONG_MAX;      }      */      j++;    }  }  if (j > 1 ) {    rs->mu = s_score/(double)j;    rs->mean_var = s2_score - (double)j * rs->mu * rs->mu;    rs->mean_var /= (double)(j-1);  }  else {    rs->mu = 50.0;    rs->mean_var = 10.0;  }    if (rs->mean_var < 0.01) {    rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0;  }  /* now remove some scores */  nit = 5;  while (nit-- > 0) {    rs->n_trimmed = 0;    for (i=0; i< nstats; i++) {      if (sptr[i].n1 < 0) continue;      ssd = find_zn(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs);      if (ssd > ztrim || ssd < 20.0) {	/*      fprintf(stderr,"removing %3d %3d %4.1f\n",		sptr[i].score, sptr[i].n1,ssd); */	ssd = sptr[i].score;	s_score -= ssd;	s2_score -= ssd*ssd;	j--;	rs->n_trimmed++;	histp->entries--;	sptr[i].n1 = -sptr[i].n1;      }    }    if (j > 1 ) {      rs->mu = s_score/(double)j;      rs->mean_var = s2_score - (double)j * rs->mu * rs->mu;      rs->mean_var /= (double)(j-1);    }    else {      rs->mu = 50.0;      rs->mean_var = 10.0;    }    if (rs->mean_var < 0.01) {      rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0;    }    if (rs->n_trimmed < LHISTC) {      /*	fprintf(stderr,"nprune %d at %d\n",nprune,nit);	*/      break;    }  }  if (ppst->zsflag < 10) s_string[0]='\0';  else if (ppst->zs_win > 0)    sprintf(s_string,"(shuffled, win: %d)",ppst->zs_win);  else strncpy(s_string,"(shuffled)",sizeof(s_string));  sprintf(f_string,"%s unscaled statistics: mu= %6.4f  var=%6.4f; Lambda= %6.4f",	  s_string, rs->mu,rs->mean_var,PI_SQRT6/sqrt(rs->mean_var));  return AVE_STATS;}/*This routine calculates the maximum likelihood estimates for theextreme value distribution exp(-exp(-(-x-a)/b)) using the formula	<lambda> = x_m - sum{ x[i] * exp (-x[i]<lambda>)}/sum{exp (-x[i]<lambda>)}	<a> = -<1/lambda> log ( (1/nlib) sum { exp(-x[i]/<lambda> } )	The <a> parameter can be transformed into and K	of the formula: 1 - exp ( - K m n exp ( - lambda S ))	using the transformation: 1 - exp ( -exp -(lambda S + log(K m n) ))			1 - exp ( -exp( - lambda ( S + log(K m n) / lambda))			a = log(K m n) / lambda			a lambda = log (K m n)			exp(a lambda)  = K m n 	 but from above: a lambda = log (1/nlib sum{exp( -x[i]*lambda)})	 so:            K m n = (1/n sum{ exp( -x[i] *lambda)})			K = sum{}/(nlib m n )*/voidalloc_hist(struct llen_str *llen){  int max_llen, i;  max_llen = llen->max;  if (llen->hist == NULL) {    llen->hist = (int *)calloc((size_t)(max_llen+1),sizeof(int));    llen->score_sums = (double *)calloc((size_t)(max_llen + 1),sizeof(double));    llen->score2_sums =(double *)calloc((size_t)(max_llen + 1),sizeof(double));    llen->score_var = (double *)calloc((size_t)(max_llen + 1),sizeof(double));  }  for (i=0; i< max_llen+1; i++) {      llen->hist[i] = 0;      llen->score_var[i] = llen->score_sums[i] = llen->score2_sums[i] = 0.0;  }}  voidfree_hist(struct llen_str *llen){  if (llen->hist!=NULL) {    free(llen->score_var);    free(llen->score2_sums);    free(llen->score_sums);    free(llen->hist);    llen->hist=NULL;  }}voidinithist(struct llen_str *llen, struct pstruct *ppst, int max_hscore){  llen->max = MAX_LLEN;
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -