⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 scaleswn.c

📁 序列对齐 Compare a protein sequence to a protein sequence database or a DNA sequence to a DNA sequenc
💻 C
📖 第 1 页 / 共 5 页
字号:
/* scaleswn.c *//* $Name: fa35_03_06 $ - $Id: scaleswn.c,v 1.63 2007/10/04 16:15:07 wrp Exp $ *//* as of 24 Sept, 2000 - scaleswn uses no global variables *//*	Provide statistical estimates using an extreme value distribution	copyright (c) 1995, 1996, 2000 William R. Pearson	This code provides multiple methods for scaling sequence	similarity scores to correct for length effects.	Currently, six methods are available:	ppst->zsflag = 0 - no scaling  (AVE_STATS)	ppst->zsflag = 1 - regression-scaled scores (REG_STATS)	ppst->zsflag = 2 - (revised) MLE Lambda/K scaled scores (MLE_STATS)	ppst->zsflag = 3 - scaling using Altschul's parameters (AG_STATS)	ppst->zsflag = 4 - regression-scaled with iterative outlier removal (REGI_STATS)	ppst->zsflag = 5 = like 1, but length scaled variance (REG2_STATS)	ppst->zsflag = 6 = like 2, but uses lambda composition/scale (MLE2_STATS)	ppst->zsflag = 11 = 10 + 1 - use random shuffles, method 1*/#include <stdio.h>#include <stdlib.h>#include <math.h>#include <string.h>#include <limits.h>#include "defs.h"#include "param.h"#include "structs.h"#ifndef PCOMPLIB#include "mw.h"#else#include "p_mw.h"#endif#define MAXHIST 50#define MAX_LLEN 200#define LHISTC 5#define VHISTC 5#define MAX_SSCORE 300#define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */#define LN_FACT 10.0#ifndef M_LN2#define M_LN2 0.69314718055994530942#endif#define EULER_G 0.57721566490153286060#define PI_SQRT6 1.28254983016186409554#ifndef M_SQRT2#define M_SQRT2 1.41421356237#endif#define LN200 5.2983173666#define ZS_MAX 400.0	/* used to prevent underflow on some machines */#define TOLERANCE 1.0e-12#define TINY 1.0e-6/* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/struct rstat_str {  double rho, rho_e, mu, mu_e, mean_var, var_e;  /* var_e:std. error of var */  double mean_var_sqrt;		/* sqrt(mean_var) - used frequently *//* used by REG2_STATS */  double rho2, mu2, var_cutoff;  int n_trimmed; /* excluded because of high z-score */  int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin */};/* used by AG_STATS, MLE_STATS */struct ag_stat_str {  double K, Lambda, H, a_n0f, a_n0;};/* used by MLE2_STATS */struct mle2_stat_str {  double a_n0;  double mle2_a0, mle2_a1, mle2_a2, mle2_b1;  double ave_comp, max_comp, ave_H;};struct pstat_str {  int zsflag;  double ngLambda, ngK, ngH;  double ave_n1;  union {    struct rstat_str rg;    struct ag_stat_str ag;    struct mle2_stat_str m2;  } r_u;};#define AVE_STATS 0	/* no length effect, only mean/variance */double find_zn(int score, double escore, int len, double comp, struct pstat_str *);int proc_hist_n(struct stat_str *sptr, int n, 		struct pstruct *ppst, struct hist_str *histp, int do_trim,		struct pstat_str *);#define REG_STATS 1	/* length-regression scaled */#define REGI_STATS 4	/* length regression, iterative */double find_zr(int score, double escore, int len, double comp, struct pstat_str *);int proc_hist_r(struct stat_str *sptr, int n,		struct pstruct *ppst, struct hist_str *histp,		int do_trim, struct pstat_str *pu);#define MLE_STATS 2	/* MLE for lambda, K */double find_ze(int score, double escore, int len, double comp, struct pstat_str *);int proc_hist_ml(struct stat_str *sptr, int n,		 struct pstruct *ppst, struct hist_str *histp, int do_trim,		 struct pstat_str *);#define AG_STATS 3	/* Altschul-Gish parameters */double find_za(int score, double escore, int len, double comp, struct pstat_str *);int proc_hist_a(struct stat_str *sptr, int n,		struct pstruct *ppst, struct hist_str *histp, int do_trim,		struct pstat_str *);#define REG2_STATS 5	/* length regression on mean + variance */double find_zr2(int score, double escore, int len, double comp, struct pstat_str *);int proc_hist_r2(struct stat_str *sptr, int n,		 struct pstruct *ppst, struct hist_str *histp, int do_trim,		 struct pstat_str *);#define MLE2_STATS 6	/* MLE stats using comp(lambda) */double find_ze2(int score, double escore, int length, double comp, struct pstat_str *);int proc_hist_ml2(struct stat_str *sptr, int n,		  struct pstruct *ppst, struct hist_str *histp, int do_trim,		  struct pstat_str *);#ifdef USE_LNSTATS#define LN_STATS 2double find_zl(int score, double escore, int len, double comp, struct pstat_str *);int proc_hist_ln(struct stat_str *sptr, int n, 		 struct pstruct *ppst, struct hist_str *histp, int do_trim,		 struct pstat_str *);#endif/* scaleswn.c local variables that belong in their own structure */double (*find_zp)(int score, double escore, int len, double comp, struct pstat_str *) = &find_zr;struct llen_str {  int min, max;  int max_score, min_score;  int *hist;  double *score_sums, *score2_sums;  double *score_var;  int max_length, min_length, zero_s;  int fit_flag;};static void inithist(struct llen_str *, struct pstruct *, int);static void free_hist( struct llen_str *);static void addhist(struct llen_str *, int, int, int);static void prune_hist(struct llen_str *, int, int, int, long *);void inithistz(int, struct hist_str *histp);void addhistz(double zs, struct hist_str *histp);void addhistzp(double zs, struct hist_str *histp);static void fit_llen(struct llen_str *, struct rstat_str *);static void fit_llen2(struct llen_str *, struct rstat_str *);static void fit_llens(struct llen_str *, struct rstat_str *);extern void sortbeste(struct beststr **bptr, int nbest);/* void set_db_size(int, struct db_str *, struct hist_str *); */#ifdef DEBUGFILE *tmpf;#endifintprocess_hist(struct stat_str *sptr, int nstats, 	     struct mngmsg m_msg,	     struct pstruct *ppst,	     struct hist_str *histp,	     struct pstat_str **ps_sp,	     int do_hist){  int zsflag, r_zsflag, do_trim, i;  double ave_n1;  struct pstat_str *ps_s;  if (ppst->zsflag < 0) {    *ps_sp = NULL;    return ppst->zsflag;  }  if (*ps_sp == NULL) {    if ((ps_s=(struct pstat_str *)calloc(1,sizeof(struct pstat_str)))==NULL) {      fprintf(stderr," cannot allocate pstat_union: %ld\n",sizeof(struct pstat_str));      exit(1);    }    else {      *ps_sp = ps_s;    }  }  else {    ps_s = *ps_sp;    memset(ps_s,0,sizeof(struct pstat_str));  }  ps_s->ngLambda = m_msg.Lambda;  ps_s->ngK = m_msg.K;  ps_s->ngH = m_msg.H;  zsflag = ppst->zsflag;  if (nstats < 10) zsflag = AG_STATS;/*#ifdef DEBUG  if (ppst->debug_lib) {    tmpf=fopen("tmp_stats.res","w+");    for (i=0; i<nstats; i++) fprintf(tmpf,"%d\t%d\n",sptr[i].score,sptr[i].n1);    fclose(tmpf);  }#endif*/  if (zsflag >= 10) {    zsflag -= 10;    do_trim = 0;  }  else do_trim = 1;#ifdef USE_LNSCALE  if (zsflag==LN_STATS) {    find_zp = &find_zl;    ps_s->zsflag=r_zsflag = proc_hist_ln(sptr, nstats, histp, do_trim, ps_s);  }#else  if (zsflag==MLE_STATS) {    find_zp = &find_ze;    ps_s->zsflag=r_zsflag = proc_hist_ml(sptr, nstats, ppst, histp, do_trim, ps_s);  }#endif  else if (zsflag==REG_STATS) {    find_zp = &find_zr;    ps_s->zsflag=r_zsflag = proc_hist_r(sptr, nstats, ppst, histp, do_trim,  ps_s);  }  else if (zsflag==AG_STATS) {    find_zp = &find_za;    ps_s->zsflag=r_zsflag = proc_hist_a(sptr, nstats, ppst, histp, do_trim,  ps_s);  }  else if (zsflag==REGI_STATS) {    find_zp = &find_zr;    ps_s->zsflag=r_zsflag = proc_hist_r2(sptr,nstats, ppst, histp, do_trim,  ps_s);  }  else if (zsflag==REG2_STATS) {    find_zp = &find_zr2;    ps_s->zsflag=r_zsflag = proc_hist_r(sptr,nstats, ppst, histp, do_trim,  ps_s);  }#if !defined(TFAST) && !defined(FASTX)  else if (zsflag == MLE2_STATS) {    find_zp = &find_ze2;    ps_s->zsflag=r_zsflag = proc_hist_ml2(sptr, nstats, ppst, histp, do_trim,  ps_s);  }#endif  else {	/* AVE_STATS */    find_zp = &find_zn;    ps_s->zsflag=r_zsflag = proc_hist_n(sptr,nstats, ppst, histp, do_trim,  ps_s);  }  if (!ppst->zdb_size_set) ppst->zdb_size = m_msg.db.entries;  if (!do_hist) {    histp->entries = nstats; /* db->entries = 0; */    inithistz(MAXHIST, histp);    for (i = 0; i < nstats; i++) {      if (sptr[i].n1 < 0) sptr[i].n1 = -sptr[i].n1;      addhistz(find_zp(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp,ps_s),	       histp);    }  }  return r_zsflag;}intcalc_thresh(struct pstruct *ppst, struct stat_str *sptr, int nstats, 	    double Lambda, double K, double H, double *zstrim){  int max_hscore;  int i;  double ave_n1, tmp_score, z, l_fact;  ave_n1 = 0.0;  for (i=0; i<nstats; i++) {    ave_n1 += (double)sptr[i].n1;  }  if (nstats >= 1) ave_n1 /= nstats;  if (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA) {    l_fact = 1.0;  }  else {    l_fact = 0.7;  }/*  max_hscore = MAX_SSCORE; *//*  mean expected for ppst->n0 * 400 for protein, 5000 for DNA *//*  we want a number of offsets that is appropriate for the database size so    far (nstats)*//*  the calculation below sets a high-score threshold using an  ungapped lambda, but errs towards the high-score side by using  E()=0.001 and calculating with 0.70*lambda, which is the correct for  going from ungapped to -12/-2 gapped lambda with BLOSUM50*/#ifndef NORMAL_DIST  if (K < 1.0e-50) K = 0.02;	/* prevent floating pt underflow with K=0 */  tmp_score = 0.01/((double)nstats*K*(double)ppst->n0*ave_n1);  tmp_score = -log(tmp_score)/(Lambda*l_fact);  max_hscore = (int)(tmp_score+0.5);  z = 1.0/(double)nstats;  z = (log(z)+EULER_G)/(- PI_SQRT6);#else  max_hscore = 100;  z = 5.0;#endif  *zstrim = 10.0*z+50.0;  return max_hscore;}intproc_hist_r(struct stat_str *sptr, int nstats,	    struct pstruct *ppst, struct hist_str *histp,	    int do_trim, struct pstat_str *pu){  int i, max_hscore;  double zs, ztrim;  char s_string[128];  char rho_str[32];	/* used for simplifying output label */  double rho_val;  struct llen_str llen;  char *f_string;  llen.fit_flag=1;  llen.hist=NULL;  max_hscore = calc_thresh(ppst, sptr,  nstats, pu->ngLambda,			   pu->ngK, pu->ngH, &ztrim);  inithist(&llen, ppst,max_hscore);  f_string = histp->stat_info;  for (i = 0; i<nstats; i++)    addhist(&llen,sptr[i].score,sptr[i].n1, max_hscore);  if ((llen.max_score - llen.min_score) < 10) {    free_hist(&llen);    llen.fit_flag = 0;    find_zp = &find_zn;    return proc_hist_n(sptr, nstats, ppst, histp, do_trim, pu);  }  fit_llen(&llen, &(pu->r_u.rg)); /* now we have rho, mu, rho2, mu2, mean_var				 to set the parameters for the histogram */  if (!llen.fit_flag) {	/* the fit failed, fall back to proc_hist_ml */    free_hist(&llen);    find_zp = &find_ze;    return proc_hist_ml(sptr,nstats, ppst, histp, do_trim, pu);  }  pu->r_u.rg.n_trimmed= pu->r_u.rg.n1_trimmed = pu->r_u.rg.nb_trimmed = 0;  if (do_trim) {    if (llen.fit_flag) {      for (i = 0; i < nstats; i++) {	zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, pu);	if (zs < 20.0 || zs > ztrim) {	  pu->r_u.rg.n_trimmed++;	  prune_hist(&llen,sptr[i].score,sptr[i].n1, max_hscore,		     &(histp->entries));	}      }    }  /*  fprintf(stderr,"Z-trimmed %d entries with z > 5.0\n", pu->r_u.rg.n_trimmed); */    if (llen.fit_flag) fit_llens(&llen, &(pu->r_u.rg));  /*   fprintf(stderr,"Bin-trimmed %d entries in %d bins\n", pu->r_u.rg.n1_trimmed,pu->r_u.rg.nb_trimmed); */  }  free_hist(&llen);  /* put all the scores in the histogram */  if (ppst->zsflag < 10) s_string[0]='\0';  else if (ppst->zs_win > 0)    sprintf(s_string,"(shuffled [%d], win: %d)",nstats,ppst->zs_win);  else     sprintf(s_string,"(shuffled [%d])",nstats);  /* #ifdef LOCAL_SCORE */  strncpy(rho_str,"ln(x)",sizeof(rho_str));  rho_val = pu->r_u.rg.rho*LN_FACT;  /*#else  strncpy(rho_str,"x",sizeof(rho_str));  rho_val = pu->r_u.rg.rho;#endif  */  if (ppst->zsflag == REG2_STATS || ppst->zsflag == 10+REG2_STATS) {    sprintf(f_string,"%s Expectation_v fit: rho(%s)= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f;\n rho2=%6.2f; mu2= %6.2f, 0's: %d Z-trim: %d  B-trim: %d in %d/%d",	    s_string, rho_str, rho_val ,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e),	    pu->r_u.rg.rho2,pu->r_u.rg.mu2,llen.zero_s,	    pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed, pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot);  }  else {    sprintf(f_string,"%s Expectation_n fit: rho(%s)= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim: %d  B-trim: %d in %d/%d\n Lambda= %8.6f",	    s_string, rho_str, rho_val,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e), pu->r_u.rg.mean_var,sqrt(pu->r_u.rg.var_e),	    llen.zero_s, pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed, pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot,	    PI_SQRT6/sqrt(pu->r_u.rg.mean_var));  }  return REG_STATS;}intproc_hist_r2(struct stat_str *sptr, int nstats,	     struct pstruct *ppst, struct hist_str *histp,	     int do_trim, struct pstat_str *pu){  int i, nit, nprune, max_hscore;  double zs, ztrim;  char s_string[128];  char *f_string;  struct llen_str llen;  llen.fit_flag=1;  llen.hist=NULL;  max_hscore = calc_thresh(ppst, sptr, nstats, pu->ngLambda,			   pu->ngK, pu->ngH, &ztrim);  inithist(&llen, ppst,max_hscore);  f_string = histp->stat_info;  for (i = 0; i<nstats; i++)    addhist(&llen,sptr[i].score,sptr[i].n1,max_hscore);  pu->r_u.rg.n_trimmed= pu->r_u.rg.n1_trimmed = pu->r_u.rg.nb_trimmed = 0;  if (do_trim) nit = 5;  else nit = 0;  while (nit-- > 0) {    nprune = 0;    fit_llen2(&llen, &(pu->r_u.rg));    for (i = 0; i < nstats; i++) {      if (sptr[i].n1 < 0) continue;      zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp,pu);      if (zs < 20.0 || zs > ztrim ) {	nprune++;	pu->r_u.rg.n_trimmed++;	prune_hist(&llen,sptr[i].score,sptr[i].n1,max_hscore,		   &(histp->entries));	sptr[i].n1 = -sptr[i].n1;      }    }    /*    fprintf(stderr," %d Z-trimmed at %d\n",nprune,nit); */    if (nprune < LHISTC) { break; }  }  fit_llen(&llen, &(pu->r_u.rg));  free_hist(&llen);  if (ppst->zsflag < 10) s_string[0]='\0';  else if (ppst->zs_win > 0)    sprintf(s_string,"(shuffled [%d], win: %d)",nstats,ppst->zs_win);  else     sprintf(s_string,"(shuffled [%d])",nstats);  sprintf(f_string,"%s Expectation_i fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f;\n mean_var=%6.4f+/-%6.3f 0's: %d Z-trim: %d N-it: %d\n Lambda= %8.6f",	  s_string,

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -