📄 scaleswt.c
字号:
/* scaleswt.c *//* $Name: fa35_03_06 $ - $Id: scaleswt.c,v 1.22 2007/06/29 17:11:42 wrp Exp $ *//* as of 24 Sept, 2000 - scaleswn uses no global variables *//* copyright (c) 1995, 1996, 2000, 2002 William R. Pearson This version is designed for fasts/f, which used Tatusov probabilities for statistical estimates, but still needs a quick-and-dirty linear regression fit to rank things For comparisons that obey tatusov statistics, we try whenever possible to provide accurate e_scores, rather than raw scores. As a result, no lambda/K fitting is required; and process_hist() can be called atthe very beginning of the search to initialize some of the statistics structures and find_zp(). find_zp() must still return a valid z_score surrogate, as comp_lib.c/p2_complib.c continue to use z_score's to rank hits, save the best, etc. If e_score's cannot be calculated, the process_hist() provides linear regression fitting for conventional z_score estimates.*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <limits.h>#include <float.h>#include <math.h>#include <limits.h>#include "defs.h"#include "param.h"#include "structs.h"#ifndef PCOMPLIB#include "mw.h"#else#include "p_mw.h"#endif#define MAXHIST 50#define MAX_LLEN 200#define LHISTC 5#define VHISTC 5#define MAX_SSCORE 300#define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */#define LN_FACT 10.0#ifndef M_LN2#define M_LN2 0.69314718055994530942#endif#define EULER_G 0.57721566490153286060#define PI_SQRT6 1.28254983016186409554#ifndef M_SQRT2#define M_SQRT2 1.41421356237#endif#define LN200 5.2983173666#define ZS_MAX 400.0 /* used to prevent underflow on some machines */#define TOLERANCE 1.0e-12#define TINY 1.0e-6/* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/struct rstat_str { double ngLambda, ngK, ngH; double rho, rho_e, mu, mu_e, mean_var, var_e; /* ?_e:std. error of ? *//* used by REG2_STATS */ double rho2, mu2, var_cutoff; int n_trimmed; /* excluded because of high z-score */ int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin */ double tat_a, tat_b, tat_c, spacefactor; int have_tat; int tie_j;};#define AVE_STATS 0 /* no length effect, only mean/variance */double find_zt(int score, double escore, int len, double comp, struct rstat_str *);double find_zn(int score, double escore, int len, double comp, struct rstat_str *);double power(double, int);void sortbesto(double *, int );extern void sortbeste(struct beststr **bptr, int nbest);int proc_hist_n(struct stat_str *sptr, int n, struct pstruct *ppst, struct hist_str *histp, int do_trim, struct rstat_str *);#define REG_STATS 1 /* length-regression scaled */double find_zr(int score, double escore, int len, double comp, struct rstat_str *);int proc_hist_r(struct stat_str *sptr, int n, struct pstruct *ppst, struct hist_str *histp, int do_trim, struct rstat_str *rs);double (*find_zp)(int score, double escore, int len, double comp, struct rstat_str *) = &find_zr;struct llen_str { int min, max; int max_score, min_score; int *hist; double *score_sums, *score2_sums; double *score_var; int max_length, min_length, zero_s; int fit_flag;};static void inithist(struct llen_str *, struct pstruct *, int);static void free_hist( struct llen_str *);static void addhist(struct llen_str *, int, int, int);static void prune_hist(struct llen_str *, int, int, int, long *);void inithistz(int, struct hist_str *histp);void addhistz(double zs, struct hist_str *histp);static void fit_llen(struct llen_str *, struct rstat_str *);static void fit_llens(struct llen_str *, struct rstat_str *);void linreg(double *lny, double *x, double *lnx, int n, double *a, double *b, double *c, int start);double calc_spacefactor(const unsigned char *, int, int, int);double det(double a11, double a12, double a13, double a21, double a22, double a23, double a31, double a32, double a33);double factorial (int a, int b);/* void set_db_size(int, struct db_str *, struct hist_str *); */#ifdef DEBUGFILE *tmpf;#endifintprocess_hist(struct stat_str *sptr, int nstats, struct mngmsg m_msg, struct pstruct *ppst, struct hist_str *histp, struct rstat_str **rs_sp, int do_hist ){ int zsflag, do_trim; struct rstat_str *rs_s; if (ppst->zsflag < 0) { *rs_sp = NULL; return ppst->zsflag; } if (*rs_sp == NULL) { if ((rs_s=(struct rstat_str *)calloc(1,sizeof(struct rstat_str)))==NULL) { fprintf(stderr," cannot allocate rs_snion: %ld\n",sizeof(struct rstat_str)); exit(1); } else *rs_sp = rs_s; } else { rs_s = *rs_sp; memset(rs_s,0,sizeof(struct rstat_str)); } if (m_msg.escore_flg) { find_zp = &find_zt; inithistz(MAXHIST,histp); return 1; } if (nstats < 20) { fprintf(stderr," too few sequences for sampling: %d\n",nstats); free(rs_s); *rs_sp = NULL; return -1; } rs_s->ngLambda = m_msg.Lambda; rs_s->ngK = m_msg.K; rs_s->ngH = m_msg.H; zsflag = ppst->zsflag; if (zsflag >= 10) { zsflag -= 10; do_trim = 0; } else do_trim = 1; find_zp = &find_zr; return proc_hist_r(sptr, nstats, ppst, histp, do_trim, rs_s);}intcalc_thresh(struct pstruct *ppst, int nstats, double Lambda, double K, double H, double *zstrim){ int max_hscore; double ave_n1, tmp_score, z, l_fact; if (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA) { ave_n1 = 5000.0; l_fact = 1.0; } else { ave_n1 = 400.0; l_fact = 0.7; }/* max_hscore = MAX_SSCORE; *//* mean expected for ppst->n0 * 400 for protein, 5000 for DNA *//* we want a number of offsets that is appropriate for the database size so far (nstats)*//* the calculation below sets a high-score threshold using an ungapped lambda, but errs towards the high-score side by using E()=0.001 and calculating with 0.70*lambda, which is the correct for going from ungapped to -12/-2 gapped lambda with BLOSUM50*/#ifndef NORMAL_DIST tmp_score = 0.01/((double)nstats*K*(double)ppst->n0*ave_n1); tmp_score = -log(tmp_score)/(Lambda*l_fact); max_hscore = (int)(tmp_score+0.5); z = 1.0/(double)nstats; z = (log(z)+EULER_G)/(-PI_SQRT6);#else max_hscore = 100; z = 5.0;#endif *zstrim = 10.0*z+50.0; return max_hscore;}intproc_hist_r(struct stat_str *sptr, int nstats, struct pstruct *ppst, struct hist_str *histp, int do_trim, struct rstat_str *rs){ int i, max_hscore; double zs, ztrim; char s_string[128]; struct llen_str llen; char *f_string; llen.fit_flag=1; llen.hist=NULL; max_hscore = calc_thresh(ppst, nstats, rs->ngLambda, rs->ngK, rs->ngH, &ztrim); inithist(&llen, ppst,max_hscore); f_string = &(histp->stat_info[0]); for (i = 0; i<nstats; i++) addhist(&llen,sptr[i].score,sptr[i].n1, max_hscore); histp->entries = nstats - llen.zero_s; if ((llen.max_score - llen.min_score) < 10) { free_hist(&llen); llen.fit_flag = 0; find_zp = &find_zn; return proc_hist_n(sptr, nstats, ppst, histp, do_trim, rs); } fit_llen(&llen, rs); /* now we have rho, mu, rho2, mu2, mean_var to set the parameters for the histogram */ if (!llen.fit_flag) { /* the fit failed, fall back to proc_hist_n */ free_hist(&llen); find_zp = &find_zn; return proc_hist_n(sptr,nstats, ppst, histp, do_trim, rs); } rs->n_trimmed= rs->n1_trimmed = rs->nb_trimmed = 0; if (do_trim) { if (llen.fit_flag) { for (i = 0; i < nstats; i++) { zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs); if (zs < 20.0 || zs > ztrim) { rs->n_trimmed++; prune_hist(&llen,sptr[i].score,sptr[i].n1, max_hscore, &(histp->entries)); } } } /* fprintf(stderr,"Z-trimmed %d entries with z > 5.0\n", rs->n_trimmed); */ if (llen.fit_flag) fit_llens(&llen, rs); /* fprintf(stderr,"Bin-trimmed %d entries in %d bins\n", rs->n1_trimmed,rs->nb_trimmed); */ } free_hist(&llen); /* rst all the scores in the histogram */ if (ppst->zsflag < 10) s_string[0]='\0'; else if (ppst->zs_win > 0) sprintf(s_string,"(shuffled, win: %d)",ppst->zs_win); else strncpy(s_string,"(shuffled)",sizeof(s_string)); inithistz(MAXHIST, histp); sprintf(f_string,"%s Expectation_n fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim: %d B-trim: %d in %d/%d\n Lambda= %6.4f", s_string, rs->rho*LN_FACT,sqrt(rs->rho_e),rs->mu,sqrt(rs->mu_e), rs->mean_var,sqrt(rs->var_e), llen.zero_s, rs->n_trimmed, rs->n1_trimmed, rs->nb_trimmed, rs->nb_tot, PI_SQRT6/sqrt(rs->mean_var)); return REG_STATS;}intproc_hist_n(struct stat_str *sptr, int nstats, struct pstruct *ppst, struct hist_str *histp, int do_trim, struct rstat_str *rs){ int i, j; double s_score, s2_score, ssd; double ztrim; int nit, max_hscore; char s_string[128]; char *f_string; f_string = &(histp->stat_info[0]); /* db->entries = db->length = db->carry = 0; */ max_hscore = calc_thresh(ppst, nstats, rs->ngLambda, rs->ngK, rs->ngH, &ztrim); s_score = s2_score = 0.0; histp->entries = 0; for ( j = 0, i = 0; i < nstats; i++) { if (sptr[i].score > 0 && sptr[i].score <= max_hscore) { s_score += (ssd=(double)sptr[i].score); s2_score += ssd * ssd; histp->entries++; /* db->length += sptr[i].n1; if (db->length > LONG_MAX) { db->carry++; db->length -= LONG_MAX; } */ j++; } } if (j > 1 ) { rs->mu = s_score/(double)j; rs->mean_var = s2_score - (double)j * rs->mu * rs->mu; rs->mean_var /= (double)(j-1); } else { rs->mu = 50.0; rs->mean_var = 10.0; } if (rs->mean_var < 0.01) { rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0; } /* now remove some scores */ nit = 5; while (nit-- > 0) { rs->n_trimmed = 0; for (i=0; i< nstats; i++) { if (sptr[i].n1 < 0) continue; ssd = find_zn(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs); if (ssd > ztrim || ssd < 20.0) { /* fprintf(stderr,"removing %3d %3d %4.1f\n", sptr[i].score, sptr[i].n1,ssd); */ ssd = sptr[i].score; s_score -= ssd; s2_score -= ssd*ssd; j--; rs->n_trimmed++; histp->entries--; sptr[i].n1 = -sptr[i].n1; } } if (j > 1 ) { rs->mu = s_score/(double)j; rs->mean_var = s2_score - (double)j * rs->mu * rs->mu; rs->mean_var /= (double)(j-1); } else { rs->mu = 50.0; rs->mean_var = 10.0; } if (rs->mean_var < 0.01) { rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0; } if (rs->n_trimmed < LHISTC) { /* fprintf(stderr,"nprune %d at %d\n",nprune,nit); */ break; } } if (ppst->zsflag < 10) s_string[0]='\0'; else if (ppst->zs_win > 0) sprintf(s_string,"(shuffled, win: %d)",ppst->zs_win); else strncpy(s_string,"(shuffled)",sizeof(s_string)); sprintf(f_string,"%s unscaled statistics: mu= %6.4f var=%6.4f; Lambda= %6.4f", s_string, rs->mu,rs->mean_var,PI_SQRT6/sqrt(rs->mean_var)); return AVE_STATS;}/*This routine calculates the maximum likelihood estimates for theextreme value distribution exp(-exp(-(-x-a)/b)) using the formula <lambda> = x_m - sum{ x[i] * exp (-x[i]<lambda>)}/sum{exp (-x[i]<lambda>)} <a> = -<1/lambda> log ( (1/nlib) sum { exp(-x[i]/<lambda> } ) The <a> parameter can be transformed into and K of the formula: 1 - exp ( - K m n exp ( - lambda S )) using the transformation: 1 - exp ( -exp -(lambda S + log(K m n) )) 1 - exp ( -exp( - lambda ( S + log(K m n) / lambda)) a = log(K m n) / lambda a lambda = log (K m n) exp(a lambda) = K m n but from above: a lambda = log (1/nlib sum{exp( -x[i]*lambda)}) so: K m n = (1/n sum{ exp( -x[i] *lambda)}) K = sum{}/(nlib m n )*/voidalloc_hist(struct llen_str *llen){ int max_llen, i; max_llen = llen->max; if (llen->hist == NULL) { llen->hist = (int *)calloc((size_t)(max_llen+1),sizeof(int)); llen->score_sums = (double *)calloc((size_t)(max_llen + 1),sizeof(double)); llen->score2_sums =(double *)calloc((size_t)(max_llen + 1),sizeof(double)); llen->score_var = (double *)calloc((size_t)(max_llen + 1),sizeof(double)); } for (i=0; i< max_llen+1; i++) { llen->hist[i] = 0; llen->score_var[i] = llen->score_sums[i] = llen->score2_sums[i] = 0.0; }} voidfree_hist(struct llen_str *llen){ if (llen->hist!=NULL) { free(llen->score_var); free(llen->score2_sums); free(llen->score_sums); free(llen->hist); llen->hist=NULL; }}voidinithist(struct llen_str *llen, struct pstruct *ppst, int max_hscore){ llen->max = MAX_LLEN;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -