⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tatstats.c

📁 序列对齐 Compare a protein sequence to a protein sequence database or a DNA sequence to a DNA sequenc
💻 C
字号:
#include <stdlib.h>#include <stdio.h>#include <math.h>#include <string.h>#include "defs.h"#include "param.h"#include "tatstats.h"#ifndef PCOMPLIB#include "mw.h"#else#include "p_mw.h"#endif/* calc_priors() - calculate frequencies of amino-acids, possibly with counts *//* generate_tatprobs() - build the table of score probabilities if the   sequences are not too long */doubledet(double a11, double a12, double a13,    double a21, double a22, double a23,    double a31, double a32, double a33);double power(double r, int p){  double tr;  int neg;  if (r==0.0) return(p==0?1.0:0.0);  if (neg = p<0) p = -p;  tr = 1.0;  while (p>0) {    if (p & 1) tr *= r;    p >>= 1;    if (p) r *= r;  }  return((neg? 1.0/tr: tr));}doublefactorial (int a, int b) {  double res = 1.0;  if(a == 0) { return 1.0; }  while(a > b) {    res *= (double) a;    a--;  }  return res;}voidcalc_priors(double *priors,	    struct pstruct *ppst,	    struct f_struct *f_str,	    const unsigned char *aa1, int n1,	    int pseudocts){  long counts[26], sum;  int i;  if(n1 == 0 && f_str->priors[1] > 0.0) {    for(i = 1 ; i <= ppst->nsq ; i++) {      priors[i] = f_str->priors[i];    }    return;  }  if(n1 == 0) {    if (ppst->dnaseq==SEQT_PROT ) {      /* Robinson & Robinson residue counts from Stephen Altschul */      counts[ 1] = 35155;    /*   A  */      counts[ 2] = 23105;    /*   R  */        counts[ 3] = 20212;    /*   N  */        counts[ 4] = 24161;    /*   D  */        counts[ 5] =  8669;    /*   C  */        counts[ 6] = 19208;    /*   Q  */        counts[ 7] = 28354;    /*   E  */        counts[ 8] = 33229;    /*   G  */        counts[ 9] =  9906;    /*   H  */        counts[10] = 23161;    /*   I  */        counts[11] = 40625;    /*   L  */        counts[12] = 25872;    /*   K  */        counts[13] = 10101;    /*   M  */        counts[14] = 17367;    /*   F  */        counts[15] = 23435;    /*   P  */        counts[16] = 32070;    /*   S  */        counts[17] = 26311;    /*   T  */        counts[18] =  5990;    /*   W  */        counts[19] = 14488;    /*   Y  */        counts[20] = 29012;    /*   V  */        counts[21] =     0;    /*   B  */        counts[22] =     0;    /*   Z  */        counts[23] =     0;    /*   X   */       counts[24] =     0;    /*   *   */      counts[25] =     0;    /*   J   */    }    else { /* SEQT_DNA */      counts[1] = 250;      counts[2] = 250;      counts[3] = 250;      counts[4] = 250;      for (i=5; i<=ppst->nsq; i++) counts[i]=0;    }  } else {    memset(&counts[0], 0, sizeof(counts));    for(i = 0 ; i < n1 ; i++) {      if(aa1[i] > ppst->nsq || aa1[i] < 1) continue;      counts[aa1[i]]++;    }  }  sum = 0;  for(i = 1 ; i <= ppst->nsq ; i++) sum += counts[i];    for(i = 1 ; i <= ppst->nsq ; i++) {    if(n1 == 0) {      priors[i] = (double) counts[i] / (double) sum;    } else {      priors[i] = ( ((double) pseudocts * f_str->priors[i]) + (double) counts[i] ) / ( (double) sum + (double) pseudocts );    }  }  return;} intmax_score(int *scores, int nsq) {  int max, i;  max = -BIGNUM;  for ( i = 1 ; i <= nsq ; i++ ) {    if (scores[i] > max) max = scores[i];  }   return max;}intmin_score(int *scores, int nsq) {  int min, i;  min = BIGNUM;  for (i = 1 ; i <= nsq ; i++ ) {    if (scores[i] < min) min = scores[i];  }  return min;}doublecalc_tatusov ( struct slink *last,	       struct slink *this,	       const unsigned char *aa0, int n0,	       const unsigned char *aa1, int n1,	       int **pam2, int nsq,	       struct f_struct *f_str,	       int pseudocts,	       int do_opt,	       int zsflag	       ){  int i, is, j, k;  double *priors, my_priors[MAXSQ], tatprob, left_tatprob, right_tatprob;  unsigned char *query = NULL;  int length, maxlength, sumlength, sumscore, tmp, seg;  int start, stop;  struct slink *sl;  int N;  double *tatprobsptr;#if defined(FASTS) || defined(FASTM)  int index = 0;  int notokay = 0;#endif  struct tat_str *oldtat = NULL, *newtat = NULL;#if defined(FASTS) || defined(FASTM)  start = this->vp->start - this->vp->dp + f_str->noff;  stop = this->vp->stop - this->vp->dp + f_str->noff;  tmp = stop - start + 1;#else  /*    FASTF alignments can also hang off the end of library sequences,    but no query residues are used up in the process, but we have to    keep track of which are  */  tmp = 0;  for(i = 0, j = 0 ; i < n0 ; i++) {    if (this->vp->used[i] == 1) {tmp++; }  }#endif  sumlength = maxlength = length = tmp;  seg = 1;  sumscore = this->vp->score;#if defined(FASTS) || defined(FASTM)  if(f_str->aa0b[start] == start && f_str->aa0e[stop] == stop) {    index |= (1 << f_str->aa0i[start]);  } else {    notokay |= (1 << f_str->aa0i[start]);  }#endif  for(sl = last; sl != NULL ; sl = sl->prev) {#if defined(FASTS) || defined(FASTM)    start = sl->vp->start - sl->vp->dp + f_str->noff;    stop = sl->vp->stop - sl->vp->dp + f_str->noff;    tmp = stop - start + 1;#else    tmp = 0;    for(i = 0, j = 0 ; i < n0 ; i++) {      if(sl->vp->used[i] == 1) {	tmp++;      }    }#endif    sumlength += tmp;    maxlength = tmp > maxlength ? tmp : maxlength;    seg++;    sumscore += sl->vp->score;#if defined(FASTS) || defined(FASTM)    if(f_str->aa0b[start] == start && f_str->aa0e[stop] == stop) {      index |= (1 << f_str->aa0i[start]);    } else {      notokay |= (1 << f_str->aa0i[start]);    }#endif  }  tatprob = -1.0;     #if defined(FASTS) || defined(FASTM)  /* for T?FASTS, we try to use what we've precalculated: */  /* with z = 3, do_opt is true, but we can use precalculated - with     all other z's we can use precalculated only if !do_opt */  if(!notokay && f_str->tatprobs != NULL) {    /* create our own newtat and copy f_str's tat into it */    index--;    newtat = (struct tat_str *) malloc(sizeof(struct tat_str));    if(newtat == NULL) {      fprintf(stderr, "Couldn't calloc memory for newtat.\n");      exit(1);    }        memcpy(newtat, f_str->tatprobs[index], sizeof(struct tat_str));    newtat->probs = (double *) calloc(f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore + 1, sizeof(double));    if(newtat->probs == NULL) {      fprintf(stderr, "Coudln't calloc memory for newtat->probs.\n");      exit(1);    }    memcpy(newtat->probs, f_str->tatprobs[index]->probs,	   (f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore + 1) * sizeof(double));     tatprob = f_str->intprobs[index][sumscore - f_str->tatprobs[index]->lowscore];  } else { /* we need to recalculate from scratch */#endif    /* for T?FASTF, we're always recalculating from scratch: */    query = (unsigned char *) calloc(length, sizeof(unsigned char));    if(query == NULL) {      fprintf(stderr, "Couldn't calloc memory for query.\n");      exit(1);    }    #if defined(FASTS) || defined(FASTM)    start = this->vp->start - this->vp->dp + f_str->noff;    for(i = 0, j = 0 ; i < length ; i++) {      query[j++] = aa0[start + i];    }#else    for(i = 0, j = 0 ; i < n0 ; i++) {      if (this->vp->used[i] == 1) {query[j++] = aa0[i];}    }#endif    /*  calc_priors - not currently implemented for aa1 dependent */    /*     if( (do_opt && zsflag == 2) || zsflag == 4 ) {          priors = &my_priors[0];      calc_priors(priors, f_str, aa1, n1, pseudocts);    } else {      priors = f_str->priors;    }    */    priors = f_str->priors;    oldtat = (last != NULL ? last->tat : NULL);    generate_tatprobs(query, 0, length - 1, priors, pam2, nsq, &newtat, oldtat);    free(query);#if defined(FASTS) || defined(FASTM)  } /* close the FASTS-specific if-else from above */#endif  this->newtat = newtat;    if(tatprob < 0.0) { /* hasn't been set by precalculated FASTS intprobs */    /* integrate probabilities >= sumscore */    tatprobsptr = newtat->probs;    is = i = newtat->highscore - newtat->lowscore;    N = sumscore - newtat->lowscore;    right_tatprob = 0;    for ( ;  i >= N; i--) {      right_tatprob += tatprobsptr[i];    }    left_tatprob = tatprobsptr[0];    for (i = 1 ; i < N ; i++ ) {      left_tatprob += tatprobsptr[i];    }    if (right_tatprob < left_tatprob) {tatprob = right_tatprob;}    else {tatprob = 1.0 - left_tatprob;}    tatprob /= (right_tatprob+left_tatprob);  }  if (maxlength > 0) {    n1 += 2 * (maxlength - 1);  }#ifndef FASTM  tatprob *= factorial(n1 - sumlength + seg, n1 - sumlength);#else  tatprob *= power(n1 - sumlength,seg)/(1<<seg);#endif  if(tatprob > 0.01)    tatprob = 1.0 - exp(-tatprob);   return tatprob;}voidgenerate_tatprobs(const unsigned char *query,		  int begin,		  int end,		  double *priors,		  int **pam2,		  int nsq,		  struct tat_str **tatarg,		  struct tat_str *oldtat){  int i, j, k, l, m, n, N, highscore, lowscore;  int *lowrange = NULL, *highrange = NULL;  double *probs = NULL, *newprobs = NULL, *priorptr, tmp;  struct tat_str *tatprobs = NULL;  int *pamptr, *pamptrsave;  if((tatprobs = (struct tat_str *) calloc(1, sizeof(struct tat_str)))==NULL) {    fprintf(stderr, "Couldn't allocate individual tatprob struct.\n");    exit(1);  }  n = end - begin + 1;  if ( (lowrange = (int *) calloc(n, sizeof(int))) == NULL ) {    fprintf(stderr, "Couldn't allocate memory for lowrange.\n");    exit(1);  }    if ( (highrange = (int *) calloc(n, sizeof(int))) == NULL ) {    fprintf(stderr, "Couldn't allocate memory for highrange.\n");    exit(1);  }  /* calculate the absolute highest and lowest score possible for this */  /* segment.  Also, set the range we need to iterate over at each position */  /* in the query: */  if(oldtat == NULL) {    highscore = lowscore = 0;  } else {    highscore = oldtat->highscore;    lowscore = oldtat->lowscore;  }  for ( i = 0 ; i < n ; i++ ) {    if (query[begin+i] == 0) break;    highscore =      (highrange[i] = highscore + max_score(pam2[query[begin + i]], nsq));    lowscore =      (lowrange[i] = lowscore + min_score(pam2[query[begin + i]], nsq));    /*    fprintf(stderr, "i: %d, max: %d, min: %d, high[i]: %d, low[i]: %d, high: %d, low: %d, char: %d\n",	    i,	    max_score(pam2[query[begin + i]], nsq),	    min_score(pam2[query[begin + i]], nsq),	    highrange[i], lowrange[i],	    highscore, lowscore, query[begin + i]);     */  }  /* allocate an array of probabilities for all possible scores */  /* i.e. if highest score possible is 50 and lowest score possible */  /* is -20, then there are 50 - (-20) + 1 = 71 possible different */  /* scores (including 0): */  N = highscore - lowscore;  if ( (probs = (double *) calloc(N + 1, sizeof(double))) == NULL ) {    fprintf(stderr, "Couldn't allocate probability matrix : %d.\n", N + 1);    exit(1);  }  if(oldtat == NULL) {    /* for the first position, iterate over the only possible scores, */    /* summing the priors for the amino acids that can yield each score. */    pamptr = pam2[query[begin]];    for ( i = 1 ; i <= nsq ; i++ ) {      if(priors[i] > 0.0) {	probs[(pamptr[i] - lowscore)] += priors[i];      }    }  } else {    /* Need to copy the data out of oldtat->probs into probs */    memcpy( &probs[oldtat->lowscore - lowscore],	    oldtat->probs,	    (oldtat->highscore - oldtat->lowscore + 1) * sizeof(double));  }  if ( (newprobs = (double *) calloc(N + 1, sizeof(double))) == NULL ) {    fprintf(stderr, "Couldn't allocate newprobs matrix.\n");    exit(1);  }  /* now for each remaining residue in the segment ... */  for ( i = (oldtat == NULL ? 1 : 0) ; i < n ; i++ ) {    pamptrsave = pam2[query[begin + i]];    /* ... calculate new probability distribution .... */    /* ... for each possible score (limited to current range) ... */    for ( j = lowrange[i] - lowscore,	    k = highrange[i] - lowscore ;	  j <= k ;	  j++ ) {            tmp = 0.0;      pamptr = &pamptrsave[1];      priorptr = &priors[1];      /* ... for each of the possible alignment scores at this position ... */      for ( l = 1 ;	    l <= nsq ;	    l++) {	/* make sure we don't go past highest possible score, or past           the lowest possible score; not sure why this can happen */	m = j - *pamptr++;	if ( m <= N && m >= 0 ) {	  /* update the probability of getting score j: */	  tmp += probs[m] * *priorptr++;	}      }      newprobs[j] += tmp;    }    /* save the new set of probabilities, get rid of old; we don't       necessarily have to copy/clear all N+1 slots, we could use       high/low score boundaries -- not sure that's worth the       effort. */    memcpy(probs, newprobs, (N + 1) * sizeof(double));    memset(newprobs, 0, (N + 1) * sizeof(double));  }  free(newprobs);  free(highrange);  free(lowrange);  tatprobs->probs = probs;  /*  tatprobs->intprobs = intprobs; */  tatprobs->lowscore = lowscore;  tatprobs->highscore = highscore;  *tatarg = tatprobs;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -