⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 weight.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 2 页
字号:
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. *****************************************************************//* weight.c * SRE, Thu Mar  3 07:56:01 1994 *  * Calculate weights for sequences in an alignment. * RCS $Id: weight.c,v 1.7 1999/07/20 20:37:30 eddy Exp $ */#include <ctype.h>#include <string.h>#include "squid.h"static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node);static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, 		       float *fwt, int node);static float simple_distance(char *s1, char *s2);static int    simple_diffmx(char **aseqs,int num, float ***ret_dmx);/* Function: GSCWeights() *  * Purpose:  Use Erik's tree-based algorithm to set weights for *           sequences in an alignment. upweight() and downweight() *           are derived from Graeme Mitchison's code. *            * Args:     aseq        - array of (0..nseq-1) aligned sequences *           nseq        - number of seqs in alignment   *           alen        - length of alignment *           wgt         - allocated [0..nseq-1] array of weights to be returned *            * Return:   (void) *           wgt is filled in. */voidGSCWeights(char **aseq, int nseq, int alen, float *wgt){  float **dmx;                 /* distance (difference) matrix */  struct phylo_s *tree;  float  *lwt, *rwt;           /* weight on left, right of this tree node */  float  *fwt;                 /* final weight assigned to this node */  int      i;    /* Sanity check first   */  if (nseq == 1) { wgt[0] = 1.0; return; }  /* I use a simple fractional difference matrix derived by   * pairwise identity. Perhaps I should include a Poisson   * distance correction.   */  MakeDiffMx(aseq, nseq, &dmx);  if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree))  Die("Cluster() failed");    /* Allocations   */  lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));  rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));  fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));    /* lwt and rwt are the total branch weight to the left and   * right of a node or sequence. They are 0..2N-2.  0..N-1 are    * the sequences; these have weight 0. N..2N-2 are the actual   * tree nodes.   */  for (i = 0; i < nseq; i++)    lwt[i] = rwt[i] = 0.0;				/* recursively calculate rwt, lwt, starting				   at node nseq (the root) */  upweight(tree, nseq, lwt, rwt, nseq);				/* recursively distribute weight across the				   tree */  fwt[nseq] = nseq;  downweight(tree, nseq, lwt, rwt, fwt, nseq);				/* collect the weights */  for (i = 0; i < nseq; i++)    wgt[i]  = fwt[i];  FMX2Free(dmx);  FreePhylo(tree, nseq);  free(lwt); free(rwt); free(fwt);}static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node){  int ld,rd;  ld = tree[node-nseq].left;  if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld);  rd = tree[node-nseq].right;  if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd);  lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen;  rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen;}static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node){  int ld,rd;  float lnum, rnum;  ld = tree[node-nseq].left;  rd = tree[node-nseq].right;  if (lwt[node] + rwt[node] > 0.0)    {      fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node]));      fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node]));    }  else    {      lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0;      rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0;      fwt[ld] = fwt[node] * lnum / (lnum + rnum);      fwt[rd] = fwt[node] * rnum / (lnum + rnum);    }  if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld);  if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd);}/* Function: VoronoiWeights() *  * Purpose:  Calculate weights using the scheme of Sibbald & *           Argos (JMB 216:813-818 1990). The scheme is *           slightly modified because the original algorithm *           actually doesn't work on gapped alignments. *           The sequences are assumed to be protein. *            * Args:     aseq  - array of (0..nseq-1) aligned sequences *           nseq  - number of sequences *           alen  - length of alignment *           wgt   - allocated [0..nseq-1] array of weights to be returned * * Return:   void *           wgt is filled in. */voidVoronoiWeights(char **aseq, int nseq, int alen, float *wgt){  float **dmx;                 /* distance (difference) matrix    */  float  *halfmin;             /* 1/2 minimum distance to other seqs */  char   **psym;                /* symbols seen in each column     */  int     *nsym;                /* # syms seen in each column      */  int      symseen[27];         /* flags for observed syms         */  char    *randseq;             /* randomly generated sequence     */  int      acol;		/* pos in aligned columns          */  int      idx;                 /* index in sequences              */  int      symidx;              /* 0..25 index for symbol          */  int      i;			/* generic counter                 */  float   min;			/* minimum distance                */  float   dist;		/* distance between random and real */  float   challenge, champion; /* for resolving ties              */  int     itscale;		/* how many iterations per seq     */  int     iteration;             int     best;			/* index of nearest real sequence  */  /* Sanity check first   */  if (nseq == 1) { wgt[0] = 1.0; return; }  itscale = 50;  /* Precalculate 1/2 minimum distance to other   * sequences for each sequence   */  if (! simple_diffmx(aseq, nseq, &dmx))     Die("simple_diffmx() failed");  halfmin = MallocOrDie (sizeof(float) * nseq);  for (idx = 0; idx < nseq; idx++)    {      for (min = 1.0, i = 0; i < nseq; i++)	{	  if (i == idx) continue;	  if (dmx[idx][i] < min) min = dmx[idx][i];	}      halfmin[idx] = min / 2.0;    }  Free2DArray((void **) dmx, nseq);  /* Set up the random sequence generating model.   */  psym = MallocOrDie (alen * sizeof(char *));  nsym = MallocOrDie (alen * sizeof(int));  for (acol = 0; acol < alen; acol++)    psym[acol] = MallocOrDie (27 * sizeof(char));/* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */  for (acol = 0; acol < alen; acol++)    {      memset(symseen, 0, sizeof(int) * 27);      for (idx = 0; idx < nseq; idx++)	if (! isgap(aseq[idx][acol]))	  {	    if (isupper((int) aseq[idx][acol])) 	      symidx = aseq[idx][acol] - 'A';	    else	      symidx = aseq[idx][acol] - 'a';	    if (symidx >= 0 && symidx < 26)	      symseen[symidx] = 1;	  }	else	  symseen[26] = 1;	/* a gap */      for (nsym[acol] = 0, i = 0; i < 26; i++)	if (symseen[i]) 	  {	    psym[acol][nsym[acol]] = 'A'+i;	    nsym[acol]++;	  }      if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; }    }/* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */  /* Note: the original Sibbald&Argos algorithm calls for   * bounding the sampled space using a template-like random   * sequence generator. However, this leads to one minor   * and one major problem. The minor problem is that   * exceptional amino acids in a column can have a   * significant effect by altering the amount of sampled   * sequence space; the larger the data set, the worse   * this problem becomes. The major problem is that    * there is no reasonable way to deal with gaps.   * Gapped sequences simply inhabit a different dimensionality   * and it's pretty painful to imagine calculating Voronoi   * volumes when the N in your N-space is varying.   * Note that all the examples shown by Sibbald and Argos   * are *ungapped* examples.   *    * The best way I've found to circumvent this problem is   * just not to bound the sampled space; count gaps as   * symbols and generate completely random sequences.   */#ifdef ALL_SEQUENCE_SPACE  for (acol = 0; acol < alen; acol++)    {      strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY ");      nsym[acol] = 21;    }#endif    /* Sibbald and Argos algorithm:   *   1) assign all seqs weight 0.   *   2) generate a "random" sequence   *   3) calculate distance to every other sequence   *      (if we get a distance < 1/2 minimum distance   *       to other real seqs, we can stop)   *   4) if unique closest sequence, increment its weight 1.   *      if multiple closest seq, choose one randomly       *   5) repeat 2-4 for lots of iterations   *   6) normalize all weights to sum to nseq.   */  randseq = MallocOrDie ((alen+1) * sizeof(char));  best = 42.;			/* solely to silence GCC uninit warnings. */  FSet(wgt, nseq, 0.0);  for (iteration = 0; iteration < itscale * nseq; iteration++)    {      for (acol = 0; acol < alen; acol++)	randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])];      randseq[acol] = '\0';      champion = sre_random();      for (min = 1.0, idx = 0; idx < nseq; idx++)	{	  dist = simple_distance(aseq[idx], randseq);	  if (dist < halfmin[idx]) 	    { 	      best = idx; 	      break;      	    } 	  if (dist < min)          	    { champion = sre_random(); best = idx; min = dist; } 	  else if (dist == min)    	    { 	      challenge = sre_random(); 	      if (challenge > champion)		{ champion = challenge; best = idx; min = dist; }	    }	}      wgt[best] += 1.0;    }  for (idx = 0; idx < nseq; idx++)    wgt[idx] = wgt[idx] / (float) itscale;  free(randseq);  free(nsym);  free(halfmin);  Free2DArray((void **) psym, alen);}/* Function: simple_distance() *  * Purpose:  For two identical-length null-terminated strings, return *           the fractional difference between them. (0..1) *           (Gaps don't count toward anything.) */static floatsimple_distance(char *s1, char *s2){  int diff  = 0;  int valid = 0;  for (; *s1 != '\0'; s1++, s2++)    {      if (isgap(*s1) || isgap(*s2)) continue;      if (*s1 != *s2) diff++;      valid++;    }  return (valid > 0 ? ((float) diff / (float) valid) : 0.0);}    /* Function: simple_diffmx() *  * Purpose:  Given a set of flushed, aligned sequences, construct *           an NxN fractional difference matrix using the *           simple_distance rule. *            * Args:     aseqs        - flushed, aligned sequences *           num          - number of aseqs *           ret_dmx      - RETURN: difference matrix (caller must free) *            * Return:   1 on success, 0 on failure. */static intsimple_diffmx(char    **aseqs,	      int       num,	      float ***ret_dmx){  float **dmx;                 /* RETURN: distance matrix           */  int      i,j;			/* counters over sequences           */  /* Allocate   */  if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL)    Die("malloc failed");  for (i = 0; i < num; i++)    if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL)      Die("malloc failed");  /* Calculate distances, symmetric matrix   */  for (i = 0; i < num; i++)    for (j = i; j < num; j++)      dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]);  /* Return   */  *ret_dmx = dmx;  return 1;}/* Function: BlosumWeights() * Date:     SRE, Fri Jul 16 17:33:59 1999 (St. Louis) *  * Purpose:  Assign weights to a set of aligned sequences *           using the BLOSUM rule: *             - do single linkage clustering at some pairwise identity *             - in each cluster, give each sequence 1/clustsize

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -