📄 weight.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
 *               total weight. * *           The clusters have no pairwise link >= maxid.  *            *           O(N) in memory. Probably ~O(NlogN) in time; O(N^2) *           in worst case, which is no links between sequences *           (e.g., values of maxid near 1.0). *  * Args:     aseqs - alignment *           nseq  - number of seqs in alignment *           alen  - # of columns in alignment *           maxid - fractional identity (e.g. 0.62 for BLOSUM62) *           wgt   - [0..nseq-1] array of weights to be returned */               voidBlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt){  int  *c, nc;  int  *nmem;        /* number of seqs in each cluster */  int   i;           /* loop counter */  SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc);  FSet(wgt, nseq, 1.0);  nmem = MallocOrDie(sizeof(int) * nc);  for (i = 0; i < nc;   i++) nmem[i] = 0;  for (i = 0; i < nseq; i++) nmem[c[i]]++;  for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]];  free(nmem);  free(c);  return;}/* Function: PositionBasedWeights() * Date:     SRE, Fri Jul 16 17:47:22 1999 [St. Louis] * * Purpose:  Implementation of Henikoff and Henikoff position-based *           weights (JMB 243:574-578, 1994) [Henikoff94b]. *            *           A significant advantage of this approach that Steve and Jorja *           don't point out is that it is O(N) in memory, unlike *           many other approaches like GSC weights or Voronoi. *            *           A potential disadvantage that they don't point out *           is that in the theoretical limit of infinite sequences *           in the alignment, weights go flat: eventually every *           column has at least one representative of each of 20 aa (or 4 nt) *           in it. *            *           They also don't give a rule for how to handle gaps. *           The rule used here seems the obvious and sensible one *           (ignore them). This means that longer sequences *           initially get more weight; hence a "double *           normalization" in which the weights are first divided *           by sequence length (to compensate for that effect), *           then normalized to sum to nseq. *            * Limitations: *           Implemented in a way that's alphabet-independent: *           it uses the 26 upper case letters as "residues". *           Any alphabetic character in aseq is interpreted as *           a unique "residue" (case insensitively; lower case *           mapped to upper case). All other characters are *           interpreted as gaps. *            *           This way, we don't have to pass around any alphabet *           type info (DNA vs. RNA vs. protein) and don't have *           to deal with remapping IUPAC degenerate codes *           probabilistically. However, on the down side, *           a sequence with a lot of degenerate IUPAC characters *           will get an artifactually high PB weight. *            * Args:     aseq   - sequence alignment to weight *           nseq   - number of sequences in alignment *           alen   - length of alignment *           wgt    - RETURN: weights filled in (pre-allocated 0..nseq-1) * * Returns:  (void) *           wgt is allocated (0..nseq-1) by caller, and filled in here. */voidPositionBasedWeights(char **aseq, int nseq, int alen, float *wgt){  int  rescount[26];		/* count of A-Z residues in a column   */  int  nres;			/* number of different residues in col */  int  idx, pos;                /* indices into aseq                   */  int  x;  float norm;  FSet(wgt, nseq, 0.0);  for (pos = 0; pos < alen; pos++)    {      for (x = 0; x < 26; x++) rescount[x] = 0;      for (idx = 0; idx < nseq; idx++)	if (isalpha(aseq[idx][pos]))	  rescount[toupper(aseq[idx][pos]) - 'A'] ++;      nres = 0;      for (x = 0; x < 26; x++) 	if (rescount[x] > 0) nres++;      for (idx = 0; idx < nseq; idx++)	if (isalpha(aseq[idx][pos]))	  wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']);    }  for (idx = 0; idx < nseq; idx++)    wgt[idx] /= (float) DealignedLength(aseq[idx]);  norm = (float) nseq / FSum(wgt, nseq);  FScale(wgt, nseq, norm);  return;}/* Function: FilterAlignment() * Date:     SRE, Wed Jun 30 09:19:30 1999 [St. Louis] *  * Purpose:  Constructs a new alignment by removing near-identical  *           sequences from a given alignment (where identity is  *           calculated *based on the alignment*). *           Does not affect the given alignment. *           Keeps earlier sequence, discards later one.  *            *           Usually called as an ad hoc sequence "weighting" mechanism. *            * Limitations: *           Unparsed Stockholm markup is not propagated into the *           new alignment. *            * Args:     msa      -- original alignment *           cutoff   -- fraction identity cutoff. 0.8 removes sequences > 80% id. *           ret_new  -- RETURN: new MSA, usually w/ fewer sequences *                          * Return:   (void) *           ret_new must be free'd by caller: MSAFree(). */voidFilterAlignment(MSA *msa, float cutoff, MSA **ret_new){  int     nnew;			/* number of seqs in new alignment */  int    *list;  int    *useme;  float   ident;  int     i,j;  int     remove;				/* find which seqs to keep (list) */				/* diff matrix; allow ragged ends */  list  = MallocOrDie (sizeof(int) * msa->nseq);  useme = MallocOrDie (sizeof(int) * msa->nseq);  for (i = 0; i < msa->nseq; i++) useme[i] = FALSE;  nnew = 0;  for (i = 0; i < msa->nseq; i++)    {      remove = FALSE;      for (j = 0; j < nnew; j++)	{	  ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]);	  if (ident > cutoff)	    { 	      remove = TRUE; 	      printf("removing %12s -- fractional identity %.2f to %s\n", 		     msa->sqname[i], ident,		     msa->sqname[list[j]]);	      break; 	    }	}      if (remove == FALSE) {	list[nnew++] = i;	useme[i]     = TRUE;      }    }  MSASmallerAlignment(msa, useme, ret_new);  free(list);  free(useme);  return;}/* Function: SampleAlignment() * Date:     SRE, Wed Jun 30 10:13:56 1999 [St. Louis] *  * Purpose:  Constructs a new, smaller alignment by sampling a given *           number of sequences at random. Does not change the *           alignment nor the order of the sequences. *            *           If you ask for a sample that is larger than nseqs, *           it silently returns the original alignment. *            *           Not really a weighting method, but this is as good *           a place as any to keep it, since it's similar in *           construction to FilterAlignment(). *            * Args:     msa      -- original alignment *           sample   -- number of sequences in new alignment (0 < sample <= nseq) *           ret_new  -- RETURN: new MSA  *                          * Return:   (void) *           ret_new must be free'd by caller: MSAFree(). */voidSampleAlignment(MSA *msa, int sample, MSA **ret_new){  int    *list;                 /* array for random selection w/o replace */  int    *useme;                /* array of flags 0..nseq-1: TRUE to use */  int     i, idx;  int     len;  /* Allocations   */  list  = (int *) MallocOrDie (sizeof(int) * msa->nseq);  useme = (int *) MallocOrDie (sizeof(int) * msa->nseq);  for (i = 0; i < msa->nseq; i++)    {      list[i]  = i;      useme[i] = FALSE;    }  /* Sanity check.   */  if (sample >= msa->nseq) sample = msa->nseq;				/* random selection w/o replacement */  for (len = msa->nseq, i = 0; i < sample; i++)    {      idx = CHOOSE(len);      printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]);      useme[list[idx]] = TRUE;      list[idx] = list[--len];    }  MSASmallerAlignment(msa, useme, ret_new);  free(list);  free(useme);  return;}/* Function: SingleLinkCluster() * Date:     SRE, Fri Jul 16 15:02:57 1999 [St. Louis] * * Purpose:  Perform simple single link clustering of seqs in a *           sequence alignment. A pairwise identity threshold *           defines whether two sequences are linked or not. *            *           Important: runs in O(N) memory, unlike standard *           graph decomposition algorithms that use O(N^2) *           adjacency matrices or adjacency lists. Requires *           O(N^2) time in worst case (which is when you have *           no links at all), O(NlogN) in "average" *           case, and O(N) in best case (when there is just *           one cluster in a completely connected graph. *            *           (Developed because hmmbuild could no longer deal *           with GP120, a 16,013 sequence alignment.) *            * Limitations:  *           CASE-SENSITIVE. Assumes aseq have been put into *           either all lower or all upper case; or at least, *           within a column, there's no mixed case. *            * Algorithm:  *           I don't know if this algorithm is published. I  *           haven't seen it in graph theory books, but that might *           be because it's so obvious that nobody's bothered. *            *           In brief, we're going to do a breadth-first search *           of the graph, and we're going to calculate links *           on the fly rather than precalculating them into *           some sort of standard adjacency structure. *            *           While working, we keep two stacks of maximum length N: *                a : list of vertices that are still unconnected. *                b : list of vertices that we've connected to  *                    in our current breadth level, but we haven't *                    yet tested for other connections to a. *           The current length (number of elements in) a and b are *           kept in na, nb. *                     *           We store our results in an array of length N: *                c : assigns each vertex to a component. for example *                    c[4] = 1 means that vertex 4 is in component 1. *                    nc is the number of components. Components *                    are numbered from 0 to nc-1. We return c and nc *                    to our caller. *                     *           The algorithm is: *            *           Initialisation:  *                a  <-- all the vertices *                na <-- N *                b  <-- empty set *                nb <-- 0 *                nc <-- 0 *                 *           Then: *                while (a is not empty) *                  pop a vertex off a, push onto b *                  while (b is not empty) *                    pop vertex v off b *                    assign c[v] = nc *                    for each vertex w in a: *                       compare v,w. If w is linked to v, remove w *                       from a, push onto b. *                  nc++      *           q.e.d. :)        * * Args:     aseq   - aligned sequences *           nseq   - number of sequences in aseq *           alen   - alignment length *           maxid  - fractional identity threshold 0..1. if id >= maxid, seqs linked *           ret_c  - RETURN: 0..nseq-1 assignments of seqs to components (clusters) *           ret_nc - RETURN: number of components * * Returns:  void. *           ret_c is allocated here. Caller free's with free(*ret_c) */voidSingleLinkCluster(char **aseq, int nseq, int alen, float maxid, 		  int **ret_c, int *ret_nc){  int *a, na;                   /* stack of available vertices */  int *b, nb;                   /* stack of working vertices   */  int *c;                       /* array of results            */  int  nc;			/* total number of components  */  int  v,w;			/* index of a working vertices */  int  i;			/* loop counter */  /* allocations and initializations   */  a = MallocOrDie (sizeof(int) * nseq);  b = MallocOrDie (sizeof(int) * nseq);  c = MallocOrDie (sizeof(int) * nseq);  for (i = 0; i < nseq; i++) a[i] = i;  na = nseq;  nb = 0;  nc = 0;  /* Main algorithm   */  while (na > 0)    {      v = a[na-1]; na--;	/* pop a vertex off a, */      b[nb] = v;   nb++;	/* and push onto b     */      while (nb > 0)	{	  v    = b[nb-1]; nb--;	/* pop vertex off b          */	  c[v] = nc;		/* assign it to component nc */	  for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/	    if (simple_distance(aseq[v], aseq[a[i]]) < 1. - maxid) /* linked? */	      {					w = a[i]; a[i] = a[na-1]; na--;	/* delete w from a (note swap) */		b[nb] = w; nb++;                /* push w onto b */	      }	}      nc++;    }  /* Cleanup and return   */  free(a);  free(b);  *ret_c  = c;  *ret_nc = nc;  return;}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -