📄 weight.c
字号:
* total weight. * * The clusters have no pairwise link >= maxid. * * O(N) in memory. Probably ~O(NlogN) in time; O(N^2) * in worst case, which is no links between sequences * (e.g., values of maxid near 1.0). * * Args: aseqs - alignment * nseq - number of seqs in alignment * alen - # of columns in alignment * maxid - fractional identity (e.g. 0.62 for BLOSUM62) * wgt - [0..nseq-1] array of weights to be returned */ voidBlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt){ int *c, nc; int *nmem; /* number of seqs in each cluster */ int i; /* loop counter */ SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc); FSet(wgt, nseq, 1.0); nmem = MallocOrDie(sizeof(int) * nc); for (i = 0; i < nc; i++) nmem[i] = 0; for (i = 0; i < nseq; i++) nmem[c[i]]++; for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]]; free(nmem); free(c); return;}/* Function: PositionBasedWeights() * Date: SRE, Fri Jul 16 17:47:22 1999 [St. Louis] * * Purpose: Implementation of Henikoff and Henikoff position-based * weights (JMB 243:574-578, 1994) [Henikoff94b]. * * A significant advantage of this approach that Steve and Jorja * don't point out is that it is O(N) in memory, unlike * many other approaches like GSC weights or Voronoi. * * A potential disadvantage that they don't point out * is that in the theoretical limit of infinite sequences * in the alignment, weights go flat: eventually every * column has at least one representative of each of 20 aa (or 4 nt) * in it. * * They also don't give a rule for how to handle gaps. * The rule used here seems the obvious and sensible one * (ignore them). This means that longer sequences * initially get more weight; hence a "double * normalization" in which the weights are first divided * by sequence length (to compensate for that effect), * then normalized to sum to nseq. * * Limitations: * Implemented in a way that's alphabet-independent: * it uses the 26 upper case letters as "residues". * Any alphabetic character in aseq is interpreted as * a unique "residue" (case insensitively; lower case * mapped to upper case). All other characters are * interpreted as gaps. * * This way, we don't have to pass around any alphabet * type info (DNA vs. RNA vs. protein) and don't have * to deal with remapping IUPAC degenerate codes * probabilistically. However, on the down side, * a sequence with a lot of degenerate IUPAC characters * will get an artifactually high PB weight. * * Args: aseq - sequence alignment to weight * nseq - number of sequences in alignment * alen - length of alignment * wgt - RETURN: weights filled in (pre-allocated 0..nseq-1) * * Returns: (void) * wgt is allocated (0..nseq-1) by caller, and filled in here. */voidPositionBasedWeights(char **aseq, int nseq, int alen, float *wgt){ int rescount[26]; /* count of A-Z residues in a column */ int nres; /* number of different residues in col */ int idx, pos; /* indices into aseq */ int x; float norm; FSet(wgt, nseq, 0.0); for (pos = 0; pos < alen; pos++) { for (x = 0; x < 26; x++) rescount[x] = 0; for (idx = 0; idx < nseq; idx++) if (isalpha(aseq[idx][pos])) rescount[toupper(aseq[idx][pos]) - 'A'] ++; nres = 0; for (x = 0; x < 26; x++) if (rescount[x] > 0) nres++; for (idx = 0; idx < nseq; idx++) if (isalpha(aseq[idx][pos])) wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']); } for (idx = 0; idx < nseq; idx++) wgt[idx] /= (float) DealignedLength(aseq[idx]); norm = (float) nseq / FSum(wgt, nseq); FScale(wgt, nseq, norm); return;}/* Function: FilterAlignment() * Date: SRE, Wed Jun 30 09:19:30 1999 [St. Louis] * * Purpose: Constructs a new alignment by removing near-identical * sequences from a given alignment (where identity is * calculated *based on the alignment*). * Does not affect the given alignment. * Keeps earlier sequence, discards later one. * * Usually called as an ad hoc sequence "weighting" mechanism. * * Limitations: * Unparsed Stockholm markup is not propagated into the * new alignment. * * Args: msa -- original alignment * cutoff -- fraction identity cutoff. 0.8 removes sequences > 80% id. * ret_new -- RETURN: new MSA, usually w/ fewer sequences * * Return: (void) * ret_new must be free'd by caller: MSAFree(). */voidFilterAlignment(MSA *msa, float cutoff, MSA **ret_new){ int nnew; /* number of seqs in new alignment */ int *list; int *useme; float ident; int i,j; int remove; /* find which seqs to keep (list) */ /* diff matrix; allow ragged ends */ list = MallocOrDie (sizeof(int) * msa->nseq); useme = MallocOrDie (sizeof(int) * msa->nseq); for (i = 0; i < msa->nseq; i++) useme[i] = FALSE; nnew = 0; for (i = 0; i < msa->nseq; i++) { remove = FALSE; for (j = 0; j < nnew; j++) { ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]); if (ident > cutoff) { remove = TRUE; printf("removing %12s -- fractional identity %.2f to %s\n", msa->sqname[i], ident, msa->sqname[list[j]]); break; } } if (remove == FALSE) { list[nnew++] = i; useme[i] = TRUE; } } MSASmallerAlignment(msa, useme, ret_new); free(list); free(useme); return;}/* Function: SampleAlignment() * Date: SRE, Wed Jun 30 10:13:56 1999 [St. Louis] * * Purpose: Constructs a new, smaller alignment by sampling a given * number of sequences at random. Does not change the * alignment nor the order of the sequences. * * If you ask for a sample that is larger than nseqs, * it silently returns the original alignment. * * Not really a weighting method, but this is as good * a place as any to keep it, since it's similar in * construction to FilterAlignment(). * * Args: msa -- original alignment * sample -- number of sequences in new alignment (0 < sample <= nseq) * ret_new -- RETURN: new MSA * * Return: (void) * ret_new must be free'd by caller: MSAFree(). */voidSampleAlignment(MSA *msa, int sample, MSA **ret_new){ int *list; /* array for random selection w/o replace */ int *useme; /* array of flags 0..nseq-1: TRUE to use */ int i, idx; int len; /* Allocations */ list = (int *) MallocOrDie (sizeof(int) * msa->nseq); useme = (int *) MallocOrDie (sizeof(int) * msa->nseq); for (i = 0; i < msa->nseq; i++) { list[i] = i; useme[i] = FALSE; } /* Sanity check. */ if (sample >= msa->nseq) sample = msa->nseq; /* random selection w/o replacement */ for (len = msa->nseq, i = 0; i < sample; i++) { idx = CHOOSE(len); printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]); useme[list[idx]] = TRUE; list[idx] = list[--len]; } MSASmallerAlignment(msa, useme, ret_new); free(list); free(useme); return;}/* Function: SingleLinkCluster() * Date: SRE, Fri Jul 16 15:02:57 1999 [St. Louis] * * Purpose: Perform simple single link clustering of seqs in a * sequence alignment. A pairwise identity threshold * defines whether two sequences are linked or not. * * Important: runs in O(N) memory, unlike standard * graph decomposition algorithms that use O(N^2) * adjacency matrices or adjacency lists. Requires * O(N^2) time in worst case (which is when you have * no links at all), O(NlogN) in "average" * case, and O(N) in best case (when there is just * one cluster in a completely connected graph. * * (Developed because hmmbuild could no longer deal * with GP120, a 16,013 sequence alignment.) * * Limitations: * CASE-SENSITIVE. Assumes aseq have been put into * either all lower or all upper case; or at least, * within a column, there's no mixed case. * * Algorithm: * I don't know if this algorithm is published. I * haven't seen it in graph theory books, but that might * be because it's so obvious that nobody's bothered. * * In brief, we're going to do a breadth-first search * of the graph, and we're going to calculate links * on the fly rather than precalculating them into * some sort of standard adjacency structure. * * While working, we keep two stacks of maximum length N: * a : list of vertices that are still unconnected. * b : list of vertices that we've connected to * in our current breadth level, but we haven't * yet tested for other connections to a. * The current length (number of elements in) a and b are * kept in na, nb. * * We store our results in an array of length N: * c : assigns each vertex to a component. for example * c[4] = 1 means that vertex 4 is in component 1. * nc is the number of components. Components * are numbered from 0 to nc-1. We return c and nc * to our caller. * * The algorithm is: * * Initialisation: * a <-- all the vertices * na <-- N * b <-- empty set * nb <-- 0 * nc <-- 0 * * Then: * while (a is not empty) * pop a vertex off a, push onto b * while (b is not empty) * pop vertex v off b * assign c[v] = nc * for each vertex w in a: * compare v,w. If w is linked to v, remove w * from a, push onto b. * nc++ * q.e.d. :) * * Args: aseq - aligned sequences * nseq - number of sequences in aseq * alen - alignment length * maxid - fractional identity threshold 0..1. if id >= maxid, seqs linked * ret_c - RETURN: 0..nseq-1 assignments of seqs to components (clusters) * ret_nc - RETURN: number of components * * Returns: void. * ret_c is allocated here. Caller free's with free(*ret_c) */voidSingleLinkCluster(char **aseq, int nseq, int alen, float maxid, int **ret_c, int *ret_nc){ int *a, na; /* stack of available vertices */ int *b, nb; /* stack of working vertices */ int *c; /* array of results */ int nc; /* total number of components */ int v,w; /* index of a working vertices */ int i; /* loop counter */ /* allocations and initializations */ a = MallocOrDie (sizeof(int) * nseq); b = MallocOrDie (sizeof(int) * nseq); c = MallocOrDie (sizeof(int) * nseq); for (i = 0; i < nseq; i++) a[i] = i; na = nseq; nb = 0; nc = 0; /* Main algorithm */ while (na > 0) { v = a[na-1]; na--; /* pop a vertex off a, */ b[nb] = v; nb++; /* and push onto b */ while (nb > 0) { v = b[nb-1]; nb--; /* pop vertex off b */ c[v] = nc; /* assign it to component nc */ for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/ if (simple_distance(aseq[v], aseq[a[i]]) < 1. - maxid) /* linked? */ { w = a[i]; a[i] = a[na-1]; na--; /* delete w from a (note swap) */ b[nb] = w; nb++; /* push w onto b */ } } nc++; } /* Cleanup and return */ free(a); free(b); *ret_c = c; *ret_nc = nc; return;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -