📄 camjul97.c
字号:
*ret_nmx = nmx; return;}/* Function: OldPrintGJMMatrix() * * Purpose: (debugging, basically): print out Graeme's * joint probability matrices in log odds integer form. * */voidOldPrintGJMMatrix(FILE *fp, float **jmx, float *rnd, int N) { int r, c; fprintf(fp, " "); for (c = 0; c < N; c++) fprintf(fp, " %c ", Alphabet[c]); fprintf(fp, "\n"); for (r = 0; r < N; r++) { fprintf(fp, "%c ", Alphabet[r]); for (c = 0; c < N; c++) fprintf(fp, "%3d ", (int) (10. * sreLOG2(jmx[r][c] / (rnd[r] * rnd[c])))); fprintf(fp, "\n"); }}#endif /* SRE_REMOVED*//* Function: Joint2SubstitutionMatrix() * * Purpose: Convert a joint probability matrix to a substitution * matrix. * * Convention here for substitution matrices is * smx[r][c] = r->c = P(c|r). * * We obtain the substitution matrix from the following logic: * P(rc) = P(c|r) P(r); * P(r) = \sum_c P(rc); * thus P(c|r) = P(rc) / \sum_c P(rc) * * Args: jmx - NxN P(rc) joint probability matrix * smx - NxN P(c|r) substitution matrix, alloced in caller * N - size of matrices; typically Alphabet_size * * Return: (void) * smx is filled in. */voidJoint2SubstitutionMatrix(float **jmx, float **smx, int N){ float pr; /* P(r) = \sum_c P(rc) */ int r,c; /* counters for rows, columns */ for (r = 0; r < N; r++) { for (pr = 0., c = 0; c < N; c++) pr += jmx[r][c]; for (c = 0; c < N; c++) smx[r][c] = jmx[r][c] / pr; }}#ifdef SRE_REMOVED/* Function: BlosumWeights() * * Purpose: Assign weights to a set of aligned sequences * using the BLOSUM rule: * - do single linkage clustering at some pairwise identity * - in each cluster, give each sequence 1/clustsize * total weight. * * Args: aseqs - alignment * N - number of seqs in alignment * maxid - fractional identity (e.g. 0.62 for BLOSUM62) * clust - [0..nseq-1] vector of cluster assignments, filled here (or NULL) * ret_nc - total number of clusters found (or pass NULL) */ voidBlosumWeights(char **aseqs, AINFO *ainfo, float maxid, int *clust,int *ret_nc){ float **dmx; /* difference matrix */ struct phylo_s *tree; /* UPGMA tree */ float mindiff; /* minimum distance between clusters */ int c; /* counter for clusters */ struct intstack_s *stack; int node; int i; mindiff = 1.0 - maxid; /* first we do a difference matrix */ MakeDiffMx(aseqs, ainfo->nseq, &dmx); /* then we build a tree */ Cluster(dmx, ainfo->nseq, CLUSTER_MIN, &tree); /* Find clusters below mindiff. * The rule is: * -traverse the tree * -if the parent is > mindiff and current < mindiff, then * make current node a cluster. */ for (i = 0; i < ainfo->nseq; i++) { ainfo->sqinfo[i].weight = 1.0; ainfo->sqinfo[i].flags |= SQINFO_WGT; } stack = InitIntStack(); PushIntStack(stack, 0); /* push root on stack to start */ c = 0; while (PopIntStack(stack, &node)) { if ((node == 0 || tree[tree[node].parent-ainfo->nseq].diff > mindiff) && tree[node].diff < mindiff) { /* we're at a cluster */ for (i = 0; i < ainfo->nseq; i++) if (tree[node].is_in[i]) { ainfo->sqinfo[i].weight = 1.0 / (float) tree[node].incnum; if (clust != NULL) clust[i] = c; } c++; } else /* we're not a cluster, keep traversing */ { if (tree[node].right >= ainfo->nseq) PushIntStack(stack, tree[node].right - ainfo->nseq); else { c++; if (clust != NULL) clust[tree[node].right] = c; /* single seq, wgt 1.0 */ } if (tree[node].left >= ainfo->nseq) PushIntStack(stack, tree[node].left - ainfo->nseq); else { c++; if (clust != NULL) clust[tree[node].left] = c; } } } FreeIntStack(stack); FreePhylo(tree, ainfo->nseq); FMX2Free(dmx); if (ret_nc != NULL) *ret_nc = c; return;}#endif#ifdef SRE_REMOVED/* Function: calc_probq() * * Purpose: Calculate the posterior probability distribution * P(q | a_j) for every column j in the alignment * and every matrix choice q. * * Probabilistic, based on a star topology. * Uses a BLOSUM-like rule to cluster the sequences in * the alignment into groups with some seq identity (62%). * Finds the consensus (majority rule) residue in * each cluster as the representative. * Then P(q | col) comes by Bayes: * = (P(col | q) P(q) / Z * where the likelihood * P(col | q) = \sum_b [\prod_i P(a_i | q,b)] P(b | q) * log P(col | q) = \logsum_b P(b|q) + \sum_i \log(P(a_i | q,b)) * * Args: aseqs - alignment * ainfo - optional info for alignment * mx - conditional probability matrices [0..nmx-1][root b][x] * bprior- root priors [0..nmx-1][root b] * qprior- prior prob distribution over matrices * nmx - number of matrices * probq - RETURN: posterior probabilities, [0..alen-1][0..nmx-1] * alloc'ed in called, filled in here. * * Return: (void) * probq is filled in. */static void calc_probq(char **aseqs, AINFO *ainfo, float ***mx, float **bprior, float *qprior, int nmx, float **probq){ int q; /* counter over matrices */ int a1; /* counter over sequences */ int j; /* counter over columns */ int *clust; /* assignment of seqs to clusters 0..nseq-1 */ int nclust; /* number of clusters */ float *wgt; /* weights on seqs, 0..nseq-1 */ int *sym; /* symbol indices in a column */ float obs[MAXABET]; /* number of symbols observed in a column */ int i, x; float maxc; float ngap; float bterm[20]; /* intermediate in calculation, over root b's */ int b; /* counter over root symbols */ /* Use the BLOSUM rule to calculate weights and clusters * for sequences in the alignment */ wgt = (float *) MallocOrDie (sizeof(float) * ainfo->nseq); clust = (int *) MallocOrDie (sizeof(int) * ainfo->nseq); BlosumWeights(aseqs, ainfo, 0.62, clust, wgt, &nclust); /* Use the BLOSUM rule to calculate a "likelihood" function * P(column | q) for each column. */ sym = (int *) MallocOrDie (sizeof(int) * nclust); for (j = 0; j < ainfo->alen; j++) { /* Find majority rule symbols in this col */ for (i = 0; i < nclust; i++) { FSet(obs, Alphabet_size, 0.); ngap = 0.; for (a1 = 0; a1 < ainfo->nseq; a1++) if (clust[a1] == i) if (isgap(aseqs[a1][j])) ngap += 0.; else P7CountSymbol(obs, SymbolIndex(aseqs[a1][j]), 1.0); maxc = -1.; for (x = 0; x < Alphabet_size; x++) if (obs[x] > maxc) { maxc = obs[x]; sym[i] = x; } /* either if no symbols observed, or more gaps than syms: */ if (ngap >= maxc) sym[i] = -1; } /* Calculate log likelihood + log prior */ for (q = 0; q < nmx; q++) { for (b = 0; b < 20; b++) { bterm[b] = bprior[q][b]; for (i = 0; i < nclust; i++) if (sym[i] >= 0) bterm[b] += log(mx[q][b][sym[i]]); } probq[j][q] = log(qprior[q]) + FLogSum(bterm, 20); } LogNorm(probq[j], nmx); /* normalize -> gives posterior. */ } free(sym); free(wgt); free(clust);}/* Function: old_calc_probq() OBSOLETE VERSION * * Purpose: Calculate the posterior probability distribution * P(q | a_j) for every column j in the alignment * and every matrix choice q. * * Non-probabilistic. Uses a BLOSUM-like rule to * find the single best matrix for a column, then * assigns it a posterior of 1.0. * * This was version 1: a competitive learning rule, * posterior either 1.0 or 0.0. * * Args: aseqs - alignment * ainfo - optional info for alignment * jmx - *joint* probability matrices [0..nmx-1][0..19][0..19] * qprior- prior prob distribution over matrices [UNUSED] * nmx - number of matrices * probq - RETURN: posterior probabilities, [0..alen-1][0..nmx-1] * alloc'ed in called, filled in here. * * Return: (void) * probq is filled in. */static void old_calc_probq(char **aseqs, AINFO *ainfo, float ***jmx, float *qprior, int nmx, float **probq){ int q; /* counter over matrices */ int a1, a2; /* counters over sequences */ int j; /* counter over columns */ float x; /* BLOSUM-style objective function */ float maxx; /* maximum x so far */ int maxq; /* maximum q so far */ int *clust; /* assignment of seqs to clusters 0..nseq-1 */ int nclust; /* number of clusters */ float *wgt; /* weights on seqs, 0..nseq-1 */ int *sym; /* symbol indices in a column */ /* Use the BLOSUM rule to calculate weights and clusters * for sequences in the alignment */ wgt = (float *) MallocOrDie (sizeof(float) * ainfo->nseq); clust = (int *) MallocOrDie (sizeof(int) * ainfo->nseq); BlosumWeights(aseqs, ainfo, 0.62, clust, wgt, &nclust); /* Use the BLOSUM rule to calculate a "likelihood" function * P(column | q) for each column. */ sym = (int *) MallocOrDie (sizeof(int) * ainfo->nseq); for (j = 0; j < ainfo->alen; j++) { for (a1 = 0; a1 < ainfo->nseq; a1++) if (!isgap(aseqs[a1][j]) && strchr(Alphabet, aseqs[a1][j]) != NULL) { sym[a1] = SYMIDX(aseqs[a1][j]); if (sym[a1] >= Alphabet_size) sym[a1] = -1; /* no degenerates */ } else sym[a1] = -1; maxx = -FLT_MAX; for (q = 0; q < nmx; q++) { x = 0.; for (a1 = 0; a1 < ainfo->nseq; a1++) for (a2 = 0; a2 < ainfo->nseq; a2++) if (sym[a1] >= 0 && sym[a2] >= 0 && clust[a1] != clust[a2]) x += wgt[a1] * wgt[a2] * log(jmx[q][sym[a1]][sym[a2]]);#ifdef SRE_REMOVED printf("%% col %3d mx %c x = %f\n", j+1, 'a'+(char)q, x); #endif if (x > maxx) { maxx = x; maxq = q; } } FSet(probq[j], nmx, 0.0); probq[j][maxq] = 1.0; /* winner-take-all rule */ } free(sym); free(wgt); free(clust);}/* Function: print_probq() * * Purpose: Debugging output. * probq is the posterior probability P(q | column) of * a matrix q given an observed alignment column. * Indexed probq[0..alen-1][0..nmx-1]. */static voidprint_probq(FILE *fp, float **probq, int alen, int nmx){ int c; /* counter for columns */ int q; /* counter for matrices */ fputs("### probq debugging output\n", fp); fputs(" ", fp); for (q = 0; q < nmx; q++) fprintf(fp, " %c ", 'a'+(char)q); fputs("\n", fp); for (c = 0; c < alen; c++) { fprintf(fp, "%4d ", c); for (q = 0; q < nmx; q++) fprintf(fp, "%5.3f ", probq[c][q]); fputs("\n", fp); }}#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -