📄 camjul97.c
字号:
/* Source code from Cambridge visit July 1997 * * Position-specific matrices. */#include "config.h"#include "squidconf.h"#include <stdlib.h>#include <stdio.h>#include <string.h>#include <limits.h>#include <float.h>#include <math.h> #include "funcs.h"#include "structs.h"#include "squid.h"/* Function: MakeStarHMM() * * Purpose: Given an HMM with counts, create an HMM according * to the star rule. In star models we typically expect * that the counts have been collected using BLOSUM style * weights. * * Args: hmm - HMM structure containing counts data * mx - Star vectors, mx[q][x] * pq - vector prior P(q) * nq - number of vectors * pri - Dirichlet priors for other parameters * * Return: (void) * hmm is converted to probabilities. */voidMakeStarHMM(struct plan7_s *hmm, float **mx, float *pq, int nq, struct p7prior_s *pri){ int k; /* counter over model position */ int x; /* counter over symbol/transition */ float *pxa; /* P(x | a) : our parameter estimate */ float *pqa; /* P(q | a) for all q */ int q; /* counters over vectors q */ int ai; /* counter over symbols */ /* Match emissions: Star rule implementation. */ pxa = (float *) MallocOrDie(sizeof(float) * Alphabet_size); pqa = (float *) MallocOrDie(sizeof(float) * nq); for (k = 1; k <= hmm->M; k++) { /* calculate log P(q | a) unnormalized (i.e. + log P(a))*/ for (q = 0; q < nq; q++) { pqa[q] = log(pq[q]); for (ai = 0; ai < Alphabet_size; ai++) pqa[q] += hmm->mat[k][ai] * log(mx[q][ai]); } /* calculate log P(x | a) unnormalized (i.e + log P(a))*/ for (x = 0; x < Alphabet_size; x++) { pxa[x] = pqa[0] + log(mx[0][x]); for (q = 1; q < nq; q++) pxa[x] = LogSum(pxa[x], (pqa[q] + log(mx[q][x]))); } /* normalize now to get P(x|a) and store */ LogNorm(pxa, Alphabet_size); FCopy(hmm->mat[k], pxa, Alphabet_size); } /* Everything else is done according to P7PriorifyHMM() */ /* Model-dependent transitions are handled simply; Laplace. */ FSet(hmm->begin+2, hmm->M-1, 0.); /* wipe internal BM entries */ FSet(hmm->end+1, hmm->M-1, 0.); /* wipe internal ME exits */ hmm->tbd1 += 1.0; hmm->begin[1] += 1.0; /* Main model transitions and insert emissions */ for (k = 1; k < hmm->M; k++) { P7PriorifyTransitionVector(hmm->t[k], pri); P7PriorifyEmissionVector(hmm->ins[k], pri, pri->inum, pri->iq, pri->i, NULL); } Plan7Renormalize(hmm); free(pxa); free(pqa); return;} #ifdef SRE_REMOVED/* Function: MakeIslandHMM() * * Purpose: Given a sequence alignment of i = 1..nseq sequences, * with columns j = 1..alen; and a sequence index idx * to build the island from. Return a Plan7 island HMM in * probability form. * * Args: aseqs - alignment * ainfo - alignment info * idx - index of which sequence to build island from * null - random sequence model [0..Alphabet_size-1] * mx - probability matrices mx[q][root b][x] * bpri - priors on root distributions bpri[q][root b] * qpri - prior probability distribution over matrices * nmx - number of joint probability matrices * * Return: a new Plan7 HMM */struct plan7_s *MakeIslandHMM(char **aseqs, AINFO *ainfo, int idx, float null[MAXABET], float ***mx, float **bpri, float *qpri, int nmx){ struct plan7_s *hmm; /* RETURN: Plan7 HMM */ int j; /* column position index */ int k; /* model position index */ int q; /* counter for matrices */ int x; /* counter for symbols */ float *mat; /* a match emission probability vector */ float **probq; /* posterior P(q | column) */ int sym; /* index of a symbol in alphabet */ float max; int qmax; float **pxaq; /* P(x | a,q) vectors, [q][x] */ int b; /* counter over root symbols */ /* Allocate a model which is the length of the * raw sequence. */ hmm = AllocPlan7(DealignedLength(aseqs[idx])); if (ainfo->sqinfo[idx].flags & SQINFO_NAME) Plan7SetName(hmm, ainfo->sqinfo[idx].name); if (ainfo->sqinfo[idx].flags & SQINFO_DESC) Plan7SetDescription(hmm, ainfo->sqinfo[idx].desc); Plan7SetNullModel(hmm, null, 350./351.); /* p1 made up; shouldn't matter*/ mat = (float *) MallocOrDie( sizeof(float) * Alphabet_size); pxaq = FMX2Alloc(nmx, Alphabet_size); /* Calculate the posterior probability distribution * probq (= P(q | col)) over nmx different matrices * at each column j -- probq[0..alen-1][0..nmx-1]; * currently does not use the prior on q, but does a * winner-take-all rule. */ probq = FMX2Alloc(ainfo->alen, nmx); calc_probq(aseqs, ainfo, mx, bpri, qpri, nmx, probq); /* Debugging */ print_probq(stdout, probq, ainfo->alen, nmx); for (k = 1, j = 0; j < ainfo->alen; j++) { if (isgap(aseqs[idx][j])) continue; if (strchr(Alphabet, aseqs[idx][j]) != NULL) sym = SYMIDX(aseqs[idx][j]); else Die("MakeIslandHMM() can't handle ambiguous query symbols yet"); /* Calculate P(x | a, q) emission vectors for all matrices q */ for (q = 0; q < nmx; q++) { for (x = 0; x < Alphabet_size; x++) { pxaq[q][x] = 0.0; for (b = 0; b < 20; b++) pxaq[q][x] += mx[q][b][x] * mx[q][b][sym] * bpri[q][b]; } FNorm(pxaq[q], Alphabet_size); } /* Sum P(x | a, q) emission vectors over matrices q: * P(x | a, col) = \sum_q P(x | a, q, col) P(q | a, col) * = \sum_q P(x | a, q) P(q | col) */ for (x = 0; x < Alphabet_size; x++) { hmm->mat[k][x] = 0.; for (q = 0; q < nmx; q++) hmm->mat[k][x] += probq[j][q] * pxaq[q][x]; if (k < hmm->M) hmm->ins[k][x] = null[x]; } /* Reference annotation on columns: most probable matrix */ max = -FLT_MAX; for (q = 0; q < nmx; q++) if (probq[j][q] > max) { qmax = q; max = probq[j][q]; } hmm->rf[k] = 'a'+(char)qmax; /* q > 9, so convert to char a-z*/ /* Consensus annotation on columns: original sequence. */ hmm->cs[k] = aseqs[idx][j]; k++; } /* State transitions are set subjectively */ hmm->tbd1 = 0.02; for (k = 1; k < hmm->M; k++) { hmm->t[k][TMM] = 0.97; hmm->t[k][TMI] = 0.02; hmm->t[k][TMD] = 0.01; hmm->t[k][TIM] = 0.20; hmm->t[k][TII] = 0.80; hmm->t[k][TDM] = 0.90; hmm->t[k][TDD] = 0.10; } hmm->flags |= PLAN7_HASPROB | PLAN7_RF | PLAN7_CS; FMX2Free(pxaq); FMX2Free(probq); free(mat); return hmm;}#endif/* Function: ReadGJMMatrices() * * Purpose: Read GJM's file format for star-based mixture matrices. * Very first line is nq. * First line of a set is P(q), the prior of the matrix. * Second line contains P(b|q), the prior of the root symbols, * _in arbitrary order_ (the root distribution is not over AA's!) * Third line is blank. * Next 20 lines give a 20x20 matrix of conditional probabilities; * rows = root symbols b; cols = leaf symbols x; * mx[row][col] = P(x | b). * * Instead of storing as matrices, store as q x r vectors. * * Return: (void) * mx, pq, nq are returned via passed pointers. * Caller must free FMX2Free(mx) * Caller must free(pq). */voidReadGJMMatrices(FILE *fp, float ***ret_mx, float **ret_pq, int *ret_nq){ float **mx; /* conditional p's [0..nq-1][0..19] */ float *pq; /* priors on vectors, [0..nq-1] */ int nq, nr; /* number of matrices, rows */ char buf[2048]; float tmppq; /* prior for matrix */ int q,r; /* counter for matrices, rows */ int x; /* counter for symbols */ char *s; /* tmp pointer into buf */ /* allocations */ if (fgets(buf, 2048, fp) == NULL) Die("read failed"); nr = 20; nq = atoi(buf); mx = FMX2Alloc(nq*nr, 20); pq = (float *) MallocOrDie (nq*nr * sizeof(float)); /* parse matrices */ for (q = 0; q < nq; q++) { if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); tmppq = atof(buf); if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); s = strtok(buf, "\n\t "); for (r = 0; r < nr; r++) { pq[q*nr + r] = atof(s) * tmppq; s = strtok(NULL, "\n\t "); } if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); for (r = 0; r < 20; r++) { if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); s = strtok(buf, "\n\t "); for (x = 0; x < 20; x++) { mx[q*nr+r][x] = atof(s); s = strtok(NULL, "\n\t "); } } /* two blank lines */ if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); } *ret_mx = mx; *ret_pq = pq; *ret_nq = nq*nr; return;}#ifdef SRE_REMOVED/* Function: OldReadGJMMatrices() * * Purpose: Read GJM's file format for joint probability matrix sets. * * Return: (void) * mx, qprior, nmx are returned via passed pointers. * Caller must free mx: each matrix by FMX2Free(), then free(mx). * Caller must also free(qprior). */voidOldReadGJMMatrices(FILE *fp, float ****ret_mx, float **ret_qprior, int *ret_nmx){ float ***mx; /* joint prob matrix [0..nmx-1][0..19][0..19] */ float *qprior; /* priors on matrices, [0..nmx-1] */ int nmx; /* number of matrices */ char buf[2048]; int q; /* counter for matrices */ int idx; /* index for this matrix seen in file */ int r,c; /* counter for row, column */ char *s; /* tmp pointer into buf */ /* pass one: count matrices */ nmx = 0; while (fgets(buf, 2048, fp) != NULL) if (Strparse("use [0-9]+ = .+", buf, 0) == 0) nmx++; rewind(fp); /* allocations */ qprior = (float *) MallocOrDie (20 * sizeof(float)); mx = (float ***) MallocOrDie (nmx * sizeof(float **)); for (q = 0; q < nmx; q++) mx[q] = FMX2Alloc(20, 20); /* pass two: parse matrices */ q = 0; while (fgets(buf, 2048, fp) != NULL) { if (Strparse("use ([0-9]+) = (.+)", buf, 2) != 0) continue; idx = atoi(sqd_parse[1]); qprior[q] = atof(sqd_parse[2]); /* skip two lines in his new format */ if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed"); if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed"); for (r = 0; r < 20; r++) { if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed"); s = strtok(buf, "\n\t "); for (c = 0; c < 20; c++) { mx[q][r][c] = atof(s); s = strtok(NULL, "\n\t "); } } q++; } *ret_mx = mx; *ret_qprior = qprior;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -