📄 prior.c
字号:
* Only activated if X-PR* annotation has been used, in which * priors are overridden and a single Dirichlet component is * specified for each column (using structural annotation). * If X-PR* annotation is not used, which is usually the case, * the following code has no effect (observe how the real prior * distributions are copied into tq, mq, iq). */ if (hmm->tpri != NULL && hmm->tpri[k] >= 0) { if (hmm->tpri[k] >= pri->tnum) Die("X-PRT annotation out of range"); FSet(tq, pri->tnum, 0.0); tq[hmm->tpri[k]] = 1.0; } else FCopy(tq, pri->tq, pri->tnum); if (hmm->mpri != NULL && hmm->mpri[k] >= 0) { if (hmm->mpri[k] >= pri->mnum) Die("X-PRM annotation out of range"); FSet(mq, pri->mnum, 0.0); mq[hmm->mpri[k]] = 1.0; } else FCopy(mq, pri->mq, pri->mnum); if (hmm->ipri != NULL && hmm->ipri[k] >= 0) { if (hmm->ipri[k] >= pri->inum) Die("X-PRI annotation out of range"); FSet(iq, pri->inum, 0.0); iq[hmm->ipri[k]] = 1.0; } else FCopy(iq, pri->iq, pri->inum); /* This is the main line of the code: */ P7PriorifyTransitionVector(hmm->t[k], pri, tq); P7PriorifyEmissionVector(hmm->mat[k], pri, pri->mnum, mq, pri->m, NULL); P7PriorifyEmissionVector(hmm->ins[k], pri, pri->inum, iq, pri->i, NULL); } /* We repeat the above steps just for the final match state, M. */ if (hmm->mpri != NULL && hmm->mpri[hmm->M] >= 0) { if (hmm->mpri[hmm->M] >= pri->mnum) Die("X-PRM annotation out of range"); FSet(mq, pri->mnum, 0.0); mq[hmm->mpri[hmm->M]] = 1.0; } else FCopy(mq, pri->mq, pri->mnum); P7PriorifyEmissionVector(hmm->mat[hmm->M], pri, pri->mnum, mq, pri->m, NULL); /* Now we're done. Convert the counts-based HMM to probabilities. */ Plan7Renormalize(hmm);}/* Function: P7PriorifyEmissionVector() * * Purpose: Add prior pseudocounts to an observed * emission count vector and renormalize. * * Can return the posterior mixture probabilities * P(q | counts) if ret_mix[MAXDCHLET] is passed. * Else, pass NULL. * * Args: vec - the 4 or 20-long vector of counts to modify * pri - prior data structure * num - pri->mnum or pri->inum; # of mixtures * eq - pri->mq or pri->iq; prior mixture probabilities * e - pri->i or pri->m; Dirichlet components * ret_mix - filled with posterior mixture probabilities, or NULL * * Return: (void) * The counts in vec are changed and normalized to probabilities. */ voidP7PriorifyEmissionVector(float *vec, struct p7prior_s *pri, int num, float eq[MAXDCHLET], float e[MAXDCHLET][MAXABET], float *ret_mix){ int x; /* counter over vec */ int q; /* counter over mixtures */ float mix[MAXDCHLET]; /* posterior distribution over mixtures */ float totc; /* total counts */ float tota; /* total alpha terms */ float xi; /* X_i term, Sjolander eq. 41 */ /* Calculate mix[], which is the posterior probability * P(q | n) of mixture component q given the count vector n * * (side effect note: note that an insert vector in a PAM prior * is passed with num = 1, bypassing pam prior code; this means * that inserts cannot be mixture Dirichlets...) * [SRE, 12/24/00: the above comment is cryptic! what the hell does that * mean, inserts can't be mixtures? doesn't seem to be true. it * may mean that in a PAM prior, you can't have a mixture for inserts, * but I don't even understand that. The insert vectors aren't passed * with num=1!!] */ mix[0] = 1.0; if (pri->strategy == PRI_DCHLET && num > 1) { for (q = 0; q < num; q++) { mix[q] = eq[q] > 0.0 ? log(eq[q]) : -999.; mix[q] += Logp_cvec(vec, Alphabet_size, e[q]); } LogNorm(mix, num); /* now mix[q] is P(component_q | n) */ } else if (pri->strategy == PRI_PAM && num > 1) { /* pam prior uses aa frequencies as `P(q|n)' */ for (q = 0; q < Alphabet_size; q++) mix[q] = vec[q]; FNorm(mix, Alphabet_size); } /* Convert the counts to probabilities, following Sjolander (1996) */ totc = FSum(vec, Alphabet_size); for (x = 0; x < Alphabet_size; x++) { xi = 0.0; for (q = 0; q < num; q++) { tota = FSum(e[q], Alphabet_size); xi += mix[q] * (vec[x] + e[q][x]) / (totc + tota); } vec[x] = xi; } FNorm(vec, Alphabet_size); if (ret_mix != NULL) for (q = 0; q < num; q++) ret_mix[q] = mix[q];}/* Function: P7PriorifyTransitionVector() * * Purpose: Add prior pseudocounts to transition vector, * which contains three different probability vectors * for m, d, and i. * * Args: t - state transitions, counts: 3 for M, 2 for I, 2 for D. * prior - Dirichlet prior information * tq - prior distribution over Dirichlet components. * (overrides prior->iq[]; used for alternative * methods of conditioning prior on structural data) * * Return: (void) * t is changed, and renormalized -- comes back as * probability vectors. */ voidP7PriorifyTransitionVector(float *t, struct p7prior_s *prior, float tq[MAXDCHLET]){ int ts; int q; float mix[MAXDCHLET]; float totm, totd, toti; /* total counts in three transition vecs */ float xi; /* Sjolander's X_i term */ mix[0] = 1.0; /* default is simple one component */ if ((prior->strategy == PRI_DCHLET || prior->strategy == PRI_PAM) && prior->mnum > 1) { for (q = 0; q < prior->tnum; q++) { mix[q] = tq[q] > 0.0 ? log(tq[q]) : -999.; mix[q] += Logp_cvec(t, 3, prior->t[q]); /* 3 match */ mix[q] += Logp_cvec(t+3, 2, prior->t[q]+3); /* 2 insert */ mix[q] += Logp_cvec(t+5, 2, prior->t[q]+5); /* 2 delete */ } LogNorm(mix, prior->tnum); /* mix[q] is now P(q | counts) */ } /* precalc some denominators */ totm = FSum(t,3); toti = t[TIM] + t[TII]; totd = t[TDM] + t[TDD]; for (ts = 0; ts < 7; ts++) { xi = 0.0; for (q = 0; q < prior->tnum; q++) { switch (ts) { case TMM: case TMI: case TMD: xi += mix[q] * (t[ts] + prior->t[q][ts]) / (totm + FSum(prior->t[q], 3)); break; case TIM: case TII: xi += mix[q] * (t[ts] + prior->t[q][ts]) / (toti + prior->t[q][TIM] + prior->t[q][TII]); break; case TDM: case TDD: xi += mix[q] * (t[ts] + prior->t[q][ts]) / (totd + prior->t[q][TDM] + prior->t[q][TDD]); break; } } t[ts] = xi; } FNorm(t, 3); /* match */ FNorm(t+3, 2); /* insert */ FNorm(t+5, 2); /* delete */}/* Function: default_amino_prior() * * Purpose: Set the default protein prior. */static struct p7prior_s *default_amino_prior(void){ struct p7prior_s *pri; int q, x; /* default match mixture coefficients */ static float defmq[9] = { 0.178091, 0.056591, 0.0960191, 0.0781233, 0.0834977, 0.0904123, 0.114468, 0.0682132, 0.234585 }; /* default match mixture Dirichlet components */ static float defm[9][20] = { { 0.270671, 0.039848, 0.017576, 0.016415, 0.014268, 0.131916, 0.012391, 0.022599, 0.020358, 0.030727, 0.015315, 0.048298, 0.053803, 0.020662, 0.023612, 0.216147, 0.147226, 0.065438, 0.003758, 0.009621 }, { 0.021465, 0.010300, 0.011741, 0.010883, 0.385651, 0.016416, 0.076196, 0.035329, 0.013921, 0.093517, 0.022034, 0.028593, 0.013086, 0.023011, 0.018866, 0.029156, 0.018153, 0.036100, 0.071770, 0.419641 }, { 0.561459, 0.045448, 0.438366, 0.764167, 0.087364, 0.259114, 0.214940, 0.145928, 0.762204, 0.247320, 0.118662, 0.441564, 0.174822, 0.530840, 0.465529, 0.583402, 0.445586, 0.227050, 0.029510, 0.121090 }, { 0.070143, 0.011140, 0.019479, 0.094657, 0.013162, 0.048038, 0.077000, 0.032939, 0.576639, 0.072293, 0.028240, 0.080372, 0.037661, 0.185037, 0.506783, 0.073732, 0.071587, 0.042532, 0.011254, 0.028723 }, { 0.041103, 0.014794, 0.005610, 0.010216, 0.153602, 0.007797, 0.007175, 0.299635, 0.010849, 0.999446, 0.210189, 0.006127, 0.013021, 0.019798, 0.014509, 0.012049, 0.035799, 0.180085, 0.012744, 0.026466 }, { 0.115607, 0.037381, 0.012414, 0.018179, 0.051778, 0.017255, 0.004911, 0.796882, 0.017074, 0.285858, 0.075811, 0.014548, 0.015092, 0.011382, 0.012696, 0.027535, 0.088333, 0.944340, 0.004373, 0.016741 }, { 0.093461, 0.004737, 0.387252, 0.347841, 0.010822, 0.105877, 0.049776, 0.014963, 0.094276, 0.027761, 0.010040, 0.187869, 0.050018, 0.110039, 0.038668, 0.119471, 0.065802, 0.025430, 0.003215, 0.018742 }, { 0.452171, 0.114613, 0.062460, 0.115702, 0.284246, 0.140204, 0.100358, 0.550230, 0.143995, 0.700649, 0.276580, 0.118569, 0.097470, 0.126673, 0.143634, 0.278983, 0.358482, 0.661750, 0.061533, 0.199373 }, { 0.005193, 0.004039, 0.006722, 0.006121, 0.003468, 0.016931, 0.003647, 0.002184, 0.005019, 0.005990, 0.001473, 0.004158, 0.009055, 0.003630, 0.006583, 0.003172, 0.003690, 0.002967, 0.002772, 0.002686 }, }; pri = P7AllocPrior(); pri->strategy = PRI_DCHLET; /* Transition priors are subjective, but borrowed from GJM's estimations * on Pfam */ pri->tnum = 1; pri->tq[0] = 1.0; pri->t[0][TMM] = 0.7939; pri->t[0][TMI] = 0.0278; pri->t[0][TMD] = 0.0135; pri->t[0][TIM] = 0.1551; pri->t[0][TII] = 0.1331; pri->t[0][TDM] = 0.9002; pri->t[0][TDD] = 0.5630; /* Match emission priors are a mixture Dirichlet, * from Kimmen Sjolander (Blocks9) */ pri->mnum = 9; for (q = 0; q < pri->mnum; q++) { pri->mq[q] = defmq[q]; for (x = 0; x < 20; x++) pri->m[q][x] = defm[q][x]; } /* These insert emission priors are subjective. Observed frequencies * were obtained from PFAM 1.0, 10 Nov 96; * see ~/projects/plan7/InsertStatistics. * Inserts are slightly biased towards polar residues and away from * hydrophobic residues. */ pri->inum = 1; pri->iq[0] = 1.; pri->i[0][0] = 681.; /* A */ pri->i[0][1] = 120.; /* C */ pri->i[0][2] = 623.; /* D */ pri->i[0][3] = 651.; /* E */ pri->i[0][4] = 313.; /* F */ pri->i[0][5] = 902.; /* G */ pri->i[0][6] = 241.; /* H */ pri->i[0][7] = 371.; /* I */ pri->i[0][8] = 687.; /* K */ pri->i[0][9] = 676.; /* L */ pri->i[0][10] = 143.; /* M */ pri->i[0][11] = 548.; /* N */ pri->i[0][12] = 647.; /* P */ pri->i[0][13] = 415.; /* Q */ pri->i[0][14] = 551.; /* R */ pri->i[0][15] = 926.; /* S */ pri->i[0][16] = 623.; /* T */ pri->i[0][17] = 505.; /* V */ pri->i[0][18] = 102.; /* W */ pri->i[0][19] = 269.; /* Y */ return pri;}/* Function: default_nucleic_prior() * * Purpose: Set the default DNA prior. (for now, almost a Laplace) */static struct p7prior_s *default_nucleic_prior(void){ struct p7prior_s *pri; pri = P7AllocPrior(); pri->strategy = PRI_DCHLET; /* The use of the Pfam-trained amino acid transition priors * here is TOTALLY bogus. But it works better than a straight * Laplace, esp. for Maxmodelmaker(). For example, a Laplace * prior builds M=1 models for a single sequence GAATTC (at * one time an open "bug"). */ pri->tnum = 1; pri->tq[0] = 1.; pri->t[0][TMM] = 0.7939; pri->t[0][TMI] = 0.0278; pri->t[0][TMD] = 0.0135; pri->t[0][TIM] = 0.1551; pri->t[0][TII] = 0.1331; pri->t[0][TDM] = 0.9002; pri->t[0][TDD] = 0.5630; pri->mnum = 1; pri->mq[0] = 1.; FSet(pri->m[0], Alphabet_size, 1.); pri->inum = 1; pri->iq[0] = 1.; FSet(pri->i[0], Alphabet_size, 1.); return pri;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -