📄 plan7.c
字号:
/* Function: Plan7SWConfig() * * Purpose: Set the alignment independent parameters of * a Plan7 model to hmmsw (Smith/Waterman) configuration. * * Notes: entry/exit is asymmetric because of the left/right * nature of the HMM/profile. Entry probability is distributed * simply by assigning p_x = pentry / (M-1) to M-1 * internal match states. However, the same approach doesn't * lead to a flat distribution over exit points. Exit p's * must be corrected for the probability of a previous exit * from the model. Requiring a flat distribution over exit * points leads to an easily solved piece of algebra, giving: * p_1 = pexit / (M-1) * p_x = p_1 / (1 - (x-1) p_1) * * Args: hmm - the Plan7 model w/ data-dep prob's valid * pentry - probability of an internal entry somewhere; * will be evenly distributed over M-1 match states * pexit - probability of an internal exit somewhere; * will be distributed over M-1 match states. * * Return: (void) * HMM probabilities are modified. */voidPlan7SWConfig(struct plan7_s *hmm, float pentry, float pexit){ float basep; /* p1 for exits: the base p */ int k; /* counter over states */ /* Configure special states. */ hmm->xt[XTN][MOVE] = 1-hmm->p1; /* allow N-terminal tail */ hmm->xt[XTN][LOOP] = hmm->p1; hmm->xt[XTE][MOVE] = 1.; /* disallow jump state */ hmm->xt[XTE][LOOP] = 0.; hmm->xt[XTC][MOVE] = 1-hmm->p1; /* allow C-terminal tail */ hmm->xt[XTC][LOOP] = hmm->p1; hmm->xt[XTJ][MOVE] = 1.; /* J is unused */ hmm->xt[XTJ][LOOP] = 0.; /* Configure entry. */ hmm->begin[1] = (1. - pentry) * (1. - hmm->tbd1); FSet(hmm->begin+2, hmm->M-1, (pentry * (1.- hmm->tbd1)) / (float)(hmm->M-1)); /* Configure exit. */ hmm->end[hmm->M] = 1.0; basep = pexit / (float) (hmm->M-1); for (k = 1; k < hmm->M; k++) hmm->end[k] = basep / (1. - basep * (float) (k-1)); Plan7RenormalizeExits(hmm); hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */}/* Function: Plan7FSConfig() * Date: SRE, Fri Jan 2 15:34:40 1998 [StL] * * Purpose: Set the alignment independent parameters of * a Plan7 model to hmmfs (multihit Smith/Waterman) configuration. * * See comments on Plan7SWConfig() for explanation of * how pentry and pexit are used. * * Args: hmm - the Plan7 model w/ data-dep prob's valid * pentry - probability of an internal entry somewhere; * will be evenly distributed over M-1 match states * pexit - probability of an internal exit somewhere; * will be distributed over M-1 match states. * * Return: (void) * HMM probabilities are modified. */voidPlan7FSConfig(struct plan7_s *hmm, float pentry, float pexit){ float basep; /* p1 for exits: the base p */ int k; /* counter over states */ /* Configure special states. */ hmm->xt[XTN][MOVE] = 1-hmm->p1; /* allow N-terminal tail */ hmm->xt[XTN][LOOP] = hmm->p1; hmm->xt[XTE][MOVE] = 0.5; /* allow loops / multihits */ hmm->xt[XTE][LOOP] = 0.5; hmm->xt[XTC][MOVE] = 1-hmm->p1; /* allow C-terminal tail */ hmm->xt[XTC][LOOP] = hmm->p1; hmm->xt[XTJ][MOVE] = 1.-hmm->p1; /* allow J junction between domains */ hmm->xt[XTJ][LOOP] = hmm->p1; /* Configure entry. */ hmm->begin[1] = (1. - pentry) * (1. - hmm->tbd1); FSet(hmm->begin+2, hmm->M-1, (pentry * (1.-hmm->tbd1)) / (float)(hmm->M-1)); /* Configure exit. */ hmm->end[hmm->M] = 1.0; basep = pexit / (float) (hmm->M-1); for (k = 1; k < hmm->M; k++) hmm->end[k] = basep / (1. - basep * (float) (k-1)); Plan7RenormalizeExits(hmm); hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */}/* Function: Plan7ESTConfig() * * Purpose: Configure a Plan7 model for EST Smith/Waterman * analysis. * * OUTDATED; DO NOT USE WITHOUT RECHECKING * * Args: hmm - hmm to configure. * aacode - 0..63 vector mapping genetic code to amino acids * estmodel - 20x64 translation matrix, w/ codon bias and substitution error * dna2 - probability of a -1 frameshift in a triplet * dna4 - probability of a +1 frameshift in a triplet */ voidPlan7ESTConfig(struct plan7_s *hmm, int *aacode, float **estmodel, float dna2, float dna4){ int k; int x; float p; float *tripnull; /* UNFINISHED!!! */ /* configure specials */ hmm->xt[XTN][MOVE] = 1./351.; hmm->xt[XTN][LOOP] = 350./351.; hmm->xt[XTE][MOVE] = 1.; hmm->xt[XTE][LOOP] = 0.; hmm->xt[XTC][MOVE] = 1./351.; hmm->xt[XTC][LOOP] = 350./351.; hmm->xt[XTJ][MOVE] = 1.; hmm->xt[XTJ][LOOP] = 0.; /* configure entry/exit */ hmm->begin[1] = 0.5; FSet(hmm->begin+2, hmm->M-1, 0.5 / ((float)hmm->M - 1.)); hmm->end[hmm->M] = 1.; FSet(hmm->end, hmm->M-1, 0.5 / ((float)hmm->M - 1.)); /* configure dna triplet/frameshift emissions */ for (k = 1; k <= hmm->M; k++) { /* translate aa to triplet probabilities */ for (x = 0; x < 64; x++) { p = hmm->mat[k][aacode[x]] * estmodel[aacode[x]][x] * (1.-dna2-dna4); hmm->dnam[x][k] = Prob2Score(p, tripnull[x]); p = hmm->ins[k][aacode[x]] * estmodel[aacode[x]][x] * (1.-dna2-dna4); hmm->dnai[x][k] = Prob2Score(p, tripnull[x]); } hmm->dnam[64][k] = 0; /* ambiguous codons score 0 (danger?) */ hmm->dna2 = Prob2Score(dna2, 1.); hmm->dna4 = Prob2Score(dna4, 1.); }} /* Function: PrintPlan7Stats() * * Purpose: Given a newly constructed HMM and the tracebacks * of the sequences it was trained on, print out all * the interesting information at the end of hmmb * and hmmt runs that convinces the user we actually * did something. * * Args: fp - where to send the output (stdout, usually) * hmm - the new HMM, probability form * dsq - digitized training seqs * nseq - number of dsq's * tr - array of tracebacks for dsq * * Return: (void) */voidPrintPlan7Stats(FILE *fp, struct plan7_s *hmm, char **dsq, int nseq, struct p7trace_s **tr){ int idx; /* counter for sequences */ float score; /* an individual trace score */ float total, best, worst; /* for the avg. and range of the scores */ float sqsum, stddev; /* for the std. deviation of the scores */ P7Logoddsify(hmm, TRUE); /* make sure model scores are ready */ /* find individual trace scores */ score = P7TraceScore(hmm, dsq[0], tr[0]); total = best = worst = score; sqsum = score * score; for (idx = 1; idx < nseq; idx++) { /* P7PrintTrace(stdout, tr[idx], hmm, dsq[idx]); */ score = P7TraceScore(hmm, dsq[idx], tr[idx]); total += score; sqsum += score * score; if (score > best) best = score; if (score < worst) worst = score; } if (nseq > 1) { stddev = (sqsum - (total * total / (float) nseq)) / ((float) nseq - 1.); stddev = (stddev > 0) ? sqrt(stddev) : 0.0; } else stddev = 0.0; /* print out stuff. */ fprintf(fp, "Average score: %10.2f bits\n", total / (float) nseq); fprintf(fp, "Minimum score: %10.2f bits\n", worst); fprintf(fp, "Maximum score: %10.2f bits\n", best); fprintf(fp, "Std. deviation: %10.2f bits\n", stddev);}/* Function: DegenerateSymbolScore() * * Purpose: Given a sequence character x and an hmm emission probability * vector, calculate the log-odds (base 2) score of * the symbol. * * Easy if x is in the emission alphabet, but not so easy * is x is a degenerate symbol. The "correct" Bayesian * philosophy is to calculate score(X) by summing over * p(x) for all x in the degenerate symbol X to get P(X), * doing the same sum over the prior to get F(X), and * doing log_2 (P(X)/F(X)). This gives an X a zero score, * for instance. * * Though this is correct in a formal Bayesian sense -- * we have no information on the sequence, so we can't * say if it's random or model, so it scores zero -- * it sucks, big time, for scoring biological sequences. * Sequences with lots of X's score near zero, while * real sequences have average scores that are negative -- * so the X-laden sequences appear to be lifted out * of the noise of a full histogram of a database search. * Correct or not, this is highly undesirable. * * So therefore we calculated the expected score of * the degenerate symbol by summing over all x in X: * e_x log_2 (p(x)/f(x)) * where the expectation of x, e_x, is calculated from * the random model. * * Empirically, this works; it also has a wooly hand-waving * probabilistic justification that I'm happy enough about. * * Args: p - probabilities of normal symbols * null - null emission model * ambig - index of the degenerate character in Alphabet[] * * Return: the integer log odds score of x given the emission * vector and the null model, scaled up by INTSCALE. */int DegenerateSymbolScore(float *p, float *null, int ambig){ int x; float numer = 0.; float denom = 0.; for (x = 0; x < Alphabet_size; x++) { if (Degenerate[ambig][x]) { numer += null[x] * sreLOG2(p[x] / null[x]); denom += null[x]; } } return (int) (INTSCALE * numer / denom);}/***************************************************************** * * Plan9/Plan7 interface * * Very important code during the evolutionary takeover by Plan7 -- * convert between Krogh/Haussler and Plan7 models. *****************************************************************//* Function: Plan9toPlan7() * * Purpose: Convert an old HMM into Plan7. Configures it in * ls mode. * * Args: hmm - old ugly plan9 style HMM * ret_plan7 - new wonderful Plan7 HMM * * Return: (void) * Plan7 HMM is allocated here. Free w/ FreePlan7(). */ voidPlan9toPlan7(struct plan9_s *hmm, struct plan7_s **ret_plan7){ struct plan7_s *plan7; int k, x; plan7 = AllocPlan7(hmm->M); for (k = 1; k < hmm->M; k++) { plan7->t[k][TMM] = hmm->mat[k].t[MATCH]; plan7->t[k][TMD] = hmm->mat[k].t[DELETE]; plan7->t[k][TMI] = hmm->mat[k].t[INSERT]; plan7->t[k][TDM] = hmm->del[k].t[MATCH]; plan7->t[k][TDD] = hmm->del[k].t[DELETE]; plan7->t[k][TIM] = hmm->ins[k].t[MATCH]; plan7->t[k][TII] = hmm->ins[k].t[INSERT]; } for (k = 1; k <= hmm->M; k++) for (x = 0; x < Alphabet_size; x++) plan7->mat[k][x] = hmm->mat[k].p[x]; for (k = 1; k < hmm->M; k++) for (x = 0; x < Alphabet_size; x++) plan7->ins[k][x] = hmm->ins[k].p[x]; plan7->tbd1 = hmm->mat[0].t[DELETE] / (hmm->mat[0].t[DELETE] + hmm->mat[0].t[MATCH]); /* We have to make up the null transition p1; use default */ P7DefaultNullModel(plan7->null, &(plan7->p1)); for (x = 0; x < Alphabet_size; x++) plan7->null[x] = hmm->null[x]; if (hmm->name != NULL) Plan7SetName(plan7, hmm->name); if (hmm->flags & HMM_REF) { strcpy(plan7->rf, hmm->ref); plan7->flags |= PLAN7_RF; } if (hmm->flags & HMM_CS) { strcpy(plan7->cs, hmm->cs); plan7->flags |= PLAN7_CS; } Plan7LSConfig(plan7); /* configure specials for ls-style alignment */ Plan7Renormalize(plan7); /* mainly to correct for missing ID and DI */ plan7->flags |= PLAN7_HASPROB; /* probabilities are valid */ plan7->flags &= ~PLAN7_HASBITS; /* scores are not valid */ *ret_plan7 = plan7;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -