📄 modelmakers.c
字号:
/* Initialize insert counters to zero */ FSet(insc, Alphabet_size, 0.); for (idx = 0; idx < msa->nseq; idx++) insopt[idx] = 0; sc[i] = -FLT_MAX; for (j = i+1; j <= last; j++) { /* build transition matrix for column pair i,j */ code = build_cij(msa->aseq, msa->nseq, insopt, i, j, msa->wgt, cij); if (code == -1) break; /* no j to our right can work for us */ if (code == 1) { FCopy(tij, cij, 7); P7PriorifyTransitionVector(tij, prior, prior->tq); FNorm(tij, 3); tij[TMM] = sreLOG2(tij[TMM] / null_p1); tij[TMI] = sreLOG2(tij[TMI] / null_p1); tij[TMD] = sreLOG2(tij[TMD]); tij[TIM] = sreLOG2(tij[TIM] / null_p1); tij[TII] = sreLOG2(tij[TII] / null_p1); tij[TDM] = sreLOG2(tij[TDM] / null_p1); tij[TDD] = sreLOG2(tij[TDD]); /* calculate the score of using this j. */ new = sc[j] + FDot(tij, cij, 7) + FDot(insp, insc, Alphabet_size); SQD_DPRINTF2(("%3d %3d new=%6.2f scj=%6.2f m=%6.2f i=%6.2f t=%6.2f\n", i, j, new, sc[j], FDot(matp, matc[i], Alphabet_size), FDot(insp, insc, Alphabet_size), FDot(tij, cij, 7))); /* keep it if it's better */ if (new > sc[i]) { sc[i] = new; tbck[i] = j; } } /* bump insc, insopt insert symbol counters */ FAdd(insc, matc[j], Alphabet_size); for (idx = 0; idx < msa->nseq; idx++) if (!isgap(msa->aseq[idx][j-1])) insopt[idx]++; } /* add in constant contributions for col i */ /* note ad hoc scaling of mpri by wgtsum (us. nseq)*/ sc[i] += FDot(matp, matc[i], Alphabet_size) + mpri * wgtsum; } /* end loop over start positions i */ /* Termination: place the begin state. * log odds score for S->N->B is all zero except for NB transition, which * is a constant. So we only have to evaluate BM transitions. */ bestsc = -FLT_MAX; for (i = 1; i <= last; i++) { new = sc[i]; for (idx = 0; idx < msa->nseq; idx++) { if (isgap(msa->aseq[idx][j-1])) new += bm2; /* internal B->M transition */ else new += bm1; /* B->M1 transition */ } if (new > bestsc) { bestsc = new; first = i; } } /* Traceback */ matassign[0] = 0; for (i = 1; i <= msa->alen; i++) matassign[i] = ASSIGN_INSERT; for (i = first; i != 0; i = tbck[i]) { matassign[i] &= ~ASSIGN_INSERT; matassign[i] |= ASSIGN_MATCH; } /* Hand matassign off for remainder of model construction */ /* print_matassign(matassign, ainfo->alen); */ matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr); /* Clean up. */ for (i = 1; i <= msa->alen; i++) free(matc[i]); free(matc); free(sc); free(tbck); free(matassign); free(insopt);}/* Function: build_cij() * * Purpose: Construct a counts vector for transitions between * column i and column j in a multiple alignment. * * '_' gap characters indicate "external" gaps which * are to be dealt with by B->M and M->E transitions. * These characters must be placed by a preprocessor. * * insopt is an "insert optimization" -- an incrementor * which keeps track of the number of insert symbols * between i and j. * * Args: aseqs - multiple alignment. [0.nseq-1][0.alen-1] * nseq - number of seqs in aseqs * insopt - number of inserts per seq between i/j [0.nseq-1] * i - i column [1.alen], off by one from aseqs * j - j column [1.alen], off by one from aseqs * wgt - per-seq weights [0.nseq-1] * cij - transition count vectors [0..7] * * Return: -1 if an illegal transition was seen for this i/j assignment *and* * we are guaranteed that any j to the right will also * have illegal transitions. * 0 if an illegal transition was seen, but a j further to the * right may work. * 1 if all transitions were legal. */ static int build_cij(char **aseqs, int nseq, int *insopt, int i, int j, float *wgt, float *cij){ int idx; /* counter for seqs */ i--; /* make i,j relative to aseqs [0..alen-1] */ j--; FSet(cij, 8, 0.); /* zero cij */ for (idx = 0; idx < nseq; idx++) { if (insopt[idx] > 0) { if (isgap(aseqs[idx][i])) return -1; /* D->I prohibited. */ if (isgap(aseqs[idx][j])) return 0; /* I->D prohibited. */ cij[TMI] += wgt[idx]; cij[TII] += (insopt[idx]-1) * wgt[idx]; cij[TIM] += wgt[idx]; } else { if (!isgap(aseqs[idx][i])) { if (aseqs[idx][j] == '_') ; /* YO! what to do with trailer? */ else if (isgap(aseqs[idx][j])) cij[TMD] += wgt[idx]; else cij[TMM] += wgt[idx]; } else { /* ignores B->E possibility */ if (aseqs[idx][j] == '_') continue; else if (isgap(aseqs[idx][j])) cij[TDD] += wgt[idx]; else cij[TDM] += wgt[idx]; } } } return 1;}/* Function: estimate_model_length() * * Purpose: Return a decent guess about the length of the model, * based on the lengths of the sequences. * * Algorithm is dumb: use weighted average length. * * Don't assume that weights sum to nseq! */static intestimate_model_length(MSA *msa){ int idx; float total = 0.; float wgtsum = 0.; for (idx = 0; idx < msa->nseq; idx++) { total += msa->wgt[idx] * DealignedLength(msa->aseq[idx]); wgtsum += msa->wgt[idx]; } return (int) (total / wgtsum);} /* Function: matassign2hmm() * * Purpose: Given an assignment of alignment columns to match vs. * insert, finish the final part of the model construction * calculation that is constant between model construction * algorithms. * * Args: msa - multiple sequence alignment * dsq - digitized unaligned aseq's * matassign - 1..alen bit flags for column assignments * ret_hmm - RETURN: counts-form HMM * ret_tr - RETURN: array of tracebacks for aseq's * * Return: (void) * ret_hmm and ret_tr alloc'ed here for the calling * modelmaker function. */static voidmatassign2hmm(MSA *msa, char **dsq, int *matassign, struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr){ struct plan7_s *hmm; /* RETURN: new hmm */ struct p7trace_s **tr; /* fake tracebacks for each seq */ int M; /* length of new model in match states */ int idx; /* counter over sequences */ int apos; /* counter for aligned columns */ /* how many match states in the HMM? */ M = 0; for (apos = 1; apos <= msa->alen; apos++) { if (matassign[apos] & ASSIGN_MATCH) M++; } /* delimit N-terminal tail */ for (apos=1; matassign[apos] & ASSIGN_INSERT && apos <= msa->alen; apos++) matassign[apos] |= EXTERNAL_INSERT_N; if (apos <= msa->alen) matassign[apos] |= FIRST_MATCH; /* delimit C-terminal tail */ for (apos=msa->alen; matassign[apos] & ASSIGN_INSERT && apos > 0; apos--) matassign[apos] |= EXTERNAL_INSERT_C; if (apos > 0) matassign[apos] |= LAST_MATCH; /* print_matassign(matassign, msa->alen); */ /* make fake tracebacks for each seq */ fake_tracebacks(msa->aseq, msa->nseq, msa->alen, matassign, &tr); /* build model from tracebacks */ hmm = AllocPlan7(M); ZeroPlan7(hmm); for (idx = 0; idx < msa->nseq; idx++) { /* P7PrintTrace(stdout, tr[idx], NULL, NULL); */ P7TraceCount(hmm, dsq[idx], msa->wgt[idx], tr[idx]); } /* annotate new model */ annotate_model(hmm, matassign, msa); /* Set #=RF line of alignment to reflect our assignment * of match, delete. matassign is valid from 1..alen and is off * by one from msa->rf. */ if (msa->rf != NULL) free(msa->rf); msa->rf = (char *) MallocOrDie (sizeof(char) * (msa->alen + 1)); for (apos = 0; apos < msa->alen; apos++) msa->rf[apos] = matassign[apos+1] & ASSIGN_MATCH ? 'x' : '.'; msa->rf[msa->alen] = '\0'; /* Cleanup and return. */ if (ret_tr != NULL) *ret_tr = tr; else { for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]); free(tr); } if (ret_hmm != NULL) *ret_hmm = hmm; else FreePlan7(hmm); return;} /* Function: fake_tracebacks() * * Purpose: From a consensus assignment of columns to MAT/INS, construct fake * tracebacks for each individual sequence. * * Note: Fragment tolerant by default. Internal entries are * B->M_x, instead of B->D1->D2->...->M_x; analogously * for internal exits. * * Args: aseqs - alignment [0..nseq-1][0..alen-1] * nseq - number of seqs in alignment * alen - length of alignment in columns * matassign - assignment of column; [1..alen] (off one from aseqs) * ret_tr - RETURN: array of tracebacks * * Return: (void) * ret_tr is alloc'ed here. Caller must free. */ static voidfake_tracebacks(char **aseq, int nseq, int alen, int *matassign, struct p7trace_s ***ret_tr){ struct p7trace_s **tr; int idx; /* counter over sequences */ int i; /* position in raw sequence (1..L) */ int k; /* position in HMM */ int apos; /* position in alignment columns */ int tpos; /* position in traceback */ tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * nseq); for (idx = 0; idx < nseq; idx++) { P7AllocTrace(alen+6, &tr[idx]); /* allow room for S,N,B,E,C,T */ /* all traces start with S state... */ tr[idx]->statetype[0] = STS; tr[idx]->nodeidx[0] = 0; tr[idx]->pos[0] = 0; /* ...and transit to N state; N-term tail is emitted on N->N transitions */ tr[idx]->statetype[1] = STN; tr[idx]->nodeidx[1] = 0; tr[idx]->pos[1] = 0; i = 1; k = 0; tpos = 2; for (apos = 0; apos < alen; apos++) { tr[idx]->statetype[tpos] = STBOGUS; /* bogus, deliberately, to debug */ if (matassign[apos+1] & FIRST_MATCH) { /* BEGIN */ tr[idx]->statetype[tpos] = STB; tr[idx]->nodeidx[tpos] = 0; tr[idx]->pos[tpos] = 0; tpos++; } if (matassign[apos+1] & ASSIGN_MATCH && ! isgap(aseq[idx][apos])) { /* MATCH */ k++; /* move to next model pos */ tr[idx]->statetype[tpos] = STM;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -