📄 masks.c
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. ************************************************************//* masks.c * SRE, Tue Nov 18 10:12:28 1997 * * Sequence masking routines; corrections for biased composition * target sequences. * * The Claverie/States XNU code is not used by default because I * consider X'ing out sequence to be too black/white and too * aggressive, but it's available as an option. * * The Wooton/Federhen SEG code was studied, but deemed too * nonportable to include; it would've suffered the same drawback * as XNU. * * The TraceScoreCorrection() code is the default. * * RCS $Id: masks.c,v 1.13 2001/08/04 21:14:44 eddy Exp $ */#include <stdio.h>#include <math.h>#include <float.h>#include "squid.h"#include "config.h"#include "structs.h"#include "funcs.h"#ifdef MEMDEBUG#include "dbmalloc.h"#endif/* The PAM120 score matrix, in HMMER's AMINO_ALPHABET alphabetic order */static int xpam120[23][23] = { { 3, -3, 0, 0, -4, 1, -3, -1, -2, -3, -2, -1, 1, -1, -3, 1, 1, 0, -7, -4, 1, 0, 0 }, {-3, 9, -7, -7, -6, -4, -4, -3, -7, -7, -6, -5, -4, -7, -4, 0, -3, -3, -8, -1, -4, -6, 0 }, { 0, -7, 5, 3, -7, 0, 0, -3, -1, -5, -4, 2, -3, 1, -3, 0, -1, -3, -8, -5, 5, 3, 0 }, { 0, -7, 3, 5, -7, -1, -1, -3, -1, -4, -3, 1, -2, 2, -3, -1, -2, -3, -8, -5, 3, 5, 0 }, {-4, -6, -7, -7, 8, -5, -3, 0, -7, 0, -1, -4, -5, -6, -5, -3, -4, -3, -1, 4, -4, -5, 0 }, { 1, -4, 0, -1, -5, 5, -4, -4, -3, -5, -4, 0, -2, -3, -4, 1, -1, -2, -8, -6, 1, -1, 0 }, {-3, -4, 0, -1, -3, -4, 7, -4, -2, -3, -4, 2, -1, 3, 1, -2, -3, -3, -3, -1, 2, 2, 0 }, {-1, -3, -3, -3, 0, -4, -4, 6, -3, 1, 1, -2, -3, -3, -2, -2, 0, 3, -6, -2, -2, -2, 0 }, {-2, -7, -1, -1, -7, -3, -2, -3, 5, -4, 0, 1, -2, 0, 2, -1, -1, -4, -5, -5, 1, 0, 0 }, {-3, -7, -5, -4, 0, -5, -3, 1, -4, 5, 3, -4, -3, -2, -4, -4, -3, 1, -3, -2, -3, -2, 0 }, {-2, -6, -4, -3, -1, -4, -4, 1, 0, 3, 8, -3, -3, -1, -1, -2, -1, 1, -6, -4, -3, -1, 0 }, {-1, -5, 2, 1, -4, 0, 2, -2, 1, -4, -3, 4, -2, 0, -1, 1, 0, -3, -4, -2, 4, 1, 0 }, { 1, -4, -3, -2, -5, -2, -1, -3, -2, -3, -3, -2, 6, 0, -1, 1, -1, -2, -7, -6, -1, 0, 0 }, {-1, -7, 1, 2, -6, -3, 3, -3, 0, -2, -1, 0, 0, 6, 1, -2, -2, -3, -6, -5, 1, 5, 0 }, {-3, -4, -3, -3, -5, -4, 1, -2, 2, -4, -1, -1, -1, 1, 6, -1, -2, -3, 1, -5, -1, 0, 0 }, { 1, 0, 0, -1, -3, 1, -2, -2, -1, -4, -2, 1, 1, -2, -1, 3, 2, -2, -2, -3, 1, 0, 0 }, { 1, -3, -1, -2, -4, -1, -3, 0, -1, -3, -1, 0, -1, -2, -2, 2, 4, 0, -6, -3, 1, -1, 0 }, { 0, -3, -3, -3, -3, -2, -3, 3, -4, 1, 1, -3, -2, -3, -3, -2, 0, 5, -8, -3, -2, -2, 0 }, {-7, -8, -8, -8, -1, -8, -3, -6, -5, -3, -6, -4, -7, -6, 1, -2, -6, -8, 12, -2, -5, -6, 0 }, {-4, -1, -5, -5, 4, -6, -1, -2, -5, -2, -4, -2, -6, -5, -5, -3, -3, -3, -2, 8, -2, -4, 0 }, { 1, -4, 5, 3, -4, 1, 2, -2, 1, -3, -3, 4, -1, 1, -1, 1, 1, -2, -5, -2, 6, 4, 0 }, { 0, -6, 3, 5, -5, -1, 2, -2, 0, -2, -1, 1, 0, 5, 0, 0, -1, -2, -6, -4, 4, 6, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },};/* Function: XNU() * Date: 18 Nov 1997 [StL] * * Purpose: x-out of repetitive sequence. XNU tends to be * good at x'ing out short period tandem repeats. * * Note: Apply /only/ to protein sequence. * * Args: dsq: 1..len digitized sequence * len: length of dsq * * Return: number of characters x'ed out. */ intXNU(char *dsq, int len){ int i,k,off,sum,beg,end,top; int topcut,fallcut; double s0; int noff = 4; /* maximum search offset */ int mcut = 1; double pcut = 0.01; int *hit; double lambda = 0.346574; double K = 0.2; double H = 0.664; int xnum = 0; if (len == 0) return 0; hit = MallocOrDie(sizeof(int) * (len+1)); for (i=1; i<=len; i++) hit[i]=0; /* ** Determine the score cutoff so that pcut will be the fraction ** of random sequence eliminated assuming lambda, K, and H are ** characteristic of the database as a whole */ s0 = - log( pcut*H / (noff*K) ) / lambda; if (s0>0) topcut = floor(s0 + log(s0)/lambda + 0.5); else topcut = 0; fallcut = (int)log(K/0.001)/lambda; for (off=mcut; off<=noff; off++) { sum=top=0; beg=off; end=0; for (i=off+1; i<=len; i++) { sum += xpam120[(int) dsq[i]][(int) dsq[i-off]]; if (sum>top) { top=sum; end=i; } if (top>=topcut && top-sum>fallcut) { for (k=beg; k<=end; k++) hit[k] = hit[k-off] = 1; sum=top=0; beg=end=i+1; } else if (top-sum>fallcut) { sum=top=0; beg=end=i+1; } if (sum<0) { beg=end=i+1; sum=top=0; } } if (top>=topcut) { for (k=beg; k<=end; k++) hit[k] = hit[k-off] = 1; } } /* Now mask off detected repeats */ for (i=1; i<=len; i++) if (hit[i]) { xnum++; dsq[i] = Alphabet_iupac-1;} /* e.g. 'X' */ free(hit); return xnum;}/* Function: TraceScoreCorrection() * Date: Sun Dec 21 12:05:47 1997 [StL] * * Purpose: Calculate a correction (in integer log_2 odds) to be * applied to a sequence, using a second null model, * based on a traceback. M/I emissions are corrected; * C/N/J are not -- as if the nonmatching part and * matching part were each generated by the best null model. * The null model is constructed /post hoc/ as the * average over all the M,I distributions used by the trace. * * Return: the log_2-odds score correction. */floatTraceScoreCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq){ float p[MAXABET]; /* null model distribution */ int sc[MAXCODE]; /* null model scores */ int x; int tpos; int score; /* Set up model: average over the emission distributions of * all M, I states that appear in the trace. Ad hoc? Sure, you betcha. */ FSet(p, Alphabet_size, 0.0); for (tpos = 0; tpos < tr->tlen; tpos++) if (tr->statetype[tpos] == STM) FAdd(p, hmm->mat[tr->nodeidx[tpos]], Alphabet_size); else if (tr->statetype[tpos] == STI) FAdd(p, hmm->ins[tr->nodeidx[tpos]], Alphabet_size); FNorm(p, Alphabet_size); for (x = 0; x < Alphabet_size; x++) sc[x] = Prob2Score(p[x], hmm->null[x]); /* could avoid this chunk if we knew we didn't need any degenerate char scores */ for (x = Alphabet_size; x < Alphabet_iupac; x++) sc[x] = DegenerateSymbolScore(p, hmm->null, x); /* Score all the M,I state emissions that appear in the trace. */ score = 0; for (tpos = 0; tpos < tr->tlen; tpos++) if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STI) score += sc[(int) dsq[tr->pos[tpos]]]; /* Apply an ad hoc 8 bit fudge factor penalty; * interpreted as a prior, saying that the second null model is * 1/2^8 (1/256) as likely as the standard null model */ score -= 8 * INTSCALE; /* Return the correction to the bit score. */ return Scorify(ILogsum(0, score)); }/* THE FOLLOWING CODE IS IN DEVELOPMENT. * it is commented out of the current release deliberately. * If you activate it, I'm not responsible for the consequences. */#if MICHAEL_JORDAN_BUYS_THE_PACERS/* Function: NewTraceScoreCorrection() * Date: Wed Feb 17 14:32:45 1999 [StL] * * Purpose: Calculate a correction (in integer log_2 odds) to be * applied to a sequence, using a second null model, * based on sequence endpoints. M/I emissions are corrected; * C/N/J are not -- as if the nonmatching part and * matching part were each generated by the best null model. * Each null model is constructed /post hoc/ from the * sequence composition of each matching domain (e.g. * a null2 model is constructed for each domain in a * multihit trace). * * Constraints on the construction of this function include: * 1) Paracel hardware can't deal with trace-dependent * null2 models. Original implementation of * TraceScoreCorrection() was dependent on traceback * and could not be reproduced on GeneMatcher. * GeneMatcher may be able to deal w/ sequence endpoint * dependent rescoring, though. * Although this function looks like it's trace- * dependent (because it's being passed a p7trace_s * structure), it's really not; only the sequence * endpoints are being used. * * 2) It is desirable that for multihit traces, * per-domain scores sum to the per-sequence score. * Otherwise people see this as a "bug" (cf. * bug #2, David Kerk, NRC). HMMER calculates the * per-domain scores by going through a separate * TraceScore() call for each one and separately * correcting them with TraceScoreCorrection(), * so we have to do each domain in a full trace * by a similar mechanism -- even if this means that * we're adopting a very dubiously post hoc * null model. * * Return: the log_2-odds score correction. */floatNewTraceScoreCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq){ float ct[MAXABET]; /* counts of observed residues */ float p[MAXABET]; /* null2 model distribution (also counts) */ float sc[MAXCODE]; /* null2 model scores (as floats not int) */ int x; int tpos; int score; /* tmp score for real HMM, integer logodds */ float hmmscore; /* score for real HMM for this domain */ float null2score; /* score for null2 model for this domain */ float totscore; /* overall score for trace */ float maxscore; /* best score so far for single domain */ int in_domain; /* flag for whether we're counting this domain */ int sym; /* digitized symbol in dsq */ int ndom; /* number of domains counted towards score */ int nsym; /* number of symbols in this alignment */ totscore = 0.; maxscore = -FLT_MAX; in_domain = FALSE; ndom = 0; for (tpos = 0; tpos < tr->tlen; tpos++) { /* detect start of domain; start at N or J */ if (tpos < tr->tlen-1 && tr->statetype[tpos+1] == STB) { FCopy(ct, hmm->null, Alphabet_size); /* simple Dirichlet prior */ score = 0; null2score = 0.; nsym = 0; in_domain = TRUE; } /* Count stuff in domain starting with N->B or J->B transition */ if (in_domain) { sym = (int) dsq[tr->pos[tpos]]; /* count emitted symbols in domain */ if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STI) { P7CountSymbol(ct, sym, 1.0); nsym++; } /* score emitted symbols in domain towards HMM */ if (tr->statetype[tpos] == STM) score += hmm->msc[sym][tr->nodeidx[tpos]]; else if (tr->statetype[tpos] == STI) score += hmm->isc[sym][tr->nodeidx[tpos]]; /* score transitions in domain towards HMM */ score += TransitionScoreLookup(hmm, tr->statetype[tpos], tr->nodeidx[tpos], tr->statetype[tpos+1], tr->nodeidx[tpos+1]); } if (tr->statetype[tpos] == STE) /* done w/ a domain; calc its score */ { /* convert counts to null2 prob distribution */ FCopy(p, ct, Alphabet_size); FNorm(p, Alphabet_size); /* Convert probs to log-odds_e scores */ /* p can't be zero, because of prior */ for (x = 0; x < Alphabet_size; x++) sc[x] = log(p[x] / hmm->null[x]); /* null2 score = counts \dot scores */ null2score = FDot(ct, sc, Alphabet_size); printf("NSYM = %d NULL2 = %.1f\n", nsym, null2score); /* Apply an ad hoc 12 bit fudge factor penalty, per domain. * Interpreted probabilistically, saying that there's about * a 1/256 probability to transition into the second null model. */ null2score -= 12.; /* Now correct score1 using the null2 score. * If it's still > 0, add it to accumulated score. */ hmmscore = Scorify(score); hmmscore -= 1.44269504 * LogSum(0, null2score); if (hmmscore > 0.) { totscore += hmmscore; ndom++; } if (hmmscore > maxscore) maxscore = hmmscore; in_domain = FALSE; } } /* Single domain special case. */ if (ndom == 0) totscore = maxscore; /* Return the correction to the bit score */ return (P7TraceScore(hmm, dsq, tr) - totscore);}#endif /*0*/floatSantaCruzCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq){ return 0.0; /* UNFINISHED CODE */}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -