⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 camjul97.c

📁 这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer
💻 C
📖 第 1 页 / 共 2 页
字号:
/* Source code from Cambridge visit July 1997 * * Position-specific matrices. */#include "config.h"#include "squidconf.h"#include <stdlib.h>#include <stdio.h>#include <string.h>#include <limits.h>#include <float.h>#include <math.h>		#include "funcs.h"#include "structs.h"#include "squid.h"/* Function: MakeStarHMM() *  * Purpose:  Given an HMM with counts, create an HMM according *           to the star rule. In star models we typically expect *           that the counts have been collected using BLOSUM style *           weights. *            * Args:     hmm - HMM structure containing counts data *           mx  - Star vectors, mx[q][x] *           pq  - vector prior P(q) *           nq  - number of vectors *           pri - Dirichlet priors for other parameters  *            * Return:   (void) *           hmm is converted to probabilities.               */voidMakeStarHMM(struct plan7_s *hmm, float **mx, float *pq, int nq, struct p7prior_s *pri){  int    k;			/* counter over model position       */  int    x;			/* counter over symbol/transition    */  float *pxa;                   /* P(x | a) : our parameter estimate */  float *pqa;			/* P(q | a) for all q                */  int    q;			/* counters over vectors q           */  int    ai;			/* counter over symbols              */  /* Match emissions: Star rule implementation.   */  pxa = (float *) MallocOrDie(sizeof(float) * Alphabet_size);   pqa = (float *) MallocOrDie(sizeof(float) * nq);   for (k = 1; k <= hmm->M; k++)    {			/* calculate log P(q | a) unnormalized (i.e. + log P(a))*/      for (q = 0; q < nq; q++)	{	  pqa[q] = log(pq[q]);	  for (ai = 0; ai < Alphabet_size; ai++)	    pqa[q] += hmm->mat[k][ai] * log(mx[q][ai]);	}			/* calculate log P(x | a) unnormalized (i.e + log P(a))*/      for (x = 0; x < Alphabet_size; x++)	{	  pxa[x] = pqa[0] + log(mx[0][x]);	  for (q = 1; q < nq; q++)	    pxa[x] = LogSum(pxa[x], (pqa[q] + log(mx[q][x])));	}			/* normalize now to get P(x|a) and store */      LogNorm(pxa, Alphabet_size);                   FCopy(hmm->mat[k], pxa, Alphabet_size);      }    /* Everything else is done according to P7PriorifyHMM()   */  /* Model-dependent transitions are handled simply; Laplace.   */  FSet(hmm->begin+2, hmm->M-1, 0.);     /* wipe internal BM entries */  FSet(hmm->end+1, hmm->M-1, 0.);	/* wipe internal ME exits   */  hmm->tbd1        += 1.0;  hmm->begin[1]    += 1.0;  /* Main model transitions and insert emissions   */  for (k = 1; k < hmm->M; k++)    {      P7PriorifyTransitionVector(hmm->t[k], pri);      P7PriorifyEmissionVector(hmm->ins[k], pri, pri->inum, pri->iq, pri->i, NULL);    }  Plan7Renormalize(hmm);  free(pxa);  free(pqa);  return;} #ifdef SRE_REMOVED/* Function: MakeIslandHMM() * * Purpose:  Given a sequence alignment of i = 1..nseq sequences, *           with columns j = 1..alen; and a sequence index idx *           to build the island from. Return a Plan7 island HMM in *           probability form. * * Args:     aseqs  - alignment *           ainfo  - alignment info *           idx    - index of which sequence to build island from *           null   - random sequence model [0..Alphabet_size-1] *           mx     - probability matrices mx[q][root b][x] *           bpri   - priors on root distributions bpri[q][root b] *           qpri   - prior probability distribution over matrices *           nmx    - number of joint probability matrices  * * Return:   a new Plan7 HMM  */struct plan7_s *MakeIslandHMM(char **aseqs, AINFO *ainfo, int idx, 	      float null[MAXABET], float ***mx, float **bpri, 	      float *qpri, int nmx){  struct plan7_s *hmm;		/* RETURN: Plan7 HMM                   */  int             j;		/* column position index               */  int             k;		/* model position index                */  int             q;		/* counter for matrices                */  int             x;		/* counter for symbols                 */  float          *mat;		/* a match emission probability vector */  float         **probq;	/* posterior P(q | column)             */  int             sym;		/* index of a symbol in alphabet       */  float           max;  int             qmax;  float         **pxaq;         /* P(x | a,q) vectors, [q][x]     */    int             b;		/* counter over root symbols */  /* Allocate a model which is the length of the   * raw sequence.   */   hmm  = AllocPlan7(DealignedLength(aseqs[idx]));  if (ainfo->sqinfo[idx].flags & SQINFO_NAME)    Plan7SetName(hmm, ainfo->sqinfo[idx].name);  if (ainfo->sqinfo[idx].flags & SQINFO_DESC)    Plan7SetDescription(hmm, ainfo->sqinfo[idx].desc);  Plan7SetNullModel(hmm, null, 350./351.); /* p1 made up; shouldn't matter*/  mat = (float *) MallocOrDie( sizeof(float) * Alphabet_size);  pxaq = FMX2Alloc(nmx, Alphabet_size);  /* Calculate the posterior probability distribution   * probq (= P(q | col)) over nmx different matrices   * at each column j -- probq[0..alen-1][0..nmx-1];   * currently does not use the prior on q, but does a   * winner-take-all rule.     */  probq = FMX2Alloc(ainfo->alen, nmx);  calc_probq(aseqs, ainfo, mx, bpri, qpri, nmx, probq);  /* Debugging   */  print_probq(stdout, probq, ainfo->alen, nmx);   for (k = 1, j = 0; j < ainfo->alen; j++)    {      if (isgap(aseqs[idx][j])) continue;      if (strchr(Alphabet, aseqs[idx][j]) != NULL)	sym = SYMIDX(aseqs[idx][j]);      else 	Die("MakeIslandHMM() can't handle ambiguous query symbols yet");      /* Calculate P(x | a, q) emission vectors for all matrices q       */      for (q = 0; q < nmx; q++) 	{	  for (x = 0; x < Alphabet_size; x++)	    {	      pxaq[q][x] = 0.0;	      for (b = 0; b < 20; b++)		pxaq[q][x] += mx[q][b][x] * mx[q][b][sym] * bpri[q][b];	    }	  FNorm(pxaq[q], Alphabet_size);	}	      /* Sum P(x | a, q) emission vectors over matrices q:       *  P(x | a, col) = \sum_q P(x | a, q, col) P(q | a, col)       *                = \sum_q P(x | a, q) P(q | col)       */      for (x = 0; x < Alphabet_size; x++) 	{	  hmm->mat[k][x] = 0.;	  for (q = 0; q < nmx; q++)	    hmm->mat[k][x] += probq[j][q] * pxaq[q][x];	  if (k < hmm->M)	    hmm->ins[k][x] = null[x];	}	        /* Reference annotation on columns: most probable matrix       */      max = -FLT_MAX;      for (q = 0; q < nmx; q++)	if (probq[j][q] > max) { qmax = q; max = probq[j][q]; }      hmm->rf[k] = 'a'+(char)qmax;   /* q > 9, so convert to char a-z*/            /* Consensus annotation on columns: original sequence.       */      hmm->cs[k] = aseqs[idx][j];      k++;    }  /* State transitions are set subjectively   */  hmm->tbd1 = 0.02;  for (k = 1; k < hmm->M; k++)    {      hmm->t[k][TMM] = 0.97;      hmm->t[k][TMI] = 0.02;      hmm->t[k][TMD] = 0.01;      hmm->t[k][TIM] = 0.20;      hmm->t[k][TII] = 0.80;      hmm->t[k][TDM] = 0.90;      hmm->t[k][TDD] = 0.10;    }    hmm->flags |= PLAN7_HASPROB | PLAN7_RF | PLAN7_CS;  FMX2Free(pxaq);  FMX2Free(probq);  free(mat);   return hmm;}#endif/* Function: ReadGJMMatrices() *  * Purpose:  Read GJM's file format for star-based mixture matrices. *           Very first line is nq. *           First line of a set is P(q), the prior of the matrix. *           Second line contains P(b|q), the prior of the root symbols, *              _in arbitrary order_ (the root distribution is not over AA's!)  *           Third line is blank. *           Next 20 lines give a 20x20 matrix of conditional probabilities; *             rows = root symbols b; cols = leaf symbols x; *             mx[row][col] = P(x | b).  *     *           Instead of storing as matrices, store as q x r vectors. * * Return:   (void) *           mx, pq, nq are returned via passed pointers. *           Caller must free FMX2Free(mx) *           Caller must free(pq).  */voidReadGJMMatrices(FILE *fp, float ***ret_mx, float **ret_pq, int *ret_nq){  float  **mx;			/* conditional p's [0..nq-1][0..19] */  float   *pq;	         	/* priors on vectors, [0..nq-1] */  int      nq, nr;		/* number of matrices, rows */  char buf[2048];  float tmppq;			/* prior for matrix */  int  q,r;			/* counter for matrices, rows */  int  x;			/* counter for symbols */  char *s;			/* tmp pointer into buf    */  				/* allocations */  if (fgets(buf, 2048, fp) == NULL) Die("read failed");  nr  = 20;  nq  = atoi(buf);    mx  = FMX2Alloc(nq*nr, 20);  pq  = (float *) MallocOrDie (nq*nr * sizeof(float));				/* parse matrices */  for (q = 0; q < nq; q++)     {      if (fgets(buf, 2048, fp) == NULL) Die("parse failed");       tmppq = atof(buf);      if (fgets(buf, 2048, fp) == NULL) Die("parse failed");       s = strtok(buf, "\n\t ");      for (r = 0; r < nr; r++) 	{	  pq[q*nr + r] = atof(s) * tmppq;	  s = strtok(NULL, "\n\t ");	}      if (fgets(buf, 2048, fp) == NULL) Die("parse failed");      for (r = 0; r < 20; r++)	{	  if (fgets(buf, 2048, fp) == NULL) Die("parse failed");	  s = strtok(buf, "\n\t ");	  for (x = 0; x < 20; x++)	    {	      mx[q*nr+r][x] = atof(s);	      s = strtok(NULL, "\n\t ");	    }	}				/* two blank lines */      if (fgets(buf, 2048, fp) == NULL) Die("parse failed");      if (fgets(buf, 2048, fp) == NULL) Die("parse failed");    }  *ret_mx  = mx;  *ret_pq  = pq;   *ret_nq  = nq*nr;  return;}#ifdef SRE_REMOVED/* Function: OldReadGJMMatrices() *  * Purpose:  Read GJM's file format for joint probability matrix sets. *           * Return:   (void) *           mx, qprior, nmx are returned via passed pointers. *           Caller must free mx: each matrix by FMX2Free(), then free(mx). *           Caller must also free(qprior).  */voidOldReadGJMMatrices(FILE *fp, float ****ret_mx, float **ret_qprior, int *ret_nmx){  float ***mx;			/* joint prob matrix [0..nmx-1][0..19][0..19] */  float   *qprior;		/* priors on matrices, [0..nmx-1] */  int  nmx;			/* number of matrices   */  char buf[2048];  int  q;			/* counter for matrices */  int  idx;			/* index for this matrix seen in file */  int  r,c;			/* counter for row, column */  char *s;			/* tmp pointer into buf    */  				/* pass one: count matrices */  nmx = 0;  while (fgets(buf, 2048, fp) != NULL)     if (Strparse("use [0-9]+ = .+", buf, 0) == 0)       nmx++;  rewind(fp);				/* allocations */  qprior = (float *) MallocOrDie (20 * sizeof(float));  mx     = (float ***) MallocOrDie (nmx * sizeof(float **));  for (q = 0; q < nmx; q++)    mx[q] = FMX2Alloc(20, 20);				/* pass two: parse matrices */  q = 0;  while (fgets(buf, 2048, fp) != NULL)     {      if (Strparse("use ([0-9]+) = (.+)", buf, 2) != 0) 	continue;      idx       = atoi(sqd_parse[1]);      qprior[q] = atof(sqd_parse[2]);				/* skip two lines in his new format */      if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed");      if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed");      for (r = 0; r < 20; r++)	{	  if (fgets(buf, 2048, fp) == NULL) 	    Die("ReadGJMMatrices(): parse failed");	  s = strtok(buf, "\n\t ");	  for (c = 0; c < 20; c++)	    {	      mx[q][r][c] = atof(s);	      s = strtok(NULL, "\n\t ");	    }	}      q++;    }  *ret_mx     = mx;  *ret_qprior = qprior;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -