oops.c

来自「EM算法的改进」· C语言 代码 · 共 342 行

C
342
字号
/* * $Id: oops.c 1339 2006-09-21 19:46:28Z tbailey $ *  * $Log$ * Revision 1.4  2006/03/08 20:50:11  nadya * merge chamges from v3_5_2 branch * * Revision 1.3.4.1  2006/01/24 20:44:08  nadya * update copyright * * Revision 1.3  2006/01/09 02:47:09  tbailey * Fixed bug in e_step: put parentheses around conditional in * 	double log_pXijtheta = LOG(not_o[j]) + (conditional) * * Revision 1.2  2005/10/25 19:06:39  nadya * rm old macro for Header, all info is taken care of by Id and Log. * * Revision 1.1.1.1  2005/07/29 17:23:52  nadya * Importing from meme-3.0.14, and adding configure/make * *//************************************************************************								       **	MEME++							       **	Copyright 1994-2006, The Regents of the University of California    **	Author: Timothy L. Bailey				       **								       ************************************************************************//* tlb 3-10-00; remove updating of background model *//* tlb 7-23-99; add nsites and nsites_obs to model; keep lambda <= 1 *//* tlb 7-12-99; multiply pXijtheta by not_o_ij in e_step before calc of zij *//* tlb 7-02-99; remove clobbering of theta by using logtheta[] in model *//* tlb 6-28-99; add prior to lambda estimation using wnsites *//* tlb 6-23-99; use same method for background counts for all models *//* tlb 6-23-99; use average probability of strand/rc strand for background *//* tlb 6-23-99; remove regularization of background *//* tlb 6-22-99; precalculate background probability prior to inner loop *//* tlb 6-22-99; combine zoops.c into this file *//* tlb 6-22-99; change theta to obs in many places *//* tlb 6-21-99; changed strand directions to compute     szik[j] = Pr(z_ij=1, s_ijk=1 | X_i, \theta)    zi[j] = Pr(z_ij=1 | X_i, \theta) = sum_k=0^NDIR-1 szik[j]*//* tlb 9-13-94; added different strand directions *//**********************************************************************//*	EM algorithm*//**********************************************************************/#include "meme.h"/**********************************************************************//*  m_step 	Do the M step of EM.	Calculate the expected residue frequencies using the	posterior probabilities of the site starts for each	sequence.	Time: O(n_samples*lseq*lseq)*//**********************************************************************/void m_step(  MODEL *model,			/* the model */  DATASET *dataset,		/* the dataset */  PRIORS *priors,		/* the priors */  double wnsites 		/* weight on prior on nsites */){  int i, j, k, ii, jj;  THETA theta = model->theta;		/* theta of motif */  THETA obs = model->obs;		/* observed frequencies of motif */  int w = model->w; 			/* width of motif */  BOOLEAN invcomp = model->invcomp;	/* use reverse complement strand, too */  int ndir = invcomp ? 2 : 1;		/* number of strands */  int alength = dataset->alength;	/* length of alphabet */  int n_samples = dataset->n_samples;	/* number of sequences in dataset */  SAMPLE **samples = dataset->samples;	/* samples */  double *back = dataset->back;		/* background frequencies */  double q=0;				/* $Q$; sum of motif expectations */  PTYPE ptype = priors->ptype;		/* type of prior */  PriorLib *plib = priors->plib;	/* library for mixture priors */  double total, total_obs;		/* scratch */  double loglike0 = 0;			/* log-likelihood of sites under null */  double loglike1;			/* log-likelihood of sites undr motif */  double *p_count = priors->prior_count;/* pseudo counts */  /* M step */  /* initialize the observed frequency matrix to zero */  for (i=0; i<w; i++) {    for (j=0; j < alength; j++) obs(i, j) = 0;   }  /* calculate the expected frequencies of residues in the different      site positions   */  /* get the expected COUNTS, $c_k$ */  for (i=0; i < n_samples; i++) { 		/* sequence */    SAMPLE *s = samples[i];			/* array of samples */    int lseq = s->length;			/* length this seq */    double sw = s->sw;				/* sequence weight */    double *lcb = s->logcumback;		/* cumulative backgnd. prob. */    double qi = 0;				/* sum of z_ij */    int last_j = lseq-w;			/* last site start */    if (lseq < w) continue;			/* skip sequence; too short */    /* get the counts on the two strands (DNA) */    for (k=0; k<ndir; k++) {			/* strand direction */      BOOLEAN ic = (k==1);			/* doing - strand */      double *szik = s->sz[k];			/* sz_ik */            for (j=0; j<=last_j; j++) {		/* site start */        int off = ic ? lseq-w-j : j;		/* - strand offset from rgt. */        char *res = ic ? s->resic+off : s->res+off;	/* integer sequence */	double z = szik[j] * sw;		/* weight for motif */	/* calculate: E[theta | X, z] */        for (ii=0; ii<w; ii++) {		/* position in sequence */	  int r = res[ii];          if (r<alength) {			/* normal letter */	    obs(ii, r) += z;			/* motif counts */          } else { 				/* 'X': spread z over all */            for (jj=0; jj<alength; jj++) obs(ii, jj) += z * back[jj];          }	} /* sequence position */	/* 	  calculate log-likelihood of sites under background model	*/        loglike0 += z * Log_back(lcb, j, w);	qi += z;				/* sum of z_ij */      } /* site start */    } /* strand */    q += qi;					/* $Q$; sum of q_i */  }  model->mll_0 = loglike0;			/* site log like. background */  /*      M step for theta: convert COUNTS $c_k$ to FREQUENCIES $f_k$      and use frequencies as estimate for theta--- $p_k^{(t+1)} = f_k$   */  /* convert counts to frequencies and regularize using pseudo-counts */  for (i=0; i<w; i++) {				/* width */    total = total_obs = 0.0;			/* total count for position i */    /* get the total observed counts */    for (j=0; j<alength; j++) total_obs += obs(i, j);    /* get pseudo counts using prior */    if (ptype == Dmix || ptype == Mega || ptype == MegaP)      mixture_regularizer(obs[i], plib, p_count);    /* adjust counts and total count by prior counts */    for (j=0; j<alength; j++) {      total += theta(i, j) = obs(i, j) + p_count[j];    }    /* normalize counts to probabilities */    for (j=0; j<alength; j++) {      obs(i, j) /= (total_obs ? total_obs : 1);	/* normalize to frequencies */      theta(i, j) /= total;			/* normalize to probability */     }  } /* w */  /* palindrome: enforce constraints */  if (model->pal) {    palindrome(theta, theta, w, alength);    palindrome(obs, obs, w, alength);  }  /* compute log likelihood of sites under motif model */  loglike1 = 0;  for (i=0; i<w; i++) {    for (j=0; j<alength; j++) {      double f = obs(i, j);			/* letter frequency */      if (f) loglike1 += f * LOG(f);		/* motif likelihood */    } /* letter */  } /* position in site */  model->mll_1 = loglike1 *= q;			/* site log like. under motif */  /* M step for observed lambda */  model->nsites_obs = q;  model->lambda_obs = MIN(model->nsites_obs / wps(dataset, w), 1.0);  /* M step for estimated lambda */  model->nsites = q*(1-wnsites) + model->psites*wnsites;  model->lambda = MIN(model->nsites / wps(dataset, w), 1.0);} /* m_step *//**********************************************************************//*	e_step	Do the E step of EM. 	OOPS and ZOOPS models.	Updates z array.	Returns log Pr(X | theta).	Time: O(n_samples*lseq*w)	See notes 9/13/94*//**********************************************************************/double e_step(  MODEL *model,                 /* the model */  DATASET *dataset		/* the dataset */){  int i, j, k, ii;  MTYPE mtype = model->mtype;		/* type of model */  THETA logtheta1 = model->logtheta;	/* motif log(theta) */  int w = model->w;			/* width of motif */  int n_samples = dataset->n_samples;	/* number of sequences */  BOOLEAN invcomp = model->invcomp;	/* use reverse complement strand, too */  int ndir = invcomp ? 2 : 1;		/* number of strands */  double log_sigma = log(1.0/ndir);	/* log \sigma */  double lambda = (mtype==Zoops) ? model->lambda : 0;	/* lambda */  double gamma = (mtype==Zoops) ? MIN(1.0,(lambda*wps(dataset,w))/n_samples) : 0;  double log_1mgamma = (mtype==Zoops) ? LOG(1.0 - gamma) : 0;  double logpX;				/* log Pr(X | \theta) */  /* E step */  convert_theta_to_log(model, dataset);	/* convert theta to log(theta) */  /* calculate all the posterior offset probabilities */  logpX = 0.0;				/* prob X given theta */  for (i=0; i < n_samples; i++){	/* sequence */    SAMPLE *s = dataset->samples[i]; 	/* sequence struct */    int lseq = s->length;		/* length of the sequence */    int m = lseq - w + 1;		/* number of possible sites */    double *zi = s->z;			/* Pr(z_ij=1 | X_i, \theta) */     double *not_o = s->not_o;		/* Pr(v_ij = 1) */    double *lcb = s->logcumback;	/* cumulative background probability */    double log_gamma =  (m && mtype==Oops) ? -LOG((double)m) : 0;/* log (1/m) */    double log_lambda = (m && mtype==Zoops) ? LOG(gamma / m) : 0;/* exact */    double log_pXitheta;		/* log Pr(X_i | theta) */    if (lseq < w) continue;		/* skip sequence; too short */    /* calculate probabilities for each strand direction */    for (k=0; k<ndir; k++) {			/* strand */      BOOLEAN ic = (k==1);			/* doing - strand */      double *szik = s->sz[k];		/* Pr(X_i | z_ij=1, s_ijk=1, \theta) */      /* calculate P(X_i | z_ij=1, \phi) */      for (j=0; j<m; j++) {			/* site start */        /* sum_j=1^m sum_k=0^ndir-1 Pr(X_i | z_ij=1, s_ijk=1, \phi) =		log Pr(X_i | z_ij=1, s_ijk=1 \phi) */	double log_pXijtheta = LOG(not_o[j]) +           ((mtype==Oops) ? log_sigma+log_gamma : log_sigma+log_lambda);	int off = ic ? lseq-w-j : j; 	/* - strand offset from rgt. */        char *res = ic ? s->resic+off : s->res+off;	/* integer sequence */	/* calculate the probability of positions outside of the site */        log_pXijtheta += lcb[lseq] - Log_back(lcb, j, w);	/* calculate the probability of positions in the site */	for (ii=0; ii<w; ii++) log_pXijtheta += logtheta1(ii, (int) res[ii]);        /* set log szik to:           Pr(X_i | z_ij=1, s_ijk=1, \theta) \sigma (\gamma or \lambda)        */	szik[j] = log_pXijtheta;        /* set log z_ij to:           sum_k=0^ndir-1 Pr(X_i|z_ij=1,s_ijk=1,\theta)\sigma(\gamma or \lambda)        */        zi[j] = !ic ? log_pXijtheta : LOG_SUM(zi[j], log_pXijtheta);      } /* site start */      } /* strand */    /* compute Pr(X_i | \phi) */    log_pXitheta = (mtype==Oops) ? LITTLE : lcb[lseq] + log_1mgamma;    for (j=0; j < m; j++) {			/* site start */      log_pXitheta = LOG_SUM(log_pXitheta, zi[j]);    }    /* calculate logpX = \sum_{i=1}^n Pr(X_i | \phi) */    logpX += log_pXitheta;    /* sz_ijk and z_ij: normalize, delog and account for erasing      1) Pr(z_ij=1, s_ijk=1 | X_i, \phi) \approx           P(z_ij=1, s_ijk=1 | X_i, \phi) P(v_ij = 1)       2) P(z_ij=1 | X_i, \phi, V) = sum_k Pr(z_ij=1, s_ijk=1 | X_i, \phi)    */    for (k=0; k<ndir; k++) {		/* strand */      double *szik = s->sz[k];		/* Pr(X_i | z_ij=1, s_ijk=1, \phi) */      for (j=0; j<m; j++) {		/* site start */        szik[j] = exp(szik[j] - log_pXitheta) * not_o[j];        zi[j] = (k==0) ? szik[j] : zi[j] + szik[j];      } /* site */    } /* strand */    for (j=m; j<lseq; j++) {		/* tail of sequence */      zi[j] = 0;    }  } /* sequence */  return logpX/log(2.0);} /* e_step *//**********************************************************************//*	convert_theta_to_log	Convert theta to log(theta) and include Pr(X).*//**********************************************************************/extern void convert_theta_to_log(  MODEL *model,				/* the model */  DATASET *dataset			/* the dataset */){  int i, j;  THETA theta = model->theta;			/* theta */  THETA logtheta = model->logtheta;		/* log(theta) */  int w = model->w;				/* width */  int alength = dataset->alength;		/* length of alphabet */  double *back = dataset->back;			/* background freqs */  for (i=0; i<w; i++) {				/* position */    for (j=0; j<alength; j++) {			/* letter */      logtheta(i, j) = LOGL(theta(i,j));    }    logtheta(i, j) = LOGL(back[j]);		/* Pr(X) */  }} /* convert_theta_to_log */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?