subseq7.c

来自「EM算法的改进」· C语言 代码 · 共 763 行 · 第 1/2 页

C
763
字号
/* * $Id: subseq7.c 1339 2006-09-21 19:46:28Z tbailey $ *  * $Log$ * Revision 1.3  2006/03/08 20:50:11  nadya * merge chamges from v3_5_2 branch * * Revision 1.2.2.1  2006/01/24 20:44:08  nadya * update copyright * * Revision 1.2  2006/01/09 08:17:34  tbailey * *** empty log message *** * * Revision 1.1.1.1  2005/07/29 17:27:24  nadya * Importing from meme-3.0.14, and adding configure/make * *//************************************************************************								       **	MEME++							       **	Copyright 1994-2006, the Regents of the University of California**	Author: Timothy L. Bailey				       **								       ************************************************************************//* subseq7.c *//* 5-23-00 tlb; change objective function to be log likelihood ratio for  all models *//* 7-12-99 tlb; move not_o out of pY calculation and move to local/global_max *//* 7-12-99 tlb; multiply sw * not_o in m_step of align_top_subsequences so  that erased starts will have low likelihood *//* 7-01-99 tlb; multiply background counts by motif width for Tcm model *//* 6-29-99 tlb; add reverse complement DNA strand support */#include "meme.h"/* minimum probability of NOT overlapping a previous site for starting points */#define MIN_NOT_O .1/* use logs and integers */#define LOG_THETA_TYPE(V)		int *V[2][MAXSITE]#define LOG_THETAG_TYPE(V)		int *V[MAXSITE]/* print a subsequence from its integer encoded form */#define Print_res(out, res, w) 						\{									\  int i;								\  for (i=0; i<(w); i++) { fprintf(out, "%c", unhash((res)[i])); }	\  fprintf(out, "\n");							\}/* local functions */static void get_pY(  DATASET *dataset,			/* the dataset */  LOG_THETAG_TYPE(theta_1),		/* integer log theta_1 */  int w,				/* width of motif */  int pYindex				/* which pY array to use */);static void next_pY(  DATASET *dataset,			/* the dataset */  LOG_THETAG_TYPE(theta_1),		/* integer log theta_1 */  int w,				/* width of motif */  int *theta_0,				/* first column of previous theta_1 */  int pYindex				/* which pY array to use */);static void init_theta_1(  int w,			/* width of site */  char *res,			/* (encoded) letters of subsequence */  LOG_THETAG_TYPE(theta_1),	/* theta_1 */  int lmap[MAXALPH][MAXALPH] 	/* matrix of frequency vectors */ );static int global_max(  DATASET *dataset,	/* the dataset */  int w,		/* length of sites */   P_PROB maxima, 	/* array of encoded site starts of local maxima */  BOOLEAN ic 		/* use reverse complement, too */);static int local_max(  DATASET *dataset,	/* the dataset */  int w,		/* length of sites */   P_PROB maxima,  	/* array of encoded site starts of local maxima */  BOOLEAN ic 		/* use reverse complement, too */);/**********************************************************************//*	subseq7		Try subsequences as starting points and choose the	one which yields the highest score.	Score is computed by:		1) computing p(Y | theta_1)		2) finding the (sorted) postions of maximum pY		3) aligning the top NSITES0 scores for each value of NSITES0		4) computing the expected likelihood for each value of NSITES0	The computing of p(Y | theta_1) for successive	subsequences (values of theta_1) is optimized by	using p_i to calculate p_{i+1}.	Returns number of starting points updated in s_points array.	Updates s_points, array of starting points, one	for each value of NSITES0 tried-- finds one \theta for each	value of nsites0 specified in the input.	Uses globals:*//**********************************************************************/extern int subseq7(  MTYPE mtype,			/* type of model */  BOOLEAN ic,			/* use reverse complement strand of DNA, too */  THETA map,			/* freq x letter map */  DATASET *dataset,		/* the dataset */  int w,			/* width of motif */  int n_nsites0,		/* number of nsites0 values to try */  S_POINT s_points[]  		/* array of starting points: 1 per nsites0 */){  LOG_THETA_TYPE(ltheta);		/* integer encoded log theta */  int i, j, k, iseq, ioff;  int alength = dataset->alength;	/* length of alphabet */  int n_samples = dataset->n_samples;	/* number of samples in dataset */  SAMPLE **samples = dataset->samples;	/* samples in dataset */  int n_starts = 0;			/* number of sampled start subseq */  int n_maxima = ps(dataset, w);	/* upper bound on # maxima */  /* the local maxima positions */  P_PROB maxima = (P_PROB) mymalloc(n_maxima * sizeof(p_prob));  int lmap[MAXALPH][MAXALPH];	/* consensus letter vs. log frequency matrix */  double col_scores[MAXSITE];		/* not used */#ifdef PARALLEL  int start_seq, start_off=0, end_seq, end_off=0;#endif  /*     Set up the matrix of frequency vectors for each letter in the alphabet.    Column for X and matches to X are set to 1.0/alength so that    such matches are neither favored nor disfavored, and where they match    is irrelevant.  */  for (i=0; i<alength+1; i++) {    for (j=0; j<alength; j++) {      lmap[i][j] = (i<alength) ? INT_LOG(map[j][i]) : INT_LOG(1.0/alength);    }    lmap[i][j] = INT_LOG(1.0/alength);			/* X */  }  if (TRACE) { printf("w= %d\n", w); }  /* get the probability that a site starting at position x_ij would     NOT overlap a previously found motif  */  get_not_o(dataset, w, TRUE);  /* score all the sampled positions saving the best position for     each value of NSITES0 */#ifdef PARALLEL  /* Retrieve the previously-calculated starting and ending points. */  get_start_n_end(&start_seq, &start_off, &end_seq, &end_off);  /* Divide the various samples among processors. */  for (iseq = start_seq; iseq <= end_seq; iseq++) { /* sequence */#else /* not PARALLEL */  for (iseq = 0; iseq < n_samples; iseq++) {	/* sequence */#endif /* PARALLEL */    SAMPLE *s = samples[iseq];    int lseq = s->length;    char *res = s->res;				/* left to right */    char *name = s->sample_name;    double *not_o = s->not_o;    int max_off, init_off;    if (lseq < w) continue;			/* shorter than motif */#ifdef PARALLEL    if (mpMyID() == 0)#endif    if ((!NO_STATUS) && ((iseq % 5) == 0)) {      fprintf(stderr, "starts: w=%d, seq=%d, l=%d          \r", w, iseq, lseq);     }    /* Set the appropriate starting and ending points. */#ifdef PARALLEL    if (iseq == start_seq)      init_off = start_off;    else#endif      init_off = 0;#ifdef PARALLEL    if (iseq == end_seq)      max_off = MIN(end_off, lseq - w);    else#endif      max_off = lseq - w;    /*      Loop over all subsequences in the current sequence testing them      each as "starting points" (inital values) for theta    */    for (ioff = init_off; ioff <= max_off; ioff++) {/* subsequence */       /* warning: always do the next step; don't ever         "continue" or the value of pY will not be correct since         it is computed based the previous value       */      /* convert subsequence in dataset to starting point for EM */      init_theta_1(w, res+ioff, &ltheta[1][0], lmap);      if (ioff == init_off) { 			/* new sequence */        /* Compute p(Y_ij | theta_1^0) */        if (!ic) {          get_pY(dataset, &ltheta[1][0], w, 0);        } else {          get_pY(dataset, &ltheta[1][0], w, 1);          get_pY(dataset, &ltheta[1][0], w, 2);        }      } else {					/* same sequence */                /* get theta[0][0]^{k-1} */        init_theta_1(1, res+ioff-1, &ltheta[0][0], lmap);        /* compute p(Y_ij | theta_1^k) */        if (!ic) {          next_pY(dataset, &ltheta[1][0], w, &ltheta[0][0][0], 0);        } else {          next_pY(dataset, &ltheta[1][0], w, &ltheta[0][0][0], 1);          next_pY(dataset, &ltheta[1][0], w, &ltheta[0][0][0], 2);        }      } /* same sequence */      /* skip if there is a high probability that this subsequence         is part of a site which has already been found       */      if (not_o[ioff] < MIN_NOT_O) continue;      /*fprintf(stderr, "subseq: %d %d\r", iseq+1, ioff+1);*/      /* put highest pY into first scratch array if using both DNA strands */      for (i=0; ic && i<n_samples; i++) {	/* sequence */	SAMPLE *s = samples[i];	int lseq = s->length;	int *pY = s->pY[0];			/* p(Y_j | theta_1) both */	int *pY1 = s->pY[1];			/* p(Y_j | theta_1) + strand */	int *pY2 = s->pY[2];			/* p(Y_j | theta_1) - strand */	char *pYic = s->pYic;			/* site on - strand */	int last_j = lseq-w;			/* last possible site */        if (lseq < w) continue;		/* skip if sequence too short */	for (j=0, k=last_j; j<=last_j; j++, k--) {	/* site start */          if (pY2[k] > pY1[j]) {		/* site on - strand */            pYic[j] = '\1'; pY[j] = pY2[k];          } else {				/* site on + strand */            pYic[j] = '\0'; pY[j] = pY1[j];           }	} /* site start */      } /* sequence */      /* get a sorted list of the maxima of pY */      n_maxima = get_max(mtype, dataset, w, maxima, ic, TRUE);      /* align the top nsites0 subsequences for each value         of nsites0 and save the alignments with the highest likelihood       */      n_starts += align_top_subsequences(mtype, w, dataset, iseq, ioff,         res+ioff, name, n_nsites0, n_maxima, maxima, col_scores, s_points);    } /* subsequence */  } /* sequence */#ifdef PARALLEL    reduce_across_s_points(s_points, samples, n_nsites0, n_starts);#endif /* PARALLEL */  /* remove starting points that were not initialized from s_points */  for (i=0; i<n_nsites0; i++) {    if (s_points[i].score == LITTLE) {      for (j=i; j<n_nsites0-1; j++) s_points[j] = s_points[j+1];      n_nsites0--;      i--;      /*fprintf(stderr,"\nremoving starting points: i=%d\n", i);*/    }  }  if (TRACE){ printf("Tested %d possible starts...\n", n_starts); }  myfree(maxima);  return n_nsites0;} /* subseq7 *//**********************************************************************//*	get_pY	Compute log Pr(Y_ij | theta_1) *//**********************************************************************/static void get_pY(  DATASET *dataset,			/* the dataset */  LOG_THETAG_TYPE(theta_1),		/* integer log theta_1 */  int w,				/* width of motif */  int pYindex				/* which pY array to use */){  int i, j, k;  int n_samples = dataset->n_samples;  SAMPLE **samples = dataset->samples;    for (i=0; i<n_samples; i++) { 	/* sequence */    SAMPLE *s = samples[i];    int lseq = s->length;    char *res = pYindex<2 ? s->res : s->resic;	/* integer sequence */    int *pY = s->pY[pYindex];		/* p(Y_j | theta_1) */    if (lseq < w) continue;		/* skip if sequence too short */    for (j=0; j<=lseq-w; j++) {		/* site start */      char *r = res + j;      int p = 0;      for (k=0; k<w; k++) {		/* position in site */	p += theta_1[k][(int) (*r++)];      }      pY[j] = p;    }    for (j=lseq-w+1; j<lseq; j++) pY[j] = 0;	/* impossible positions */  }}/**********************************************************************//*	next_pY	Compute the value of p(Y_ij | theta_1^{k+1})	from p(Y_ij | theta_1^{k} and the probability	of first letter of Y_ij given theta_1^k,	p(Y_ij^0 | theta_1^k).*//**********************************************************************/static void next_pY(  DATASET *dataset,			/* the dataset */  LOG_THETAG_TYPE(theta_1),		/* integer log theta_1 */  int w,				/* width of motif */  int *theta_0,				/* first column of previous theta_1 */  int pYindex				/* which pY array to use */){  int i, k;  int *theta_last = theta_1[w-1];	/* last column of theta_1 */  int n_samples = dataset->n_samples;  SAMPLE **samples = dataset->samples;    for (i=0; i < n_samples; i++) { 	/* sequence */    SAMPLE *s = samples[i];		/* sequence */    int lseq = s->length;		/* length of sequence */    char *res = pYindex<2 ? s->res : s->resic;	/* integer sequence */    int *pY = s->pY[pYindex];		/* p(Y_j | theta_1) */    char *r = res+lseq-1;		/* last position in sequence */    char *r0 = res+lseq-w-1;	/* prior to start of last subsequence */    int j, p;    if (lseq < w) continue;		/* skip if sequence too short */    /* calculate p(Y_ij | theta_1) */    for (j=lseq-w; j>0; j--) {      pY[j] = pY[j-1] + theta_last[(int)(*r--)] - theta_0[(int)(*r0--)];    }    /* calculate p(Y_i0 | theta_1) */    p = 0;    r = res;    for (k=0; k<w; k++) {     		/* position in site */      p += theta_1[k][(int)(*r++)];    }    pY[0] = p;  }}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?