discretize.c

来自「EM算法的改进」· C语言 代码 · 共 468 行

C
468
字号
/* * $Id: discretize.c 1339 2006-09-21 19:46:28Z tbailey $ *  * $Log$ * Revision 1.3  2006/03/08 20:50:11  nadya * merge chamges from v3_5_2 branch * * Revision 1.2.4.1  2006/01/24 20:44:08  nadya * update copyright * * Revision 1.2  2005/10/25 19:06:39  nadya * rm old macro for Header, all info is taken care of by Id and Log. * * Revision 1.1.1.1  2005/07/29 00:18:18  nadya * Importing from meme-3.0.14, and adding configure/make * *//*#define DEBUG*//************************************************************************								       **	MEME++							       **	Copyright 2000-2006, The Regents of the University of California    **	Author: Timothy L. Bailey				       **								       ************************************************************************/#include "meme.h"#include "dpalign.h"/* rounding stuff */#define RNDEPS 1e-12static int ma_adjust(  P_PROB sites,                                 /* the sites */  int nsites,                                   /* the number of sites */  int w,                                        /* width of sites */  int flank,                                    /* add flank cols on left+rgt */  int min_w,		 			/* minimum width allowed */  DATASET *dataset,  				/* the dataset */  int *off 					/* best offset */);/***********************************************************************//*	get_best_nsites	Find the value of nsites that minimizes the significance of the	log likelihood ratio.	Returns nsites.*//***********************************************************************/static int get_best_nsites (  MODEL *model,					/* the model */  DATASET *dataset,				/* the dataset */  int min_nsites,				/* minimum sites */  int max_nsites,				/* minimum sites */  int w, 	 				/* width of motif */  int n_maxima,					/* number of maxima */  P_PROB maxima,				/* the maxima positions */  double *col_scores,				/* column scores */  double *best_log_ev				/* best E-value */){  int i;  MTYPE mtype = model->mtype;			/* type of model */  int n_nsites; 				/* number of different nsites */  int nsites;					/* optimal number of sites */  S_POINT *s_points = NULL;			/* for use with align_top...*/  /* limit maximum nsites */  max_nsites = MIN(n_maxima, max_nsites);	  n_nsites = max_nsites-min_nsites+1;	  /* create array for use by align_top_subsequences */  Resize(s_points, n_nsites, S_POINT);  /* initialize the s_points */  for (i=0, nsites=min_nsites; i<n_nsites; i++, nsites++) {    s_points[i].nsites0 = nsites;		/* number of sites */    s_points[i].score = LITTLE;			/* no score yet */  }  /*      align the top nsites sorted subsequences and compute the objective      function on each alignment   */  (void) align_top_subsequences(mtype, w, dataset, 0, 0, 0, 0, n_nsites,     n_maxima, maxima, col_scores, s_points);  /*      determine the significance of the score for each number of      sites and chose the number of sites to minimize it   */  *best_log_ev = BIG;  nsites = 0;  for (i=0; i<n_nsites; i++) {			/* nsites */    double score = s_points[i].score;		/* -log_pop */    int N = s_points[i].nsites0;    double wN = s_points[i].wgt_nsites;     double log_ev = get_log_sig(score, model->mtype, w, wN, N, model->invcomp,      model->pal, dataset);    RND(log_ev, RNDDIG, log_ev);    if (TRACE) printf("w %d N %d wN %f log_ev %f \n", w, N, wN, log_ev);    /* save if best so far */    if (RNDEPS < *best_log_ev - log_ev) {	/* best E-value */      nsites = N;				/* number of sites */      *best_log_ev = log_ev;    }  } /* nsites */  myfree(s_points);  return nsites;} /* get_best_nsites *//***********************************************************************//*	set_z	Set the z to 1/0 using list of sites (maxima).*//***********************************************************************/extern void set_z (  MODEL *model,					/* the model */  DATASET *dataset 				/* the dataset */){  int i, j;  int nsites = model->nsites_dis;		/* new nsites */  P_PROB maxima = model->maxima;		/* the maxima positions */  int n_samples = dataset->n_samples;		/* number of sequences */  SAMPLE **samples = dataset->samples;		/* sequences */  /* set all z and sz to 0 */  for (i=0; i<n_samples; i++) {			/* sequence */    SAMPLE *s = samples[i];			/* sample i */    int lseq = s->length;			/* length of sequence */    double *zi= s->z;				/* z */    double **szi= s->sz;			/* sz */    for (j=0; j<lseq; j++) {      zi[j] = szi[0][j] = szi[1][j] = 0;    }  }  /* set z and sz to 1 for selected sites */  for (i=0; i<nsites; i++) {    SAMPLE *s = samples[maxima[i].x];		/* sample */    int y = maxima[i].y;			/* position of site */    double *zi= s->z;				/* z */    double **szi= s->sz;			/* sz */    zi[y] = 1.0;    if (maxima[i].ic) szi[1][y] = 1.0; else szi[0][y] = 1.0;  }} /* set_z *//***********************************************************************//*	set_pY	Initialize pY from z with given offset.*//***********************************************************************/static void set_pY(  int w, 				/* motif width */  BOOLEAN invcomp, 			/* use reverse complement strand, too */  BOOLEAN pal,				/* force palindrome */  DATASET *dataset,			/* the dataset */  int off				/* offset to shift motif */){  int i, j;  int n_samples = dataset->n_samples;		/* number of sequences */  SAMPLE **samples = dataset->samples;		/* sequences */  /*     put integerized, weighted log z into pY array  */  for (i=0; i<n_samples; i++) {			/* sequence */    SAMPLE *s = samples[i];			/* sequence i */    int lseq = s->length;			/* sequence length */    int last_j = lseq-w;			/* last start */    double *z = s->z;				/* z */    double *sz1 = s->sz[0];			/* + strand */    double *sz2 = s->sz[1];			/* - strand */    double sw = s->sw;				/* weight of sequence */    int *pY = s->pY[0];				/* p(Y_j | theta_1) both */    char *pYic = s->pYic;			/* site on - strand */    if (lseq < w) continue;			/* sequence too short */    /* initialize pY and pYic */    for (j=0; j<=last_j; j++) {      pY[j] = INT_LOG(0.0);			/* z == 0 */      pYic[j] = '\0';				/* site on + strand */    }    for (j=0; j<lseq; j++) {			/* site start */      int jj = j + off;				/* new site start */      double zj, sz1j, sz2j;			/* rounded z's */      if (jj<0 || jj>last_j) continue;      /* no z available? */      if (j > last_j) {				/* no z available */        pY[jj] = 0;				/* no site */        pYic[jj] = '\0';			/* strand doesn't matter */        continue;      }       /* not using inverse strand, too? */      zj = z[j];      if (!invcomp) {				/* use z */        pY[jj] = INT_LOG(sw * zj);        pYic[jj] = '\0';			/* site on + strand */        continue;      }       /* using inverse complement strand, too */      sz1j = sz1[j];      sz2j = sz2[j];      if (pal) {				/* use z (sum of sz1 and sz2)*/        pY[jj] = INT_LOG(sw * zj);        pYic[jj] = (sz2j > sz1j) ? '\1' : '\0';	/* choose strand */      } else if (sz2[j] > sz1[j]) {		/* use sz2 */        pY[jj] = INT_LOG(sw * sz2j);        pYic[jj] = '\1'; 			/* site on - strand */      } else {					/* use sz1 */	        pY[jj] = INT_LOG(sw * sz1j);        pYic[jj] = '\0'; 			/* site on + strand */      }    } /* site start */  } /* sequence */} /* set_pY *//***********************************************************************//*	discretize		Search over width and offset of motif to minimize E-value of	log likelihood ratio.  		1) get best nsites using E-value		2) calculate p-value of each column of motif		3) shorten using p-value		4) get best nsites using E-value	Sets z to 1.0 for sites, 0 for non-sites in dataset.	Sets w, nsites_dis, maxima and sig in model.	Returns the optimum number of sites.*//***********************************************************************/extern double discretize(  MODEL *model,				/* the model */  DATASET *dataset			/* the dataset */){  int i;  int n_samples = dataset->n_samples;		/* number of sequences */  BOOLEAN invcomp = model->invcomp;     	/* reverse complement strand */  BOOLEAN pal = model->pal;     		/* force DNA palindromes */  MTYPE mtype = model->mtype;			/* type of model */  int w, best_w, min_w, max_w, ini_w, ma_w;	/* motif width */  int min_nsites = model->min_nsites;		/* minimum nsites */  int max_nsites = model->max_nsites;		/* maximum nsites */  int n_maxima;		 			/* number of possible sites */  double log_pop;				/* log product of p-values */  double log_pv, best_log_pv;			/* log p-value of motif */  double log_ev, best_log_ev;			/* log E-value of score */  int ma_off=0;					/* multiple alignment offset */  int best_off;					/* motif offset */  int nsites=0, best_nsites=0;			/* number of sites */  double col_scores[MAXSITE];			/* column scores */  /*     set minimum and maximum widths  */  ini_w = best_w = model->w;			/* initial motif width */  min_w = model->min_w;				/* minimum width */  max_w = ini_w;				/* maximum width */  w = 0;  /* create space for the maxima */  n_maxima = (mtype==Tcm) ? ps(dataset, min_w) : n_samples;  Resize(model->maxima, n_maxima, p_prob);  /*     initialize pY from z   */  set_pY(model->w, invcomp, pal, dataset, 0);  /*     get the possible sites using maximum width  */  n_maxima = get_max(mtype, dataset, max_w, model->maxima, invcomp, FALSE);  if (n_maxima < min_nsites) return nsites;  /*     sort them by z   */  for (i=0; i<n_maxima; i++) {    int x = model->maxima[i].x;    SAMPLE *s = dataset->samples[x];    int y = model->maxima[i].y;    RND(s->z[y], 11, model->maxima[i].prob);  }  qsort((char *) model->maxima, n_maxima, sizeof(p_prob), pY_compare);#ifdef DEBUG#ifdef PARALLEL#undef printf#endif  printf("nmax = %d start %d %d %3.0f %3.0f %s\n", n_maxima,     model->iseq, model->ioff, model->pw, model->psites, model->cons0);  print_theta(1000*model->iseq+model->ioff, 1, model->nsites_dis,    model->theta, model->w, 0, "", dataset, stdout);  for (i=0; i<n_maxima; i++) {    printf("max %d %15.12f x %d y %d\n", i, model->maxima[i].prob,      model->maxima[i].x, model->maxima[i].y);  }  fflush(stdout);#ifdef PARALLEL#define printf if (mpMyID() == 0) printf#endif#endif  /*     get the best number of sites using the full motif width  */  best_nsites = (min_nsites==max_nsites) ? min_nsites :    get_best_nsites(model, dataset, min_nsites, max_nsites,     max_w, n_maxima, model->maxima, col_scores, &log_ev);  if (best_nsites < min_nsites) return nsites;  /* trim the alignment to include the minimum possible number of gaps */  ma_w = ini_w;  if (dataset->ma_adj) {    int flank = w/2;				/* amt. of flank for mult. a. */    ma_w = ma_adjust(model->maxima, best_nsites, max_w, flank, min_w, dataset,       &ma_off);    max_w = ma_w;				/* new maximum width */  } /* ma_adj */  /*    update the maxima positions by shifting them  */  for (i=0; i<best_nsites; i++) {    BOOLEAN ic = model->maxima[i].ic;			/* on - strand */    model->maxima[i].y += (ic ? ini_w-ma_w-ma_off : ma_off);  }  /*     get the p-values of the columns given the best number of sites    and the gap-trimmed width  */  (void) get_best_nsites(model, dataset, best_nsites, best_nsites,    max_w, n_maxima, model->maxima, col_scores, &log_ev);  /*     shorten the motif: find subsequence of columns with best p-value  */  best_log_pv = best_log_ev = BIG;  best_off = 0;  for (w=min_w; w<=max_w; w++) {		/* width */    int l_off;					/* left offset */    for (l_off=0; l_off<=max_w-w; l_off++) {	/* left edge */      /* get the product of column p-values */      for (i=log_pop=0; i<w; i++) log_pop += col_scores[i+l_off];      /* get the p-value of the pop */      log_pv =        get_log_sig(-log_pop, mtype, w, best_nsites, 0, invcomp, pal, dataset);      if (TRACE)         printf(        "ini_w %d ma_w %d w %d ma_off %d off %d log_pv %f init cons %*.*s\n",         ini_w, ma_w, w, ma_off, l_off, log_pv, w, w, model->cons0+l_off+ma_off);      /* save if best so far: better log_pv */      if (RNDEPS < best_log_pv - log_pv) {        if (TRACE) printf("better: w %d best log_pv %g\n", w, log_pv);	best_w = w;	best_off = l_off;	best_log_pv = log_pv;      }    } /* l_off */  } /* w  */  /*    update the maxima positions by shifting them  */  for (i=0; i<best_nsites; i++) {    BOOLEAN ic = model->maxima[i].ic;			/* on - strand */    model->maxima[i].y += (ic ? ma_w-best_w-best_off : best_off);  }    /*     get the best number of sites for the shortened motif and the final E-value   */  best_nsites =    get_best_nsites(model, dataset, min_nsites, best_nsites,     best_w, n_maxima, model->maxima, col_scores, &best_log_ev);  /*     set the best motif info in the model and    discretize the sites in z using the best width, offset and nsites   */  model->w = best_w;				/* best width */  model->nsites_dis = best_nsites;		/* after discretization */  model->logpv = best_log_pv;			/* p-value */  model->logev = best_log_ev;			/* E-value */  set_z(model, dataset);  if (TRACE)    printf(    "ini_w %d ma_w %d w %d ma_off %d off %d nsites %d pv %9.2e EV %9.2e cons %*.*s %s\n",    ini_w, ma_w, best_w, ma_off, best_off, best_nsites, exp(best_log_pv),     exp(best_log_ev), best_w, best_w, (model->cons0)+best_off, model->cons0);  return best_nsites; } /* discretize *//**********************************************************************//*	ma_adj	Shorten a motif to the longest g-alignment of width at least	min_w.  A g-alignment is an alignment with no more than g	gapped sequences per column.  Values of g in [0..] are tried	until an aligment of width min_w or greater is found.	Returns best width and offset.*//**********************************************************************/static int ma_adjust(  P_PROB sites,                                 /* the sites */  int nsites,                                   /* the number of sites */   int w,                                        /* width of sites */   int flank,                                    /* add flank cols on left+rgt */  int min_w,		 			/* minimum width allowed */  DATASET *dataset,  				/* the dataset */  int *off					/* best offset */){  char **ma;					/* multiple aligment */  int left = MIN(flank, sites[0].y);		/* left edge after algnmnt. */  int right = left+w-1;				/* right edge after algnmnt. */  /* get the multiple alignment */  ma = dp_multi_align(sites, nsites, w, flank, dataset);  /* get longest g-alignment of width at least min_w */  (void) g_align(ma, nsites, strlen(ma[0]), left, right, min_w, off, &w);  /* free space */  free_2array(ma, nsites);  return w;					/* return the width */} /* ma_adjust */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?