subseq7.c

来自「EM算法的改进」· C语言 代码 · 共 763 行 · 第 1/2 页

C
763
字号
/**********************************************************************//*	get_max	Find the erased maxima of pY.	Returns the length of the (sorted) list of maxima.	For the oops and zoops models, one maximum is found per sequence.	For the tcm model, all non-overlapping local maxima are found.*//**********************************************************************/extern int get_max(  MTYPE mtype,		/* the model type */  DATASET *dataset,	/* the dataset */  int w,		/* width of sites */   P_PROB maxima, 	/* array of encoded site starts of local maxima */  BOOLEAN ic, 		/* use reverse complement, too */  BOOLEAN sort		/* sort the maxima */){  int n_maxima;  /* find the maxima */  if (mtype == Oops || mtype == Zoops) {    n_maxima = global_max(dataset, w, maxima, ic);  } else {    n_maxima = local_max(dataset, w, maxima, ic);  }  /* sort the local maxima of pY[1] */  if (sort) qsort((char *) maxima, n_maxima, sizeof(p_prob), pY_compare);  return n_maxima;} /* get_max *//**********************************************************************//*	global_max		Find the position in each sequence with the globally maximal	value of pY * not_o.	Returns the number of maxima found and 	the updated array of maxima positions.*//**********************************************************************/static int global_max(  DATASET *dataset,	/* the dataset */  int w,		/* length of sites */   P_PROB maxima, 	/* array of encoded site starts of local maxima */  BOOLEAN ic 		/* use reverse complement, too */){  int i, j;  SAMPLE **samples = dataset->samples;		/* datset samples */  int n_samples = dataset->n_samples;		/* number samples in dataset */  int n_maxima = 0;				/* number of maxima found */  /* find the position with maximum pY in each sequence */  for (i=0; i<n_samples; i++) {			/* sequence */    SAMPLE *s = samples[i];    int lseq = s->length;    int last_j = lseq-w;			/* start of last subseq */    int *pY = s->pY[0];				/* p(Y_j | theta_1) */    char *pYic = s->pYic;			/* site on - strand */    int *log_not_o = s->log_not_o;		/* Pr(no overlap prior site) */    int max_j = 0;				/* best offset */    int max = pY[0] + log_not_o[0]; 		/* initial maximum */    if (lseq < w) continue;			/* skip if too short */    for (j=0; j<=last_j; j++) {			/* subsequence */      if (pY[j] + log_not_o[j] > max) {		/* new maximum found */        max_j = j;	max = pY[j] + log_not_o[j]; 		/* pY * Pr(no overlap) */      } /* new maximum */    } /* subsequence */    /* record the maximum for this sequence */    maxima[n_maxima].x = i;    maxima[n_maxima].y = max_j;    maxima[n_maxima].ic = ic && pYic[max_j];	/* on - strand */    maxima[n_maxima].prob = max;    n_maxima++;  } /* sequence */  return n_maxima;} /* global_max *//**********************************************************************//*	local_max	Find the local maxima of pY * not_o 	subject to the constraint that they are separated by at 	least w positions. 	Returns the number of local maxima found and the	updated array of maxima positions.*//**********************************************************************/static int local_max(  DATASET *dataset,	/* the dataset */  int w,		/* length of sites */   P_PROB maxima,  	/* array of encoded site starts of local maxima */  BOOLEAN ic		/* use reverse complement, too */){  int i, j, k, next_j, n_maxima;  SAMPLE **samples = dataset->samples;		/* datset samples */  int n_samples = dataset->n_samples;		/* number samples in dataset */  /* Find the non-overlapping local maxima of p(Y_ij | theta_1) */  n_maxima = 0;  for (i=0; i<n_samples; i++) {			/* sequence */    SAMPLE *s = samples[i];    int lseq = s->length;			/* length of sequence */    int *pY = s->pY[0];				/* p(Y_j | theta_1) */    int *log_not_o = s->log_not_o;		/* Pr(no overlap prior site) */    int last_j = lseq-w;			/* last possible site */    int max = pY[0]+log_not_o[0]; 		/* initial maximum */    if (lseq < w) continue;			/* skip if too short */    maxima[n_maxima].x = i;			/* candidate */    maxima[n_maxima].y = 0;			/* candidate site */    maxima[n_maxima].prob = max;    next_j = MIN(w, last_j+1);			/* next possible maximum */    for (j=0; j<=last_j; j++) {			/* subsequence */      int prob = pY[j]+log_not_o[j]; 		/* pY * Pr(no overlap) */      if (j==next_j) n_maxima++;		/* candidate not exceeded */      if (j==next_j || prob>max) {		/* create/overwrite */        max = prob;				/* new max */        maxima[n_maxima].x = i;			/* overwrite the candidate */        maxima[n_maxima].y = j;			/* site */        maxima[n_maxima].prob = max;		/* max */        next_j = MIN(j+w, last_j+1);		/* next possible maximum */      } /* create/overwrite candidate */    }    n_maxima++;					/* record last maxima */  }  /* set the strand and position */  for (k=0; k<n_maxima; k++) {    int i = maxima[k].x;			/* site position */    int j = maxima[k].y;			/* site position */    SAMPLE *s = samples[i];			/* sequence record */    maxima[k].ic = ic && s->pYic[j];		/* on - strand */  } /* n_maxima */  return n_maxima;} /* local_max *//**********************************************************************//*        pY_compare        Compare the pY of two start sequences.  Return <0 0 >0        if the first pY is <, =, > the first pY.*//**********************************************************************/extern int pY_compare(  const void *v1,   const void *v2 ){  double result;  const struct p_prob * s1 = (const struct p_prob *) v1;   const struct p_prob * s2 = (const struct p_prob *) v2;   if ((result = s2->prob - s1->prob) != 0) {    return (result<0) ? -1 : +1;  } else if ((result = s2->x - s1->x) != 0) {    return result;  } else {    return s2->y - s1->y;  }}/**********************************************************************//*	init_theta_1	Convert a subsequence to a motif.	Uses globals:*//**********************************************************************/static void init_theta_1(  int w,			/* width of site */  char *res,			/* (encoded) letters of subsequence */  LOG_THETAG_TYPE(theta_1),	/* theta_1 */  int lmap[MAXALPH][MAXALPH]  	/* matrix of frequency vectors */ ){  int m;  for (m=0; m<w; m++) {    theta_1[m] = lmap[(int)res[m]];  }} /* init_theta_1 *//**********************************************************************//*	score_llr_pop     	Align the top nsites0 subsequences for each value	of nsites0 and save the alignments with the highest         product of p-values of log likelihood ratio of the columns.	Saves -log_pop as the score for the start.	Returns number of values of nsites0 tried.*/ /**********************************************************************/static int score_llr_pop(  MTYPE mtype,				/* type of model */  int w,				/* width of motif */  DATASET *dataset,			/* the dataset */  int iseq,				/* sequence number of starting point */  int ioff,				/* sequence offset of starting point */  char *eseq,				/* integer encoded subsequence */  char *name,				/* name of sequence */  int n_nsites0,			/* number of nsites0 values to try */  int n_maxima,				/* number of local maxima */  P_PROB maxima,			/* sorted local maxima indices */  double *col_scores,			/* column scores for last start point */  S_POINT s_points[]			/* array of starting points */){  int i, j, k, i_nsites0;  int next_seq;				/* index of next subsequence to align */  int n_starts = 0;			/* number of nsites0 tried */  int nsites0;				/* starting nsites rounded down */  int alength = dataset->alength;	/* lenght of alphabet */  double *back = dataset->back;		/* background frequencies */  SAMPLE **samples = dataset->samples;	/* the sequences */  double counts[MAXSITE][MAXALPH];	/* array to hold observed counts */  double wN;				/* weighted number of sites */  double log_pop;			/* log product of p-values */  double min_ic = dataset->min_ic;	/* min. per-column IC */  /* initialize letter counts to 0 */  wN = 0;				/* weighted number of sites */  for (i=0; i<w; i++) for (j=0; j<alength; j++) { counts[i][j] = 0; }  /* calculate the product of p-values of information content     of the top nsite0 probability positions   */  for (i_nsites0=0, next_seq=0; i_nsites0 < n_nsites0; i_nsites0++) {    /* don't score this start if not enough maxima found */    nsites0 = (int) s_points[i_nsites0].nsites0;	/* round down */    if (n_maxima < nsites0) { continue; }    n_starts++;					/* number of nsites0 tried */    /* Align the next highest probability sites 	1) count the number of occurrences of each letter in each column 	   of the motif and,         2) compute the log likelihood of the sites under the background model    */    for (k=next_seq; k<nsites0; k++) {		/* site */      int jj;      BOOLEAN ic = maxima[k].ic;		/* on - strand */      int y = maxima[k].y;			/* position of site */      SAMPLE *s = samples[maxima[k].x];		/* sequence */      int off = ic ? s->length-w-y : y;		/* - strand offset from rgt. */      char *res = ic ? s->resic+off : s->res+off;	/* integer sequence */      double sw = s->sw;			/* sequence weight */      double esw = sw * s->not_o[off];		/* erased weight */      /* residue counts */      for (j=0; j<w; j++) {			/* position in sequence */        int c = res[j];        if (c < alength) {			/* normal letter */          counts[j][c] += esw;	} else {				/* 'X' : esw * back[letter] */          for (jj=0; jj<alength; jj++) counts[j][jj] += esw * back[jj];	}      } /* position */      wN += esw;				/* total sequence wgt */    } /* site */    next_seq = k;				/* next site to align */    /*       For DNA palindromes, combine the counts in symmetrically opposing columns    */    if (dataset->pal) palindrome(counts, counts, w, alength);    /*       convert COUNTS to FREQUENCIES and calculate log likelihood ratio    */    log_pop = 0;				/* product of p-values */    for (i=0; i<w; i++) {			/* position in site */      double llr = 0;				/* log-like-ratio of column */      double log_pv;				/* log of column p-value */      double ic;      /* compute log likelihood for position i */      for (j=0; j<alength; j++) {		/* letter */	double f = wN ? counts[i][j] / wN : 1; 	/* observed letter frequency */	double p = back[j];			/* backgrnd letter frequency */        double log_f = LOGL(f);        double log_p = LOGL(p);	llr += (f&&p) ? f*(log_f-log_p) : 0;	/* column log likelihood */      } /* letter */      RND(llr/0.6934, RNDDIG, ic);		/* info content in bits */      llr *= wN;				/* convert entropy to ll */       RND(llr, RNDDIG, llr);			/* round to RNDDIG places */      log_pv = get_llr_pv(llr, wN, 1, LLR_RANGE, 1.0, alength, back);       if (ic < min_ic) log_pv = 0; 		/* ignore low ic columns */      log_pop += col_scores[i] = log_pv;	    } /* position in site */    RND(log_pop, RNDDIG, log_pop);    /* print the start sequence and other stuff */    if (TRACE) {      if (eseq) {        char seq[MAXSITE+1];        r2seq(seq, eseq, w);	fprintf(stdout, 	  "( %3d %3d ) ( %*.*s ) %.*s logpop %8.3f nsites0 %6d\n",	  iseq+1, ioff+1, MSN, MSN, name, w, seq, -log_pop, nsites0);      } else {	fprintf(stdout, 	  "l_off %3d w %d logpop %8.3f nsites0 %6d\n",	  iseq, w, -log_pop, nsites0);      }    }    /* save the best start */    if (-log_pop > s_points[i_nsites0].score) {      /* Save the starting point and offset so we can re-calculate	 eseq later. */      s_points[i_nsites0].iseq = iseq;      s_points[i_nsites0].ioff = ioff;      s_points[i_nsites0].e_cons0 = eseq;      s_points[i_nsites0].wgt_nsites = wN;      s_points[i_nsites0].score = -log_pop;    }  } /* nsites0 */  return n_starts;} /* score_llr_pop *//**********************************************************************//*	align_top_subsequences     	Align the top nsites0 subsequences for each value	of nsites0 and save the alignments with the best values	according to the objective function.	Returns number of values of nsites0 tried.*/ /**********************************************************************/extern int align_top_subsequences(  MTYPE mtype,				/* type of model */  int w,				/* width of motif */  DATASET *dataset,			/* the dataset */  int iseq,				/* sequence number of starting point */  int ioff,				/* sequence offset of starting point */  char *eseq,				/* integer encoded subsequence */  char *name,				/* name of sequence */  int n_nsites0,			/* number of nsites0 values to try */  int n_maxima,				/* number of local maxima */  P_PROB maxima,			/* sorted local maxima indices */  double *col_scores,			/* column scores for last start point */  S_POINT s_points[]			/* array of starting points */){  return(score_llr_pop(mtype, w, dataset, iseq, ioff, eseq, name,       n_nsites0, n_maxima, maxima, col_scores, s_points));} /* align_top_subsequences */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?