background.c

来自「EM算法的改进」· C语言 代码 · 共 569 行 · 第 1/2 页

C
569
字号
  int n,					/* widest tuple */  char *tuple,					/* call with "" */  int tuplew,					/* tuple width; call with 0 */  char *alpha 					/* alphabet */){  int i, j;  char *t = NULL, *rc = NULL; 			/* tuple and rc-tuple */  int ti, rci;					/* index of tuple */  if (n==0) return;				/* everything is OK */  Resize(t, tuplew+2, char);  Resize(rc, tuplew+2, char);    for(i=0; alpha[i+add_x]; i++) {		/* ignore last letter (X) */    /* append letter to tuple */    strcpy(t, tuple); t[tuplew] = alpha[i]; t[tuplew+1] = '\0';    /* make reverse complement of tuple */    for (j=0; j<=tuplew; j++) rc[j] = comp_dna(t[tuplew-j]);    rc[tuplew+1] = '\0';    /* get tuple and rc indices */    ti = s2i(t);				/* index of tuple */    rci = s2i(rc);				/* index of reverse complement*/    /* average their probabilites */    a_p[ti] = a_p[rci] = (a_p[ti] + a_p[rci]) / 2.0;    /* recur */    average_rc(add_x, a_p, n-1, t, tuplew+1, alpha);  } /* letter */  myfree(t);  myfree(rc);  return;} /* average_rc *//***************************************************************************//*  	print_prob  	Recursively print the probabilities for all tuples	up to width n.  *//***************************************************************************/static void print_prob(  double *a_p,					/* tuple->prob assoc. array */  int n,					/* widest tuple */  char *tuple,					/* call with "" */  int tuplew,					/* tuple width; call with 0 */  char *alpha 					/* alphabet */) {  int i;  char *t = NULL; 				/* tuple */  if (n==0) return;				/* everything is OK */  Resize(t, tuplew+2, char);    for(i=0; alpha[i]; i++) {			/* include last letter (X) */    /* append letter to tuple */    strcpy(t, tuple); t[tuplew] = alpha[i]; t[tuplew+1] = '\0';    /* print tuple and probability */    printf("%10s %8.4f\n", t, a_p[s2i(t)]);    /* recur */    print_prob(a_p, n-1, t, tuplew+1, alpha);  } /* letter */  myfree(t);} /* print_prob *//***************************************************************************//*	add_x_tuples	Compute the probability of all X-containing tuples by	adding the probability of the current non-X-containing tuple	to each X-containing tuple it matches.  When this has	been done for all non-X-containing tuples, the X-containing	tuple probabilities will be correct.*//***************************************************************************/static void add_x_tuples(  double *a_p,					/* tuple->prob assoc. array */  char *ntuple,					/* non-X-containing tuple */  int ntuplew, 					/* ntuple width */  int ntuplei, 					/* ntuple index */  char *xtuple,					/* X-containing tuple */  int xtuplew, 					/* xtuple width */  int xcnt					/* count of X's in xtuple */){  if (xtuplew == ntuplew) {			/* add prob. of non-X-tuple */    int xtuplei;    xtuple[xtuplew] = '\0';			/* end tuple */    xtuplei = s2i(xtuple);    if (a_p[xtuplei] == -1) a_p[xtuplei] = 0;	/* first time for x-tuple */    a_p[xtuplei] += a_p[ntuplei];  } else {					/* continue building x-tuple */    xtuple[xtuplew] = 'X';			/* add X to xtuple */    add_x_tuples(a_p, ntuple, ntuplew, ntuplei, xtuple, xtuplew+1, xcnt+1);    if (xcnt || xtuplew < ntuplew-1) {		/* OK to add non-X to xtuple */      xtuple[xtuplew] = ntuple[xtuplew];	/* add X to xtuple */      add_x_tuples(a_p, ntuple, ntuplew, ntuplei, xtuple, xtuplew+1, xcnt);    }  }} /* add_x_tuples *//***************************************************************************//*	get_cond_prob	Compute 		Pr(a | w)	where "w" is a word and "a" is a character and store in an	array with index "s2i(wa)".*//***************************************************************************/static double *get_cond_prob (  double *a_p, 					/* tuple->prob array */  int n						/* length of array */){  int wa, w;  double *a_cp=NULL;  Resize(a_cp, n, double);			/* create array */    for (wa=0; wa<n; wa++) {			/* tuple "wa" */    if (wa < index_alen) {			/* w = "" */      a_cp[wa] = a_p[wa];			/* Pr(a) */    } else {      w = (wa/index_alen)-1;			/* get prefix w */      a_cp[wa] = MIN(a_p[wa]/a_p[w], 1.0);	/* Pr(a | w) = Pr(wa)/Pr(w) */    }  }  return(a_cp);} /* get_cond_prob *//***************************************************************************//* 	log_cum_back	Get log of the probability of each position in a string given a 	Markov background model.	Returns the cumulative background as an array:		logcumback_i = 0, i=0		             = log Pr(s_{0,i-1} | H_0), otherwise.	and the total (log) cumulative background probability.	The probability of any length-w substring starting at position i in the	string (in the context of the string) can then be computed as:		last_p = i+w-1;		log_p = logcumback[last_p+1] - logcumback[i];	The background model, H_0, is a Markov model defined by the	order, n, and the conditional probabilities, a_cp, where		a_cp[s2i(wa)] = Pr(a | w).*//***************************************************************************/extern double log_cum_back(  char *seq,					/* the sequence */  double *a_cp,					/* the conditional probs */  int order,					/* the order of the model */  double *logcumback				/* the result */ ){  int i;  int len = strlen(seq);			/* length of sequence */  /* compute probability for each position in string */  for (i=0, logcumback[0]=0; i<len; i++) {	/* position of "a" */    int lp = MIN(i+1, order+1);			/* substring length */    char *wa;    char *wa_end = &seq[i+1];    char save;    double log_pwa;				/* log Pr(a | w) */#ifdef dumb    Substr(wa, seq, i-lp+1, lp);		/* get wa */#endif    wa = &seq[i-lp+1];				/* next wa */    save = *wa_end;				/* save character after wa */    *wa_end = '\0';				/* make wa a string */    log_pwa = log(a_cp[s2i(wa)]);		/* log Pr(a | w) */    logcumback[i+1] = logcumback[i] + log_pwa;    *wa_end = save;				/* restore seq *//*    myfree(wa);*/  }  return(logcumback[i]);			/* total cumulative prob. */} /* log_cum_back *//***************************************************************************//*	setup_index	Setup the index from alphabet to integer and back.	Upper and lower case index to same position; position indexes to	upper case only.	Returns length of alphabet.*//***************************************************************************/static int setup_index(  char *alpha					/* the alphabet */){  int i, a;  /* flag unused letters */  for (i=0; i<MAXASCII; i++) a2i(i) = -1;	/* illegal letters */  /* set up the hashing and unhashing indices */  for (i = 0; (a = alpha[i]); i++) {    a = islower(a) ? toupper(a) : a;		/* convert to uppercase */    a2i(a) = i;					/* position in alphabet */    i2a(i) = a;    if (isupper(a)) a2i(tolower(a)) = i;	/* set lowercase as well */  }  index_alen = i;				/* set global */  return(i);} /* setup_index *//***************************************************************************//*	s2i	Convert a string to an array index.	Returns the index or -1 if the string contains illegal characters.*//***************************************************************************/static int s2i(  char *string					/* the string to index */){  int i;  char a;  int index=0;					/* array index */  /* complete the computation */  while ((a = *(string++))) {    i = a2i(a);					/* first charcter */    if (i < 0) return(-1);    index = index_alen*index + (i+1);  }  return(index-1);} /* s2i *//*	To compile:	gcc background.c -o background -I INCLUDE -lm -DSO*/#ifdef SOmain(  int argc,  char **argv) {  int i;  char *pfile = argv[1];  double logcumback[1000];  char *string = "TTTTTTTTTTAAAAAAAAA";  int order;  int len = strlen(string);  double *a_cp = read_markov_model(pfile, "ACGT", TRUE, &order);  printf("order = %d\n", order);  log_cum_back(string, a_cp, order, logcumback);  for (i=0; i<len; i++) printf("%c %f %f\n", string[i], exp(logcumback[i]),     exp(Log_back(logcumback,i,6)) );   printf("All done\n");  return(0);} /* main */#endif /* SO */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?