background.c
来自「EM算法的改进」· C语言 代码 · 共 569 行 · 第 1/2 页
C
569 行
int n, /* widest tuple */ char *tuple, /* call with "" */ int tuplew, /* tuple width; call with 0 */ char *alpha /* alphabet */){ int i, j; char *t = NULL, *rc = NULL; /* tuple and rc-tuple */ int ti, rci; /* index of tuple */ if (n==0) return; /* everything is OK */ Resize(t, tuplew+2, char); Resize(rc, tuplew+2, char); for(i=0; alpha[i+add_x]; i++) { /* ignore last letter (X) */ /* append letter to tuple */ strcpy(t, tuple); t[tuplew] = alpha[i]; t[tuplew+1] = '\0'; /* make reverse complement of tuple */ for (j=0; j<=tuplew; j++) rc[j] = comp_dna(t[tuplew-j]); rc[tuplew+1] = '\0'; /* get tuple and rc indices */ ti = s2i(t); /* index of tuple */ rci = s2i(rc); /* index of reverse complement*/ /* average their probabilites */ a_p[ti] = a_p[rci] = (a_p[ti] + a_p[rci]) / 2.0; /* recur */ average_rc(add_x, a_p, n-1, t, tuplew+1, alpha); } /* letter */ myfree(t); myfree(rc); return;} /* average_rc *//***************************************************************************//* print_prob Recursively print the probabilities for all tuples up to width n. *//***************************************************************************/static void print_prob( double *a_p, /* tuple->prob assoc. array */ int n, /* widest tuple */ char *tuple, /* call with "" */ int tuplew, /* tuple width; call with 0 */ char *alpha /* alphabet */) { int i; char *t = NULL; /* tuple */ if (n==0) return; /* everything is OK */ Resize(t, tuplew+2, char); for(i=0; alpha[i]; i++) { /* include last letter (X) */ /* append letter to tuple */ strcpy(t, tuple); t[tuplew] = alpha[i]; t[tuplew+1] = '\0'; /* print tuple and probability */ printf("%10s %8.4f\n", t, a_p[s2i(t)]); /* recur */ print_prob(a_p, n-1, t, tuplew+1, alpha); } /* letter */ myfree(t);} /* print_prob *//***************************************************************************//* add_x_tuples Compute the probability of all X-containing tuples by adding the probability of the current non-X-containing tuple to each X-containing tuple it matches. When this has been done for all non-X-containing tuples, the X-containing tuple probabilities will be correct.*//***************************************************************************/static void add_x_tuples( double *a_p, /* tuple->prob assoc. array */ char *ntuple, /* non-X-containing tuple */ int ntuplew, /* ntuple width */ int ntuplei, /* ntuple index */ char *xtuple, /* X-containing tuple */ int xtuplew, /* xtuple width */ int xcnt /* count of X's in xtuple */){ if (xtuplew == ntuplew) { /* add prob. of non-X-tuple */ int xtuplei; xtuple[xtuplew] = '\0'; /* end tuple */ xtuplei = s2i(xtuple); if (a_p[xtuplei] == -1) a_p[xtuplei] = 0; /* first time for x-tuple */ a_p[xtuplei] += a_p[ntuplei]; } else { /* continue building x-tuple */ xtuple[xtuplew] = 'X'; /* add X to xtuple */ add_x_tuples(a_p, ntuple, ntuplew, ntuplei, xtuple, xtuplew+1, xcnt+1); if (xcnt || xtuplew < ntuplew-1) { /* OK to add non-X to xtuple */ xtuple[xtuplew] = ntuple[xtuplew]; /* add X to xtuple */ add_x_tuples(a_p, ntuple, ntuplew, ntuplei, xtuple, xtuplew+1, xcnt); } }} /* add_x_tuples *//***************************************************************************//* get_cond_prob Compute Pr(a | w) where "w" is a word and "a" is a character and store in an array with index "s2i(wa)".*//***************************************************************************/static double *get_cond_prob ( double *a_p, /* tuple->prob array */ int n /* length of array */){ int wa, w; double *a_cp=NULL; Resize(a_cp, n, double); /* create array */ for (wa=0; wa<n; wa++) { /* tuple "wa" */ if (wa < index_alen) { /* w = "" */ a_cp[wa] = a_p[wa]; /* Pr(a) */ } else { w = (wa/index_alen)-1; /* get prefix w */ a_cp[wa] = MIN(a_p[wa]/a_p[w], 1.0); /* Pr(a | w) = Pr(wa)/Pr(w) */ } } return(a_cp);} /* get_cond_prob *//***************************************************************************//* log_cum_back Get log of the probability of each position in a string given a Markov background model. Returns the cumulative background as an array: logcumback_i = 0, i=0 = log Pr(s_{0,i-1} | H_0), otherwise. and the total (log) cumulative background probability. The probability of any length-w substring starting at position i in the string (in the context of the string) can then be computed as: last_p = i+w-1; log_p = logcumback[last_p+1] - logcumback[i]; The background model, H_0, is a Markov model defined by the order, n, and the conditional probabilities, a_cp, where a_cp[s2i(wa)] = Pr(a | w).*//***************************************************************************/extern double log_cum_back( char *seq, /* the sequence */ double *a_cp, /* the conditional probs */ int order, /* the order of the model */ double *logcumback /* the result */ ){ int i; int len = strlen(seq); /* length of sequence */ /* compute probability for each position in string */ for (i=0, logcumback[0]=0; i<len; i++) { /* position of "a" */ int lp = MIN(i+1, order+1); /* substring length */ char *wa; char *wa_end = &seq[i+1]; char save; double log_pwa; /* log Pr(a | w) */#ifdef dumb Substr(wa, seq, i-lp+1, lp); /* get wa */#endif wa = &seq[i-lp+1]; /* next wa */ save = *wa_end; /* save character after wa */ *wa_end = '\0'; /* make wa a string */ log_pwa = log(a_cp[s2i(wa)]); /* log Pr(a | w) */ logcumback[i+1] = logcumback[i] + log_pwa; *wa_end = save; /* restore seq *//* myfree(wa);*/ } return(logcumback[i]); /* total cumulative prob. */} /* log_cum_back *//***************************************************************************//* setup_index Setup the index from alphabet to integer and back. Upper and lower case index to same position; position indexes to upper case only. Returns length of alphabet.*//***************************************************************************/static int setup_index( char *alpha /* the alphabet */){ int i, a; /* flag unused letters */ for (i=0; i<MAXASCII; i++) a2i(i) = -1; /* illegal letters */ /* set up the hashing and unhashing indices */ for (i = 0; (a = alpha[i]); i++) { a = islower(a) ? toupper(a) : a; /* convert to uppercase */ a2i(a) = i; /* position in alphabet */ i2a(i) = a; if (isupper(a)) a2i(tolower(a)) = i; /* set lowercase as well */ } index_alen = i; /* set global */ return(i);} /* setup_index *//***************************************************************************//* s2i Convert a string to an array index. Returns the index or -1 if the string contains illegal characters.*//***************************************************************************/static int s2i( char *string /* the string to index */){ int i; char a; int index=0; /* array index */ /* complete the computation */ while ((a = *(string++))) { i = a2i(a); /* first charcter */ if (i < 0) return(-1); index = index_alen*index + (i+1); } return(index-1);} /* s2i *//* To compile: gcc background.c -o background -I INCLUDE -lm -DSO*/#ifdef SOmain( int argc, char **argv) { int i; char *pfile = argv[1]; double logcumback[1000]; char *string = "TTTTTTTTTTAAAAAAAAA"; int order; int len = strlen(string); double *a_cp = read_markov_model(pfile, "ACGT", TRUE, &order); printf("order = %d\n", order); log_cum_back(string, a_cp, order, logcumback); for (i=0; i<len; i++) printf("%c %f %f\n", string[i], exp(logcumback[i]), exp(Log_back(logcumback,i,6)) ); printf("All done\n"); return(0);} /* main */#endif /* SO */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?