init.c
来自「EM算法的改进」· C语言 代码 · 共 831 行 · 第 1/2 页
C
831 行
priors = create_priors(ptype, beta, dataset, plib_name); /* check that alphabet in mixture prior matches alphabet */ if (ptype == Dmix || ptype == Mega || ptype == MegaP) { if (strcmp(alphabet, priors->plib->alphabet)) { fprintf(stderr, "The alphabet in the mixture prior file (%s) doesn't match the sequence alphabet.\n", plib_name); fprintf(stderr, "prior alphabet : %s\n", priors->plib->alphabet); fprintf(stderr, "sequence alphabet: %s\n", alphabet); exit(1); } } /* set number of occurrences of sites */ if (nsites != 0) { if (mtype == Oops) { fprintf(stderr, "You may not specify -sites with -mod oops\n"); exit(1); } min_nsites = max_nsites = nsites; } /* set search range for nsites */ if (mtype == Oops) { min_nsites = max_nsites = dataset->n_samples; } else if (mtype == Zoops) { if (min_nsites > dataset->n_samples) { fprintf(stderr, "Minimum number of sites too large. Setting to 2.\n"); min_nsites = 2; } if (!min_nsites) min_nsites = 2; /* default */ if (max_nsites > dataset->n_samples) { fprintf(stderr, "Maximum number of sites exceeded. Setting to %d.\n", dataset->n_samples); max_nsites = dataset->n_samples; } if (!max_nsites) max_nsites = dataset->n_samples; /* default */ } else { /* TCM model */ if (min_nsites<2) min_nsites = 2; /* default */ if (!max_nsites) max_nsites = MIN(5*dataset->n_samples, 50); } /* check that there are enough possible sites */ if (min_nsites < 2) { fprintf(stderr, "You must specify a minimum of 2 or more sites.\n"); exit(1); } if (max_nsites < 2) { fprintf(stderr, "It must be possible for at least 2 sites to fit.\n"); exit(1); } /* check weight on prior on nsites */ if (wnsites >= 1 || wnsites < 0) { fprintf(stderr, "<wnsites> must be in range [0..1).\n"); exit(1); } /* check that no sequence too short */ if (dataset->min_slength < MIN_W) { fprintf(stderr, "All sequences must be at least %d characters long. Remove ", MIN_W); fprintf(stderr, "shorter sequences\nand rerun.\n"); exit(1); } /* set up globals */ if (w != 0) { /* w specified; set min_w and max_w */ max_w = min_w = w; } /* oops model: limit max_w to shortest seq */ if (mtype == Oops && max_w > dataset->min_slength) { max_w = dataset->min_slength; fprintf(stderr, "maxw > length of shortest sequence (%ld).", dataset->min_slength); fprintf(stderr, " Setting maxw to %d.\n", max_w); } /* all models: limit max_w to longest seq */ if (max_w > dataset->max_slength) { max_w = dataset->max_slength; fprintf(stderr, "maxw > length of longest sequence (%ld).", dataset->max_slength); fprintf(stderr, " Setting maxw to %d.\n", max_w); } if (max_w > MAXSITE) { fprintf(stderr, "maxw too large (> %d). Recompile with larger MAXSITE.\n", MAXSITE); exit(1); } if (max_w < 0) { /* use default */ max_w = MIN(MAXSITE, dataset->min_slength); /* maximum W0 */ } /* check that min_w <= max_w */ if (min_w > max_w) { fprintf(stderr, "minw > maxw. Setting minw to %d.\n", max_w); min_w = max_w; } /* check that min_w and max_w are at least ABS_MIN_W */ if (min_w < ABS_MIN_W) { fprintf(stderr, "Minimum width must be >= %d. Respecify larger -w or -minw.\n", ABS_MIN_W); exit(1); } else if (max_w < ABS_MIN_W) { fprintf(stderr, "Maximum width must be >= %d. Respecify larger -w or -maxw.\n", ABS_MIN_W); exit(1); } /* must use TCM if only one sequence */ if (mtype != Tcm && dataset->n_samples==1) { fprintf(stderr, "You must specify '-mod anr' since your dataset contains only one sequence.\n" ); fprintf(stderr,"Alternatively, you might wish to break your sequence into several sequences.\n" ); exit(1); } /* check that using right alphabet if model type requires it */ if (strcmp(alph, "DNA")) { if (invcomp) { fprintf(stderr, "You must use default DNA alphabet if using complementary strand!\n"); exit(1); } if (pal) { fprintf(stderr, "You must use default DNA alphabet if using -pal !\n"); exit(1); } } /* flag search for palindromes */ dataset->pal = pal; /* get the type of mapping between sequences and thetas */ if (!strcmp(mapname, "uni")) { map_type = Uni; if (map_scale == -1) map_scale = .5; /* default add .5 */ } else if (!strcmp(mapname, "pam")) { map_type = Pam; if (map_scale == -1) map_scale = 120; /* default PAM 120 */ } else { fprintf(stderr, "Unknown mapping type %s. \n", mapname); exit(1); } /* check that IUPAC alphabet if using PAM mapping */ if (mapname == "pam" && alphabet != PROTEIN0) { fprintf(stderr, "Setting sequence to theta mapping type to `Uni' since alphabet not IUPAC.\n"); mapname = "uni"; } /* set up the sequence to theta mapping matrix for starts */ dataset->map = init_map(map_type, map_scale, alength, dataset->back, FALSE); dataset->lomap = init_map(map_type, map_scale, alength, dataset->back, TRUE); /* set up p_point: previously learned components start points */ p_point = (P_POINT *) mymalloc(sizeof(P_POINT)); p_point->c = n_spcons; /* number of specified starts */ /* the default starting points */ for (i=0; i<MAXG; i++) { p_point->e_cons0[i] = NULL; p_point->w[i] = max_w; p_point->nsites[i] = nsites; } /* setup user-specified start points */ for (i=0; i<n_spcons; i++) { char *e_cons = (char *) mymalloc(MAXSITE); /* encoded version */ if (i >= MAXG) { fprintf(stderr, "Too many starting points. Increase MAXG and recompile."); exit(1); } p_point->e_cons0[i] = e_cons; /* convert the consensus sequence to upper case and encode as integer */ for (j=0; (cc = spcons[i][j]) != (int)NULL; j++) { char uc = (islower(cc) ? toupper(cc) : cc); e_cons[j] = (uc == 'X') ? alength : hash(uc); if (e_cons[j] > alength) { fprintf(stderr, "Illegal letter %c in consensus string!\n", cc); exit(1); } } /* set width to length of consensus unless -w specified */ if (w == 0) p_point->w[i] = j; /* pad out the consensus sequence with A's */ for ( ; j < MAXSITE; j++) e_cons[j] = 0; } /* make sure nmotifs is as large as the number of starting points */ if (nmotifs < n_spcons) { nmotifs = n_spcons; fprintf(stderr, "Setting nmotifs to %d\n", nmotifs); } /* create the model */ model = create_model(mtype, invcomp, max_w, alength); best_model = create_model(mtype, invcomp, max_w, alength); scratch_model = create_model(mtype, invcomp, max_w, alength); /* create the negative dataset and negative model if negative file given */ if (negfile) { neg_dataset = read_seq_file(negfile, dataset->alphabet, invcomp, seqfrac); if (!strcmp(ntype, "pair")) { neg_dataset->negtype = Pair; } else if (!strcmp(ntype, "blend")) { neg_dataset->negtype = Blend; } else { fprintf(stderr, "Unknown ntype %s. \n", ntype); exit(1); } neg_dataset->pal = dataset->pal; neg_dataset->back = dataset->back; neg_model = create_model(mtype, invcomp, max_w, alength); } /* initialize log and exp lookup tables */ init_log(); init_exp(); /* Initialize the probability tables for the objective function. */ /* Get for 2 and up because weighting may cause < min_nsites sites */ init_llr_pv_tables(2, max_nsites, alength, dataset->back, dataset->pal); /* set up scratch models */ model->min_w = min_w; model->max_w = max_w; model->min_nsites = min_nsites; model->max_nsites = max_nsites; copy_model(model, best_model, alength); copy_model(model, scratch_model, alength); /* put meme parameters in dataset */ dataset->priors = priors; dataset->p_point = p_point; dataset->wg = wg; dataset->ws = ws; dataset->endgaps = endgaps; dataset->wnsites = wnsites; dataset->ma_adj = ma_trim; dataset->distance = distance; dataset->prob = prob; dataset->nmotifs = nmotifs; dataset->maxiter = maxiter; dataset->evt = evt; dataset->mod = mod; dataset->mapname = mapname; dataset->map_scale = map_scale; dataset->priorname = prior; dataset->beta = beta; dataset->seed = seed; dataset->seqfrac = seqfrac; /* save name of prior library */ if (plib_name) { dataset->plib_name = plib_name + strlen(plib_name); while (*dataset->plib_name != '/') dataset->plib_name--; dataset->plib_name++; /* strip off directory */ } else { dataset->plib_name = NULL; } dataset->datafile = sf ? sf : datafile; /* name to print */ dataset->negfile = negfile; dataset->bfile = bfile; dataset->min_ic = min_ic; dataset->max_time = max_time; /* save command line */ dataset->command = NULL; argv[0] = "meme"; /*for (i=pos=len=0; i<argc-2; i++) {*/ /* don't save last arguments */ /* save all arguments; used to not save last 2; why??? */ for (i=pos=len=0; i<argc; i++) { len += strlen(argv[i])+1; /* +1 for space following */ Resize(dataset->command, len+2, char); /* +1 for null */ strcpy((dataset->command)+pos, argv[i]); dataset->command[len-1] = ' '; dataset->command[len] = '\0'; pos = len; } dataset->meme_directory = meme_directory; /* set up return values */ *model_p = model; *scratch_model_p = scratch_model; *best_model_p = best_model; *neg_model_p = neg_model; *dataset_p = dataset; *neg_dataset_p = neg_dataset; /* announce meme */ banner("MEME"); printf("\n\n");} /* init_meme *//***************************************************************************//* init_meme_background Read in the background Markov model or use the residue frequencies adjusted by add-one prior. Precalculate the log cumulative background probabilities. The log probability of any substring of any sequence will then be accessible via: Log_back(sample[i]->logcumback, j, w) where j is the start of the length-w substring of sequence i.*//***************************************************************************/static void init_meme_background ( char *bfile, /* background model file */ BOOLEAN rc, /* average reverse comps */ DATASET *dataset /* the dataset */){ int i, j; char *alphabet = dataset->alphabet; /* alphabet */ SAMPLE **samples = dataset->samples; /* the sequences */ int n_samples = dataset->n_samples; /* # of sequences in dataset */ double *back; /* freq. of tuples */ int order; /* order of background model */ BOOLEAN add_x = TRUE; /* add x-tuples to model */ /* read in background Markov model or use dataset frequencies (0-order) */ if (bfile) { /* read in probs; set X probs */ back = read_markov_model(bfile, NULL, alphabet, add_x, rc, &order); } else { /* use dataset frequencies */ int alength = dataset->alength; /* length of alphabet */ double *res_freq = dataset->res_freq; /* weighted residue freq */ double wtr = dataset->wgt_total_res; /* weighted residue count */ double *adj_res_freq = NULL; Resize(adj_res_freq, alength, double); /* adjust residue frequencies with add-one prior to avoid 0s */ for (i=0; i<alength; i++) adj_res_freq[i] = (res_freq[i]*wtr + 1)/(wtr+alength); back = read_markov_model(NULL, adj_res_freq, alphabet, add_x, rc, &order); order = 0; myfree(adj_res_freq); } /* precalculate the log cumulative background probabilities for each seq */ dataset->log_total_prob = 0; for (i=0; i<n_samples; i++) { /* sequence */ SAMPLE *s = samples[i]; /* sequence */ char *seq = NULL; /* ascii sequence */ double *lcb = s->logcumback; /* log cum. back. prob */ /* unhash the sequence to convert ambiguous characters to X */ Resize(seq, s->length+1, char); for (j=0; j<s->length; j++) seq[j] = unhash(s->res[j]); seq[s->length] = '\0'; /* compute probabilites */ dataset->log_total_prob += log_cum_back(seq, back, order, lcb); myfree(seq); } /* sequence */ /* return values */ dataset->back = back; dataset->back_order = order;} /* init_meme_background *//**********************************************************************//* create_priors*//**********************************************************************/static PRIORS *create_priors( PTYPE ptype, /* type of prior to use */ double beta, /* beta for dirichlet priors; < 0 only returns alphabet */ DATASET *dataset, /* the dataset */ char *plib_name /* name of prior library */){ int i; int alength = (dataset ? dataset->alength : 0); double *back = (dataset ? dataset->back : (double *)NULL); PRIORS *priors = (PRIORS *) mymalloc(sizeof(PRIORS)); priors->ptype = ptype; /* set up the prior counts */ switch (ptype) { case Addone: /* add one prior */ for (i=0; i<alength; i++) priors->prior_count[i] = 1.0; break; case Dirichlet: /* simple dirichlet prior */ for (i=0; i<alength; i++) priors->prior_count[i] = beta * back[i]; break; case Dmix: /* mixture of dirichlet's */ case Mega: /* megaprior heuristic */ case MegaP: /* mod. megaprior heuristic */ { priors->plib = read_PriorLib(plib_name, beta); /* get b=0 prior for modified mega prior heuristic */ if (ptype == MegaP || ptype == Mega) { /* used adj freq with Mega */ double b = 0; priors->plib0 = read_PriorLib(plib_name, b); } break; } } return priors;} /* create_priors */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?