init.c
来自「EM算法的改进」· C语言 代码 · 共 831 行 · 第 1/2 页
C
831 行
/* * $Id: init.c 1339 2006-09-21 19:46:28Z tbailey $ * * $Log$ * Revision 1.2 2005/10/25 19:06:39 nadya * rm old macro for Header, all info is taken care of by Id and Log. * * Revision 1.1.1.1 2005/07/29 23:34:54 nadya * Importing from meme-3.0.14, and adding configure/make * *//************************************************************************ ** MEME++ ** Copyright 1994, The Regents of the University of California ** Author: Timothy L. Bailey ** ************************************************************************//* init.c *//* Initialize meme.*/#define ABS_MIN_W 2 #include "meme.h"#include "general.h"#include "banner.h"#ifndef EXP#define EXP 0#else#define EXP 1#endif/* priors */#define PROTEIN_PLIB "prior30.plib"#define DNA_PLIB "dna.plib"/* User input parameters */static BOOLEAN check_syntax = FALSE; /* exit after checking syntax if true */static char *datafile = NULL; /* positive examples */static char *sf = NULL; /* name to print for datafile */static char *negfile = NULL; /* negative examples */static char *ntype = "pair"; /* output two matrices per motif if negatives */static char *obj = "ev"; /* objective function */static char *bfile = NULL; /* use default background Markov model file */static char *plib_name = NULL; /* use default library */static char *mod = "zoops"; /* model type input string; default ZOOPS */static char *alph = "PROTEIN"; /* default alphabet IUPAC protein 1-letter */static BOOLEAN invcomp = FALSE; /* don't use reverse complement strand of DNA */static int pal = 0; /* = 0, no palindromes = 1, force DNA palindromes, */static BOOLEAN ma_trim = TRUE; /* trim width using multiple alignment method */static double wg = 11; /* default gap weight */static double ws = 1; /* default space weight */static BOOLEAN endgaps = TRUE; /* count end gaps in multiple alignment */static double distance = 1e-5; /* squared euclidean distance for convergence */static char *prior = NULL; /* prior type input string */static double beta = -1; /* scale factor for prior; defaults differ */static double prob = 1.0; /* try enough subsequences so P=prob */static int nmotifs = 1; /* number of motifs to find */static char *mfile = NULL; /* name of known .motifs file*/static int maxiter = 50; /* max number iterations of EM on best start */static double nsites = 0; /* try one value of nsites0 only if > 0 */static int min_nsites = 0; /* minimum nsites0 to try */static int max_nsites = 0; /* maximum nsites0 to try */static double wnsites = 0.8; /* weight on prior on nsites */static int w = 0; /* width of motifs */static int min_w = MIN_W; /* minimum W0 to try */static int max_w = MAX_W; /* maximum W0 to try */static int min_ic = 0; /* minimum per column IC */static MAP_TYPE map_type; /* type of sequence to theta mapping */static char *mapname = NULL; /* map type input string */static double map_scale=-1; /* scale of sequence to theta mapping: Uni - size of add-n prior (n) Pam - PAM distance (120) Default set in init_em. */static int n_spcons = 0; /* number of specified start points */static char *spcons[MAXG]; /* starting point consensus strings */static int maxsize= 100000; /* dataset size limit */static int seed = 0; /* random number seed */static double seqfrac = 1; /* fraction of input dataset sequences to use */static char *meme_directory = NULL; /* meme source directory */ static double max_time = 0; /* maximum allowed CPU time; ignore if 0 *//* subroutines*/static void init_meme_background ( char *bfile, /* background model file */ BOOLEAN rc, /* average reverse comps */ DATASET *dataset /* the dataset */);static PRIORS *create_priors( PTYPE ptype, /* type of prior to use */ double beta, /* beta for dirichlet priors; < 0 only returns alphabet */ DATASET *dataset, /* the dataset */ char *plib_name /* name of prior library */);/**********************************************************************//* init_meme Set up all the stuff for meme.*//**********************************************************************/extern void init_meme( int argc, /* number of input arguments */ char **argv, /* input arguments */ MODEL **model_p, /* the model */ MODEL **best_model_p, /* the best model */ MODEL **scratch_model_p, /* the best model */ MODEL **neg_model_p, /* model of negative examples */ DATASET **dataset_p, /* the dataset */ DATASET **neg_dataset_p /* dataset of negative examples */){ int i, j, cc=0, len, pos; OBJTYPE objfun = Ev; /* type of objective function */ MTYPE mtype; /* type of model */ PTYPE ptype; /* type of prior */ PRIORS *priors; /* the prior probabilities model */ P_POINT *p_point; /* previously learned starting points */ char *alphabet; /* alphabet for dataset */ int alength; /* length of alphabet */ MODEL *model=NULL, *best_model=NULL, *scratch_model=NULL, *neg_model=NULL; DATASET *dataset=NULL, *neg_dataset=NULL; double evt = BIG; /* no E-value cutoff */ /* get the command line arguments */ i = 1;#ifndef lint /* print the command line */ argv[0] = ""; DO_STANDARD_COMMAND_LINE(1, USAGE(USAGE:\n\tmeme\t<dataset> [optional arguments]\n); NON_SWITCH(1, <dataset> \t\tfile containing sequences in FASTA format\n, switch (i++) { case 1: datafile = _OPTION_; break; default: COMMAND_LINE_ERROR; }); FLAG_OPTN(1, h, \t\t\tprint this message, USAGE_MESSAGE); DATA_OPTN(0, neg, <negdataset>, \tfile containing negative example sequences, negfile = _OPTION_); DATA_OPTN(0, ntype, pair|blend, \thow to use negative examples, ntype = _OPTION_); DATA_OPTN(EXP, objfun, ev|pv, \tobjective function (default: ev), obj = _OPTION_); FLAG_OPTN(1, dna, \t\t\tsequences use DNA alphabet, alph = "DNA"); FLAG_OPTN(1, protein, \t\tsequences use protein alphabet, alph = "PROTEIN"); DATA_OPTN(1, mod, oops|zoops|anr, \tdistribution of motifs,mod = _OPTION_); DATA_OPTN(1, nmotifs, <nmotifs>, \tmaximum number of motifs to find, nmotifs = atoi(_OPTION_)); DATA_OPTN(1, evt, <ev>, \t\tstop if motif E-value greater than <evt>, evt = atof(_OPTION_)); DATA_OPTN(1, nsites, <sites>, \tnumber of sites for each motif, nsites=atof(_OPTION_)); DATA_OPTN(1, minsites, <minsites>, \tminimum number of sites for each motif, min_nsites=atoi(_OPTION_)); DATA_OPTN(1, maxsites, <maxsites>, \tmaximum number of sites for each motif, max_nsites=atoi(_OPTION_)); DATA_OPTN(1, wnsites, <wnsites>, \tweight on expected number of sites, wnsites=atof(_OPTION_)); DATA_OPTN(1, w, <w>, \t\tmotif width, w = atoi(_OPTION_)); DATA_OPTN(1, minw, <minw>, \t\tminumum motif width, min_w = atoi(_OPTION_)); DATA_OPTN(1, maxw, <maxw>, \t\tmaximum motif width, max_w = atoi(_OPTION_)); DATA_OPTN(EXP, minic, <minic>, \tminumum column information content (default: 0), min_ic = atof(_OPTION_)); FLAG_OPTN(1, nomatrim, \t\tdo not adjust motif width using multiple\n\t\t\t\talignment, ma_trim = FALSE); DATA_OPTN(1, wg, <wg>, \t\tgap opening cost for multiple alignments, wg=atof(_OPTION_)); DATA_OPTN(1, ws, <ws>, \t\tgap extension cost for multiple alignments, ws=atof(_OPTION_)); FLAG_OPTN(1, noendgaps, \t\tdo not count end gaps in multiple alignments, endgaps = FALSE); DATA_OPTN(1, bfile, <bfile>, \tname of background Markov model file, bfile = _OPTION_); FLAG_OPTN(1, revcomp, \t\tallow sites on + or - DNA strands, invcomp = TRUE); FLAG_OPTN(1, pal, \t\t\tforce palindromes (requires -dna), pal = 1); DATA_OPTN(1, maxiter, <maxiter>, \tmaximum EM iterations to run, maxiter = atoi(_OPTION_)); DATA_OPTN(1, distance, <distance>, \tEM convergence criterion, distance = atof(_OPTION_)); DATA_OPTN(1, prior, dirichlet|dmix|mega|megap|addone, \n\t\t\t\ttype of prior to use, prior = _OPTION_); DATA_OPTN(1, b, <b>, \t\tstrength of the prior, beta = atof(_OPTION_)); DATA_OPTN(1, plib, <plib>, \t\tname of Dirichlet prior file, plib_name = _OPTION_); DATA_OPTN(1, spfuzz, <spfuzz>, \tfuzziness of sequence to theta mapping, map_scale = atof(_OPTION_)); DATA_OPTN(1, spmap, uni|pam, \tstarting point seq to theta mapping type, mapname = _OPTION_); DATA_OPTN(1, cons, <cons>, \t\tconsensus sequence to start EM from, spcons[n_spcons++] = _OPTION_); FLAG_OPTN(1, text, \t\t\toutput in text format (default is HTML), i=i); DATA_OPTN(1, maxsize, <maxsize>, \tmaximum dataset size in characters, maxsize = atoi(_OPTION_)); FLAG_OPTN(1, nostatus, \t\tdo not print progress reports to terminal, NO_STATUS = TRUE); DATA_OPTN(1, p, <np>, \t\tuse parallel version with <np> processors, i=i); DATA_OPTN(1, time, <t>, \t\tquit before <t> CPU seconds consumed, max_time = atof(_OPTION_)); DATA_OPTN(1, sf, <sf>, \t\tprint <sf> as name of sequence file, sf = _OPTION_); DATA_OPTN(2, dir, <dir>, \t\tmeme source directory, meme_directory = _OPTION_); FLAG_OPTN(2, check_syntax, \t\tcheck input syntax and exit, check_syntax = TRUE); DATA_OPTN(EXP, mfile, <mfile>, \tfile of known motifs, mfile = _OPTION_); FLAG_OPTN(EXP, V, \t\t\tverbose mode, VERBOSE = TRUE); DATA_OPTN(EXP, seed, <seed>, \t\tseed for random numbers in sampling, seed = atoi(_OPTION_)); DATA_OPTN(EXP, seqfrac, <seqfrac>, \tfraction of sequences to use, seqfrac= atof(_OPTION_)); FLAG_OPTN(EXP, trace, \t\ttrace starting points, TRACE = TRUE); FLAG_OPTN(EXP, print_all, \t\tprint all debug information, PRINTALL = TRUE); FLAG_OPTN(EXP, print_w, \t\tprint erasure matrix, PRINT_W = TRUE); FLAG_OPTN(EXP, print_z, \t\tprint missing information matrix, PRINT_Z = TRUE); FLAG_OPTN(EXP, print_ll, \t\tprint log-likelihood during EM, PRINT_LL = TRUE); FLAG_OPTN(EXP, print_starts, \t\tprint starting points, PRINT_STARTS = TRUE); )#endif /* exit if check_syntax is on */ if (check_syntax) exit(0); /* set random number generator */ srand48(seed); /* set all the print flags appropriately */ if (PRINTALL) { PRINT_W = TRUE; PRINT_Z = TRUE; PRINT_LL = TRUE; PRINT_STARTS = TRUE; } /* check input arguments */ if (prob < 0 || prob > 1) { fprintf(stderr, "-prob <p>, <p> must be between 0 and 1\n"); exit(1); } if (nmotifs >= MAXG) { fprintf(stderr, "-nmotifs larger than MAXG-1. Use smaller -nmotifs or recompile with larger MAXG.\n"); exit(1); } /* get the name of the MEME directory */ if (!meme_directory) { /* not given on command line */ meme_directory = getenv("MEME_DIRECTORY"); if (meme_directory == NULL) { fprintf(stderr, "You must define environment variable MEME_DIRECTORY\n"); exit(1); } } /* get the name of the sequence file */ if (!sf) sf = getenv("MEME_SEQUENCE_FILE"); /* get the objective function type */ if (!strcmp(obj, "pv")) { objfun = Pv; } else if (!strcmp(obj, "ev")) { objfun = Ev; } else { printf("Unknown objective function type %s. \n", obj); exit(1); } /* get the model type */ if (!strcmp(mod, "anr") || !strcmp(mod, "tcm")) { mtype = Tcm; } else if (!strcmp(mod, "oops")) { mtype = Oops; } else if (!strcmp(mod, "zoops")) { mtype = Zoops; } else { mtype = Zoops; /* prevent warning */ fprintf(stderr, "Unknown model type %s. \n", mod); exit(1); } /* check seqfrac */ if (seqfrac > 1 || seqfrac <=0) { fprintf(stderr, "seqfrac must be in (0, 1]\n"); exit(1); } /* check the alphabet and set up default mappings and priors */ if (!strcmp(alph, "DNA")) { /* DNA */ alphabet = DNA0; if (!mapname) mapname = "uni"; /* uniform prior mapping */ if (!prior) prior = "dirichlet"; /* simple dirichlet prior */ } else { /* PROTEIN */ alphabet = PROTEIN0; if (!mapname) mapname = "pam"; /* PAM mapping */ if (!prior) { switch (mtype) { case Oops: prior = "dmix"; break; case Zoops: case Tcm: prior = "megap"; break; default: prior = "dirichlet"; break; } } if (invcomp) { fprintf(stderr, "You cannot use -revcomp with -protein.\n"); exit(1); } } /* set alphabet and priors defaults */ /* find out type of prior */ if (!strcmp(prior, "dirichlet")) { ptype = Dirichlet; if (beta < 0) beta = 0.01; /* default b = 0.01 */ } else if (!strcmp(prior, "dmix")) { ptype = Dmix; if (beta < 0) beta = 0; /* default b = 0 for dmix */ } else if (!strcmp(prior, "megadmix") || !strcmp(prior, "mega")) { ptype = Mega; /* use mega prior heuristic */ } else if (!strcmp(prior, "megap")) { ptype = MegaP; /* discretization uses b=0 */ } else if (!strcmp(prior, "addone")) { ptype = Addone; } else { ptype = Dirichlet; /* prevent warning */ fprintf(stderr, "Unknown type of prior: %s!\n", prior); exit(1); } /* convert the alphabet to upper case */ for (i=0; alphabet && (cc = alphabet[i]); i++) if (islower(cc)) alphabet[i] = toupper(cc); /* setup hashing function for encoding strings as integers */ alength = setup_hash_alph(alphabet); /* read the samples and set up globals */ dataset = read_seq_file(datafile, alphabet, invcomp, seqfrac); if (!dataset) exit(1); /* set dataset alphabet flag */ dataset->dna = !strcmp(alph, "DNA"); /* set the dataset objfun */ dataset->objfun = objfun; /* initialize the background model */ init_meme_background(bfile, invcomp, dataset); /* reset the current alphabet since it may be clobbered by the above */ (void) setup_hash_alph(alphabet); /* prevent too long jobs */ if (dataset->total_res > maxsize) { fprintf(stderr, "Dataset too large (> %d). Rerun with larger -maxsize.\n", maxsize); exit(1); } /* Calculate optimal split points for this dataset; no-op if non-parallel */ balance_loop(dataset->samples, dataset->n_samples); /* read in known motifs file */ if (mfile) { FILE *fdata = fopen(datafile, "r"); dataset->nkmotifs = read_motifs(fdata, mfile, dataset->motifs, FALSE, NULL); nmotifs = dataset->nkmotifs; prob = 0; /* don't sample starts */ dataset->pal = 0; /* no palindrome testing */ } else { dataset->nkmotifs = 0; } /* create the priors */ if (ptype == Dmix || ptype == Mega || ptype == MegaP) { /* make the name of the prior library */ if (!plib_name) { char *tmp1, *tmp2; if (!strcmp(alphabet, PROTEIN0)) { plib_name = PROTEIN_PLIB; /* default mixture prior for proteins */ } else { plib_name = DNA_PLIB; /* default mixture prior for DNA */ } /* prepend meme_directory to file name */ Strcat(tmp1, meme_directory, "/etc/"); Strcat(tmp2, tmp1, plib_name); plib_name = tmp2; } } if ((ptype == Mega || ptype == MegaP) && beta == -1) { /* tlb 5-9-97; wgt_total_res */ /*beta = 10.0 * dataset->wgt_total_res;*/ /* size of mega prior */ beta = 5.0 * dataset->wgt_total_res; /* size of mega prior */ }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?