init.c

来自「EM算法的改进」· C语言 代码 · 共 831 行 · 第 1/2 页

C
831
字号
/* * $Id: init.c 1339 2006-09-21 19:46:28Z tbailey $ *  * $Log$ * Revision 1.2  2005/10/25 19:06:39  nadya * rm old macro for Header, all info is taken care of by Id and Log. * * Revision 1.1.1.1  2005/07/29 23:34:54  nadya * Importing from meme-3.0.14, and adding configure/make * *//************************************************************************								       **	MEME++							       **	Copyright 1994, The Regents of the University of California    **	Author: Timothy L. Bailey				       **								       ************************************************************************//* init.c *//*		Initialize meme.*/#define ABS_MIN_W 2	#include "meme.h"#include "general.h"#include "banner.h"#ifndef EXP#define EXP 0#else#define EXP 1#endif/* priors */#define PROTEIN_PLIB "prior30.plib"#define DNA_PLIB "dna.plib"/* User input parameters */static BOOLEAN check_syntax = FALSE;	/* exit after checking syntax if true */static char *datafile = NULL;	/* positive examples */static char *sf = NULL;		/* name to print for datafile */static char *negfile = NULL;	/* negative examples */static char *ntype = "pair";	/* output two matrices per motif if negatives */static char *obj = "ev";	/* objective function */static char *bfile = NULL;	/* use default background Markov model file */static char *plib_name = NULL;	/* use default library */static char *mod = "zoops";	/* model type input string; default ZOOPS */static char *alph = "PROTEIN";	/* default alphabet IUPAC protein 1-letter */static BOOLEAN invcomp = FALSE;	/* don't use reverse complement strand of DNA */static int pal = 0;		/* = 0, no palindromes				   = 1, force DNA palindromes,				*/static BOOLEAN ma_trim = TRUE;	/* trim width using multiple alignment method */static double wg = 11;		/* default gap weight */static double ws = 1;		/* default space weight */static BOOLEAN endgaps = TRUE;	/* count end gaps in multiple alignment */static double distance = 1e-5;	/* squared euclidean distance for convergence */static char *prior = NULL;	/* prior type input string */static double beta = -1;	/* scale factor for prior; defaults differ */static double prob = 1.0;	/* try enough subsequences so P=prob */static int nmotifs = 1;		/* number of motifs to find */static char *mfile = NULL;	/* name of known .motifs file*/static int maxiter = 50;        /* max number iterations of EM on best start */static double nsites = 0;	/* try one value of nsites0 only if > 0 */static int min_nsites = 0;	/* minimum nsites0 to try */static int max_nsites = 0;	/* maximum nsites0 to try */static double wnsites = 0.8;	/* weight on prior on nsites */static int w = 0;		/* width of motifs */static int min_w = MIN_W;	/* minimum W0 to try */static int max_w = MAX_W;	/* maximum W0 to try */static int min_ic = 0;		/* minimum per column IC */static MAP_TYPE map_type; 	/* type of sequence to theta mapping */static char *mapname = NULL;	/* map type input string */static double map_scale=-1; 	/* scale of sequence to theta mapping:					Uni - size of add-n prior (n)					Pam - PAM distance (120) 				   Default set in init_em.				*/static int n_spcons = 0;	/* number of specified start points */static char *spcons[MAXG];	/* starting point consensus strings */static int maxsize= 100000; 	/* dataset size limit */static int seed = 0;		/* random number seed */static double seqfrac = 1;	/* fraction of input dataset sequences to use */static char *meme_directory = NULL;	/* meme source directory */ static double max_time = 0;	/* maximum allowed CPU time; ignore if 0 *//*  subroutines*/static void init_meme_background (  char *bfile, 					/* background model file */  BOOLEAN rc,					/* average reverse comps */  DATASET *dataset 				/* the dataset */);static PRIORS *create_priors(  PTYPE ptype,				/* type of prior to use */  double beta, 				/* beta for dirichlet priors;					   < 0 only returns alphabet */  DATASET *dataset,			/* the dataset */  char *plib_name			/* name of prior library */);/**********************************************************************//*        init_meme        Set up all the stuff for meme.*//**********************************************************************/extern void init_meme(  int argc,                     /* number of input arguments */  char **argv,                  /* input arguments */  MODEL **model_p,              /* the model */  MODEL **best_model_p,         /* the best model */  MODEL **scratch_model_p,      /* the best model */  MODEL **neg_model_p,          /* model of negative examples */  DATASET **dataset_p,          /* the dataset */  DATASET **neg_dataset_p       /* dataset of negative examples */){  int i, j, cc=0, len, pos;  OBJTYPE objfun = Ev;		/* type of objective function */  MTYPE mtype;			/* type of model */  PTYPE ptype;			/* type of prior */  PRIORS *priors;	    	/* the prior probabilities model */  P_POINT *p_point;		/* previously learned starting points */  char *alphabet;		/* alphabet for dataset */  int alength;			/* length of alphabet */  MODEL *model=NULL, *best_model=NULL, *scratch_model=NULL, *neg_model=NULL;  DATASET *dataset=NULL, *neg_dataset=NULL;  double evt = BIG;		/* no E-value cutoff */  /* get the command line arguments */  i = 1;#ifndef lint  /* print the command line */  argv[0] = "";  DO_STANDARD_COMMAND_LINE(1,    USAGE(USAGE:\n\tmeme\t<dataset> [optional arguments]\n);    NON_SWITCH(1, <dataset> \t\tfile containing sequences in FASTA format\n,      switch (i++) {        case 1: datafile = _OPTION_; break;        default: COMMAND_LINE_ERROR;      });     FLAG_OPTN(1, h, \t\t\tprint this message, USAGE_MESSAGE);     DATA_OPTN(0, neg, <negdataset>,        \tfile containing negative example sequences, negfile = _OPTION_);     DATA_OPTN(0, ntype, pair|blend, \thow to use negative examples,        ntype = _OPTION_);     DATA_OPTN(EXP, objfun, ev|pv, \tobjective function (default: ev),        obj = _OPTION_);     FLAG_OPTN(1, dna, \t\t\tsequences use DNA alphabet, alph = "DNA");     FLAG_OPTN(1, protein, \t\tsequences use protein alphabet,        alph = "PROTEIN");     DATA_OPTN(1, mod, oops|zoops|anr, \tdistribution of motifs,mod = _OPTION_);     DATA_OPTN(1, nmotifs, <nmotifs>, \tmaximum number of motifs to find,        nmotifs = atoi(_OPTION_));     DATA_OPTN(1, evt, <ev>, \t\tstop if motif E-value greater than <evt>,        evt = atof(_OPTION_));     DATA_OPTN(1, nsites, <sites>, \tnumber of sites for each motif,        nsites=atof(_OPTION_));     DATA_OPTN(1, minsites, <minsites>,        \tminimum number of sites for each motif, min_nsites=atoi(_OPTION_));     DATA_OPTN(1,        maxsites, <maxsites>, \tmaximum number of sites for each motif,       max_nsites=atoi(_OPTION_));     DATA_OPTN(1, wnsites, <wnsites>, \tweight on expected number of sites,       wnsites=atof(_OPTION_));     DATA_OPTN(1, w, <w>, \t\tmotif width, w = atoi(_OPTION_));     DATA_OPTN(1, minw, <minw>, \t\tminumum motif width,       min_w = atoi(_OPTION_));     DATA_OPTN(1, maxw, <maxw>, \t\tmaximum motif width,       max_w = atoi(_OPTION_));     DATA_OPTN(EXP, minic, <minic>, \tminumum column information content       (default: 0), min_ic = atof(_OPTION_));     FLAG_OPTN(1, nomatrim,        \t\tdo not adjust motif width using multiple\n\t\t\t\talignment,        ma_trim = FALSE);     DATA_OPTN(1, wg, <wg>, \t\tgap opening cost for multiple alignments,        wg=atof(_OPTION_));     DATA_OPTN(1, ws, <ws>, \t\tgap extension cost for multiple alignments,        ws=atof(_OPTION_));     FLAG_OPTN(1, noendgaps, \t\tdo not count end gaps in multiple alignments,       endgaps = FALSE);     DATA_OPTN(1, bfile, <bfile>, \tname of background Markov model file,       bfile = _OPTION_);     FLAG_OPTN(1, revcomp, \t\tallow sites on + or - DNA strands,       invcomp = TRUE);     FLAG_OPTN(1, pal, \t\t\tforce palindromes (requires -dna), pal = 1);     DATA_OPTN(1, maxiter, <maxiter>, \tmaximum EM iterations to run, 	maxiter = atoi(_OPTION_));     DATA_OPTN(1, distance, <distance>, \tEM convergence criterion,        distance = atof(_OPTION_));     DATA_OPTN(1,        prior, dirichlet|dmix|mega|megap|addone, \n\t\t\t\ttype of prior to use,        prior = _OPTION_);     DATA_OPTN(1, b, <b>, \t\tstrength of the prior, beta = atof(_OPTION_));     DATA_OPTN(1, plib, <plib>, \t\tname of Dirichlet prior file,       plib_name = _OPTION_);     DATA_OPTN(1, spfuzz, <spfuzz>, \tfuzziness of sequence to theta mapping, 	map_scale = atof(_OPTION_));     DATA_OPTN(1, spmap, uni|pam, \tstarting point seq to theta mapping type,       mapname = _OPTION_);     DATA_OPTN(1, cons, <cons>, \t\tconsensus sequence to start EM from,        spcons[n_spcons++] = _OPTION_);     FLAG_OPTN(1, text, \t\t\toutput in text format (default is HTML), i=i);     DATA_OPTN(1, maxsize, <maxsize>, \tmaximum dataset size in characters,       maxsize = atoi(_OPTION_));     FLAG_OPTN(1, nostatus, \t\tdo not print progress reports to terminal,        NO_STATUS = TRUE);     DATA_OPTN(1, p, <np>, \t\tuse parallel version with <np> processors, i=i);     DATA_OPTN(1, time, <t>, \t\tquit before <t> CPU seconds consumed,        max_time = atof(_OPTION_));     DATA_OPTN(1, sf, <sf>, \t\tprint <sf> as name of sequence file, sf = _OPTION_);     DATA_OPTN(2, dir, <dir>, \t\tmeme source directory, meme_directory = _OPTION_);     FLAG_OPTN(2, check_syntax, \t\tcheck input syntax and exit,        check_syntax = TRUE);     DATA_OPTN(EXP, mfile, <mfile>, \tfile of known motifs, mfile = _OPTION_);     FLAG_OPTN(EXP, V, \t\t\tverbose mode, VERBOSE = TRUE);     DATA_OPTN(EXP, seed, <seed>, \t\tseed for random numbers in sampling,        seed = atoi(_OPTION_));     DATA_OPTN(EXP, seqfrac, <seqfrac>, \tfraction of sequences to use,	seqfrac= atof(_OPTION_));     FLAG_OPTN(EXP, trace, \t\ttrace starting points, TRACE = TRUE);     FLAG_OPTN(EXP, print_all, \t\tprint all debug information,        PRINTALL = TRUE);     FLAG_OPTN(EXP, print_w, \t\tprint erasure matrix, PRINT_W = TRUE);     FLAG_OPTN(EXP, print_z, \t\tprint missing information matrix,        PRINT_Z = TRUE);     FLAG_OPTN(EXP, print_ll, \t\tprint log-likelihood during EM,        PRINT_LL = TRUE);     FLAG_OPTN(EXP, print_starts, \t\tprint starting points,        PRINT_STARTS = TRUE);  )#endif  /* exit if check_syntax is on */  if (check_syntax) exit(0);  /* set random number generator */  srand48(seed);  /* set all the print flags appropriately */  if (PRINTALL) {    PRINT_W = TRUE;    PRINT_Z = TRUE;    PRINT_LL = TRUE;    PRINT_STARTS = TRUE;  }  /* check input arguments */  if (prob < 0 || prob > 1) {    fprintf(stderr, "-prob <p>, <p> must be between 0 and 1\n");    exit(1);  }  if (nmotifs >= MAXG) {    fprintf(stderr, "-nmotifs larger than MAXG-1.  Use smaller -nmotifs or recompile with larger MAXG.\n");   exit(1);  }  /* get the name of the MEME directory */  if (!meme_directory) {			/* not given on command line */    meme_directory = getenv("MEME_DIRECTORY");    if (meme_directory == NULL) {      fprintf(stderr, "You must define environment variable MEME_DIRECTORY\n");      exit(1);    }  }  /* get the name of the sequence file */  if (!sf) sf = getenv("MEME_SEQUENCE_FILE");   /* get the objective function type */  if (!strcmp(obj, "pv")) {    objfun = Pv;  } else if (!strcmp(obj, "ev")) {    objfun = Ev;  } else {    printf("Unknown objective function type %s. \n", obj);    exit(1);  }  /* get the model type */  if (!strcmp(mod, "anr") || !strcmp(mod, "tcm")) {    mtype = Tcm;  } else if (!strcmp(mod, "oops")) {    mtype = Oops;  } else if (!strcmp(mod, "zoops")) {    mtype = Zoops;  } else {    mtype = Zoops;				/* prevent warning */    fprintf(stderr, "Unknown model type %s. \n", mod);    exit(1);  }  /* check seqfrac */  if (seqfrac > 1 || seqfrac <=0) {    fprintf(stderr, "seqfrac must be in (0, 1]\n");    exit(1);  }  /* check the alphabet and set up default mappings and priors */  if (!strcmp(alph, "DNA")) {			/* DNA */    alphabet = DNA0;    if (!mapname) mapname = "uni";		/* uniform prior mapping */    if (!prior) prior = "dirichlet";		/* simple dirichlet prior */  } else {					/* PROTEIN */    alphabet = PROTEIN0;    if (!mapname) mapname = "pam";		/* PAM mapping */    if (!prior) {      switch (mtype) {	case Oops:	  prior = "dmix";	          break;	case Zoops:	case Tcm:	  prior = "megap";	          break;	default:	  prior = "dirichlet"; break;      }    }    if (invcomp) {      fprintf(stderr, "You cannot use -revcomp with -protein.\n");      exit(1);    }  } /* set alphabet and priors defaults */  /* find out type of prior */  if (!strcmp(prior, "dirichlet")) {    ptype = Dirichlet;    if (beta < 0) beta = 0.01;			/* default b = 0.01 */  } else if (!strcmp(prior, "dmix")) {     ptype = Dmix;    if (beta < 0) beta = 0;			/* default b = 0 for dmix */  } else if (!strcmp(prior, "megadmix") || !strcmp(prior, "mega")) {     ptype = Mega;				/* use mega prior heuristic */  } else if (!strcmp(prior, "megap")) {     ptype = MegaP;				/* discretization uses b=0 */  } else if (!strcmp(prior, "addone")) {    ptype = Addone;  } else {    ptype = Dirichlet;				/* prevent warning */    fprintf(stderr, "Unknown type of prior: %s!\n", prior);    exit(1);  }  /* convert the alphabet to upper case */  for (i=0; alphabet && (cc = alphabet[i]); i++)    if (islower(cc)) alphabet[i] = toupper(cc);  /* setup hashing function for encoding strings as integers */  alength = setup_hash_alph(alphabet);  /* read the samples and set up globals */  dataset = read_seq_file(datafile, alphabet, invcomp, seqfrac);  if (!dataset) exit(1);  /* set dataset alphabet flag */  dataset->dna = !strcmp(alph, "DNA");   /* set the dataset objfun */  dataset->objfun = objfun;  /* initialize the background model */  init_meme_background(bfile, invcomp, dataset);  /* reset the current alphabet since it may be clobbered by the above */  (void) setup_hash_alph(alphabet);  /* prevent too long jobs */  if (dataset->total_res > maxsize) {    fprintf(stderr, "Dataset too large (> %d).  Rerun with larger -maxsize.\n",      maxsize);    exit(1);  }  /* Calculate optimal split points for this dataset; no-op if non-parallel */  balance_loop(dataset->samples, dataset->n_samples);  /* read in known motifs file */  if (mfile) {    FILE  *fdata = fopen(datafile, "r");    dataset->nkmotifs = read_motifs(fdata, mfile, dataset->motifs, FALSE, NULL);    nmotifs = dataset->nkmotifs;    prob = 0;				/* don't sample starts */    dataset->pal = 0;			/* no palindrome testing */  } else {    dataset->nkmotifs = 0;  }  /* create the priors */  if (ptype == Dmix || ptype == Mega || ptype == MegaP) {    /* make the name of the prior library */    if (!plib_name) {       char *tmp1, *tmp2;      if (!strcmp(alphabet, PROTEIN0)) {        plib_name = PROTEIN_PLIB;	/* default mixture prior for proteins */      } else {        plib_name = DNA_PLIB;		/* default mixture prior for DNA */      }      /* prepend meme_directory to file name */      Strcat(tmp1, meme_directory, "/etc/");      Strcat(tmp2, tmp1, plib_name);      plib_name = tmp2;    }  }  if ((ptype == Mega || ptype == MegaP) && beta == -1) {    /* tlb 5-9-97; wgt_total_res */    /*beta = 10.0 * dataset->wgt_total_res;*/	/* size of mega prior */    beta = 5.0 * dataset->wgt_total_res;	/* size of mega prior */  }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?