init.c

来自「EM算法的改进」· C语言 代码 · 共 831 行 · 第 1/2 页

C
831
字号
  priors = create_priors(ptype, beta, dataset, plib_name);  /* check that alphabet in mixture prior matches alphabet */  if (ptype == Dmix || ptype == Mega || ptype == MegaP) {    if (strcmp(alphabet,  priors->plib->alphabet)) {      fprintf(stderr, "The alphabet in the mixture prior file (%s) doesn't match the sequence alphabet.\n", plib_name);      fprintf(stderr, "prior alphabet   : %s\n", priors->plib->alphabet);      fprintf(stderr, "sequence alphabet: %s\n", alphabet);      exit(1);    }  }  /* set number of occurrences of sites */  if (nsites != 0) {    if (mtype == Oops) {      fprintf(stderr, "You may not specify -sites with -mod oops\n");       exit(1);    }    min_nsites = max_nsites = nsites;  }  /* set search range for nsites */  if (mtype == Oops) {    min_nsites = max_nsites = dataset->n_samples;   } else if (mtype == Zoops) {    if (min_nsites > dataset->n_samples) {      fprintf(stderr, "Minimum number of sites too large.  Setting to 2.\n");      min_nsites = 2;    }    if (!min_nsites) min_nsites = 2;			/* default */    if (max_nsites > dataset->n_samples) {      fprintf(stderr, "Maximum number of sites exceeded.  Setting to %d.\n",         dataset->n_samples);      max_nsites = dataset->n_samples;    }    if (!max_nsites) max_nsites = dataset->n_samples;	/* default */  } else {						/* TCM model */    if (min_nsites<2) min_nsites = 2;			/* default */    if (!max_nsites) max_nsites = MIN(5*dataset->n_samples, 50);  }  /* check that there are enough possible sites */  if (min_nsites < 2) {    fprintf(stderr, "You must specify a minimum of 2 or more sites.\n");    exit(1);  }  if (max_nsites < 2) {    fprintf(stderr, "It must be possible for at least 2 sites to fit.\n");    exit(1);  }  /* check weight on prior on nsites */  if (wnsites >= 1 || wnsites < 0) {    fprintf(stderr, "<wnsites> must be in range [0..1).\n"); exit(1);  }  /* check that no sequence too short */  if (dataset->min_slength < MIN_W) {    fprintf(stderr,      "All sequences must be at least %d characters long.  Remove ",        MIN_W);    fprintf(stderr, "shorter sequences\nand rerun.\n");    exit(1);  }   /* set up globals */  if (w != 0) {				/* w specified; set min_w and max_w */    max_w = min_w = w;  }  /* oops model: limit max_w to shortest seq */  if (mtype == Oops && max_w > dataset->min_slength) {    max_w = dataset->min_slength;    fprintf(stderr,       "maxw > length of shortest sequence (%ld).", dataset->min_slength);    fprintf(stderr, "  Setting maxw to %d.\n", max_w);  }  /* all models: limit max_w to longest seq */  if (max_w > dataset->max_slength) {    max_w = dataset->max_slength;    fprintf(stderr,       "maxw > length of longest sequence (%ld).", dataset->max_slength);    fprintf(stderr, "  Setting maxw to %d.\n", max_w);  }  if (max_w > MAXSITE) {    fprintf(stderr,       "maxw too large (> %d).  Recompile with larger MAXSITE.\n", MAXSITE);    exit(1);  }  if (max_w < 0) {					/* use default */    max_w = MIN(MAXSITE, dataset->min_slength);		/* maximum W0 */  }  /* check that min_w <= max_w */  if (min_w > max_w) {     fprintf(stderr, "minw > maxw.  Setting minw to %d.\n", max_w);     min_w = max_w;  }   /* check that min_w and max_w are at least ABS_MIN_W */  if (min_w < ABS_MIN_W) {    fprintf(stderr,       "Minimum width must be >= %d.  Respecify larger -w or -minw.\n",       ABS_MIN_W);    exit(1);  } else if (max_w < ABS_MIN_W) {    fprintf(stderr,      "Maximum width must be >= %d.  Respecify larger -w or -maxw.\n",       ABS_MIN_W);    exit(1);  }  /* must use TCM if only one sequence */  if (mtype != Tcm && dataset->n_samples==1) {    fprintf(stderr, "You must specify '-mod anr' since your dataset contains only one sequence.\n"    );    fprintf(stderr,"Alternatively, you might wish to break your sequence into several sequences.\n"    );    exit(1);  }    /* check that using right alphabet if model type requires it */  if (strcmp(alph, "DNA")) {    if (invcomp) {      fprintf(stderr,         "You must use default DNA alphabet if using complementary strand!\n");      exit(1);    }    if (pal) {      fprintf(stderr, "You must use default DNA alphabet if using -pal !\n");      exit(1);    }  }  /* flag search for palindromes */  dataset->pal = pal;  /* get the type of mapping between sequences and thetas */  if (!strcmp(mapname, "uni")) {    map_type = Uni;     if (map_scale == -1) map_scale = .5;		/* default add .5 */  } else if (!strcmp(mapname, "pam")) {    map_type = Pam;     if (map_scale == -1) map_scale = 120;		/* default PAM 120 */  } else {    fprintf(stderr, "Unknown mapping type %s. \n", mapname);    exit(1);  }  /* check that IUPAC alphabet if using PAM mapping */  if (mapname == "pam" && alphabet != PROTEIN0) {    fprintf(stderr,     "Setting sequence to theta mapping type to `Uni' since alphabet not IUPAC.\n");    mapname = "uni";  }  /* set up the sequence to theta mapping matrix for starts */  dataset->map = init_map(map_type, map_scale, alength, dataset->back, FALSE);  dataset->lomap = init_map(map_type, map_scale, alength, dataset->back, TRUE);  /* set up p_point: previously learned components start points */  p_point = (P_POINT *) mymalloc(sizeof(P_POINT));  p_point->c = n_spcons;		/* number of specified starts */  /* the default starting points */  for (i=0; i<MAXG; i++) {    p_point->e_cons0[i] = NULL;    p_point->w[i] = max_w;    p_point->nsites[i] = nsites;  }  /* setup user-specified start points */  for (i=0; i<n_spcons; i++) {    char *e_cons = (char *) mymalloc(MAXSITE);		/* encoded version */    if (i >= MAXG) {      fprintf(stderr, "Too many starting points.  Increase MAXG and recompile.");      exit(1);    }    p_point->e_cons0[i] = e_cons;    /* convert the consensus sequence to upper case and encode as integer */    for (j=0; (cc = spcons[i][j]) != (int)NULL; j++) {      char uc = (islower(cc) ? toupper(cc) : cc);       e_cons[j] = (uc == 'X') ? alength : hash(uc);      if (e_cons[j] > alength) {	fprintf(stderr, "Illegal letter %c in consensus string!\n", cc);	exit(1);      }    }    /* set width to length of consensus unless -w specified */    if (w == 0) p_point->w[i] = j;    /* pad out the consensus sequence with A's */    for ( ; j < MAXSITE; j++) e_cons[j] = 0;  }  /* make sure nmotifs is as large as the number of starting points */  if (nmotifs < n_spcons) {    nmotifs = n_spcons;    fprintf(stderr, "Setting nmotifs to %d\n", nmotifs);  }  /* create the model */  model = create_model(mtype, invcomp, max_w, alength);  best_model = create_model(mtype, invcomp, max_w, alength);  scratch_model = create_model(mtype, invcomp, max_w, alength);  /* create the negative dataset and negative model if negative file given */  if (negfile) {    neg_dataset = read_seq_file(negfile, dataset->alphabet,       invcomp, seqfrac);    if (!strcmp(ntype, "pair")) {      neg_dataset->negtype = Pair;     } else if (!strcmp(ntype, "blend")) {      neg_dataset->negtype = Blend;    } else {      fprintf(stderr, "Unknown ntype %s. \n", ntype);      exit(1);    }    neg_dataset->pal = dataset->pal;    neg_dataset->back = dataset->back;    neg_model = create_model(mtype, invcomp, max_w, alength);  }   /* initialize log and exp lookup tables */  init_log();  init_exp();  /* Initialize the probability tables for the objective function.  */  /* Get for 2 and up because weighting may cause < min_nsites sites */  init_llr_pv_tables(2, max_nsites, alength, dataset->back, dataset->pal);  /* set up scratch models */  model->min_w = min_w;  model->max_w = max_w;  model->min_nsites = min_nsites;  model->max_nsites = max_nsites;  copy_model(model, best_model, alength);  copy_model(model, scratch_model, alength);  /* put meme parameters in dataset */  dataset->priors = priors;  dataset->p_point = p_point;  dataset->wg = wg;   dataset->ws = ws;   dataset->endgaps = endgaps;   dataset->wnsites = wnsites;  dataset->ma_adj = ma_trim;  dataset->distance = distance;  dataset->prob = prob;  dataset->nmotifs = nmotifs;  dataset->maxiter = maxiter;  dataset->evt = evt;  dataset->mod = mod;  dataset->mapname = mapname;  dataset->map_scale = map_scale;  dataset->priorname = prior;  dataset->beta = beta;  dataset->seed = seed;  dataset->seqfrac = seqfrac;  /* save name of prior library */  if (plib_name) {    dataset->plib_name = plib_name + strlen(plib_name);    while (*dataset->plib_name != '/') dataset->plib_name--;     dataset->plib_name++;	/* strip off directory */  } else {    dataset->plib_name = NULL;  }  dataset->datafile = sf ? sf : datafile;	/* name to print */  dataset->negfile = negfile;  dataset->bfile = bfile;  dataset->min_ic = min_ic;  dataset->max_time = max_time;  /* save command line */  dataset->command = NULL;  argv[0] = "meme";  /*for (i=pos=len=0; i<argc-2; i++) {*/	/* don't save last arguments */  /* save all arguments; used to not save last 2; why??? */  for (i=pos=len=0; i<argc; i++) {    len += strlen(argv[i])+1;			/* +1 for space following */    Resize(dataset->command, len+2, char);	/* +1 for null */    strcpy((dataset->command)+pos, argv[i]);    dataset->command[len-1] = ' ';    dataset->command[len] = '\0';    pos = len;  }  dataset->meme_directory = meme_directory;  /* set up return values */  *model_p = model;  *scratch_model_p = scratch_model;  *best_model_p = best_model;  *neg_model_p = neg_model;  *dataset_p = dataset;  *neg_dataset_p = neg_dataset;  /* announce meme */  banner("MEME");  printf("\n\n");} /* init_meme *//***************************************************************************//*        init_meme_background         Read in the background Markov model or use the residue	frequencies adjusted by add-one prior.        Precalculate the log cumulative background probabilities.        The log probability of any substring of any sequence will then        be accessible via:                Log_back(sample[i]->logcumback, j, w)        where j is the start of the length-w substring of sequence i.*//***************************************************************************/static void init_meme_background (  char *bfile, 					/* background model file */  BOOLEAN rc,					/* average reverse comps */  DATASET *dataset 				/* the dataset */){  int i, j;  char *alphabet = dataset->alphabet;		/* alphabet */  SAMPLE **samples = dataset->samples;		/* the sequences */  int n_samples = dataset->n_samples;		/* # of sequences in dataset */  double *back;					/* freq. of tuples */  int order;					/* order of background model */  BOOLEAN add_x = TRUE;				/* add x-tuples to model */  /* read in background Markov model or use dataset frequencies (0-order) */  if (bfile) {					/* read in probs; set X probs */    back = read_markov_model(bfile, NULL, alphabet, add_x, rc, &order);  } else {					/* use dataset frequencies */    int alength = dataset->alength;		/* length of alphabet */    double *res_freq = dataset->res_freq;	/* weighted residue freq */    double wtr = dataset->wgt_total_res;	/* weighted residue count */    double *adj_res_freq = NULL;    Resize(adj_res_freq, alength, double);    /* adjust residue frequencies with add-one prior to avoid 0s */    for (i=0; i<alength; i++)       adj_res_freq[i] = (res_freq[i]*wtr + 1)/(wtr+alength);     back = read_markov_model(NULL, adj_res_freq, alphabet, add_x, rc, &order);    order = 0;    myfree(adj_res_freq);  }  /* precalculate the log cumulative background probabilities for each seq */  dataset->log_total_prob = 0;  for (i=0; i<n_samples; i++) {			/* sequence */    SAMPLE *s = samples[i];			/* sequence */     char *seq = NULL;				/* ascii sequence */    double *lcb = s->logcumback;		/* log cum. back. prob */    /* unhash the sequence to convert ambiguous characters to X */    Resize(seq, s->length+1, char);    for (j=0; j<s->length; j++) seq[j] = unhash(s->res[j]);    seq[s->length] = '\0';    /* compute probabilites */    dataset->log_total_prob += log_cum_back(seq, back, order, lcb);    myfree(seq);  } /* sequence */  /* return values */  dataset->back = back;  dataset->back_order = order;} /* init_meme_background *//**********************************************************************//*	create_priors*//**********************************************************************/static PRIORS *create_priors(  PTYPE ptype,				/* type of prior to use */  double beta, 				/* beta for dirichlet priors;					   < 0 only returns alphabet */  DATASET *dataset,			/* the dataset */  char *plib_name			/* name of prior library */){  int i;  int alength = (dataset ? dataset->alength : 0);  double *back = (dataset ? dataset->back : (double *)NULL);  PRIORS *priors = (PRIORS *) mymalloc(sizeof(PRIORS));  priors->ptype = ptype;  /* set up the prior counts */  switch (ptype) {    case Addone:				/* add one prior */      for (i=0; i<alength; i++) priors->prior_count[i] = 1.0;      break;    case Dirichlet:				/* simple dirichlet prior */      for (i=0; i<alength; i++) priors->prior_count[i] = beta * back[i];      break;    case Dmix:					/* mixture of dirichlet's */    case Mega:					/* megaprior heuristic */    case MegaP:					/* mod. megaprior heuristic */    {      priors->plib = read_PriorLib(plib_name, beta);      /* get b=0 prior for modified mega prior heuristic */      if (ptype == MegaP || ptype == Mega) {	/* used adj freq with Mega */        double b = 0; 	priors->plib0 = read_PriorLib(plib_name, b);      }      break;    }  }  return priors;} /* create_priors */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?