mast.c

来自「EM算法的改进」· C语言 代码 · 共 1,794 行 · 第 1/5 页

C
1,794
字号
  char *blast_alphabet;			/* corresponding BLAST alphabet */  int *perm[MAXASCII];			/* permutation/substitution matrix */  /* things for normalization */  SORT_SEQ *seqs;		/* sortable array of seq/score records */  double **pv = NULL;		/* p-value tables for each motif */  double *back;			/* background letter probability distribution */#ifdef MALLOC_DEBUG  int malloc_debug(int);  malloc_debug(2);#endif  (void) myclock();             	/* record CPU time */  /* save script arguments, set command to blank and shift arguments */  arguments = argv[1]; argv[1] = ""; argv++; argc--;  /* get the command line arguments */  i = 1;  DO_STANDARD_COMMAND_LINE(1,    NON_SWITCH(1, \r,      switch (i++) {        case 1: break;	/* dummy */        default: COMMAND_LINE_ERROR;      }    );    DATA_OPTN(2, logodds, , , nlogodds = _OPTION_);    DATA_OPTN(2, database, , , database = _OPTION_);    DATA_OPTN(2, alphabet, , , alphabet = _OPTION_);    FLAG_OPTN(2, stdin, , read_stdin = TRUE);    FLAG_OPTN(1, sep,      \t\tscore reverse complement DNA strand as a separate \n\t\t\tsequence,       stype = Separate);    FLAG_OPTN(1, norc, \t\tdo not score reverse complement DNA strand,       stype = Norc);    FLAG_OPTN(1, dna, \t\ttranslate DNA sequences to protein, xlate_dna = TRUE);    FLAG_OPTN(1, comp, \t\tadjust p-values and E-values for sequence composition,       use_seq_comp = TRUE);    DATA_OPTN(1, rank, <rank>,      \tprint results starting with <rank> best (default: 1),      rank = atoi(_OPTION_));    DATA_OPTN(1, smax, <smax>,      \tprint results for no more than <smax> sequences\n\t\t\t(default: all),      smax = atoi(_OPTION_));    DATA_OPTN(1, ev, <ev>,      \tprint results for sequences with E-value < <ev>\n\t\t\t(default: 10),      e_max = atof(_OPTION_));     DATA_OPTN(1, mt, <mt>,       \tshow motif matches with p-value < mt (default: 0.0001),       m_thresh = atof(_OPTION_));     FLAG_OPTN(1, w,       \t\tshow weak matches (mt<p-value<mt*10) in angle brackets, weak = TRUE);    DATA_OPTN(1, bfile, <bfile>, \tread background frequencies from <bfile>,      bfile = _OPTION_);    FLAG_OPTN(1, seqp, \t\tuse SEQUENCE p-values for motif thresholds\n\t\t\t(default: use POSITION p-values), use_seq_p = TRUE);    DATA_OPTN(1, mf, <mf>, \tprint <mf> as motif file name, mfname = _OPTION_);    DATA_OPTN(1, df, <df>, \tprint <df> as database name, dfname = _OPTION_);    DATA_OPTN(1, minseqs, <minseqs>, \tlower bound on number of sequences in db,      min_seqs = atoi(_OPTION_));    DATA_OPTN(1, mev, <mev>,+\tuse only motifs with E-values less than <mev>,       min_ev = atof(_OPTION_));    DATA_OPTN(1, m, <m>,+\tuse only motif(s) number <m> (overrides -mev),       mlist[umotifs++] = atoi(_OPTION_));    DATA_OPTN(1, diag, <diag>, \tnominal order and spacing of motifs,      diagram = _OPTION_);    FLAG_OPTN(1, best, \t\tinclude only the best motif in diagrams,      best_motifs = TRUE);    FLAG_OPTN(1, remcorr, \tremove highly correlated motifs from query,      rem_corr = TRUE);    FLAG_OPTN(1, brief,       \tbrief output--do not print documentation, doc = FALSE);    FLAG_OPTN(1, b, \t\tprint only sections I and II, print_seq = FALSE);    FLAG_OPTN(1, nostatus, \tdo not print progress report, status = FALSE);    FLAG_OPTN(1, hit_list, \tprint only a list of non-overlapping hits; implies -text, hit_list = TRUE);    /* DEBUG OPTIONS */    FLAG_OPTN(EXP, deb, \t\tprint p-values and exit; implies -text, debug = TRUE);    DATA_OPTN(EXP, k, <k>,       \tfile of known motifs to mark as "+" or for ROC if \n\t\t\t-r given,       motif_file = _OPTION_);    DATA_OPTN(EXP, r, <r>, \tcompute roc_<r> using <k>.tag file,       roc_num = atoi(_OPTION_));    FLAG_OPTN(EXP, shuffle, \tshuffle columns of motifs, shuffle = TRUE);    FLAG_OPTN(EXP, sonly, \tuse only spacing p-values in product, sonly = TRUE);    FLAG_OPTN(EXP, lump, \t\tcombine spacings into one p-value, lump = TRUE);  );  /* check input */  if (rank < 1) {    fprintf(stderr, "<rank> must be at least 1.\n");    exit(1);   }  /* set e_max to BIG if doing ROC or printing all sequence p-values      or printing hit-list */  if (roc_num > 0 || debug || hit_list) e_max = BIG;  /* set maximum p-value for weak hits */  if (weak) {    w_thresh = m_thresh * 10.0;			/* 10 times less likely */  } else {    w_thresh = m_thresh;			/* no weak threshold */  }  /* open the datafile or standard input */  if (!read_stdin) {    if (!(fdata = fopen(database, "r"))) {      fprintf(stderr, "Cannot open file `%s'.\n", database);      exit(1);     }  } else {					/* reading stdin */    fdata = stdin;    fsave = tmpfile();				/* save good sequences here */  } /* open database or standard input */  /* initialize the background frequencies and alphabets */  back = init_mast_background(bfile, alphabet, stype, xlate_dna,     &blast_alphabet, perm);  /* get the motifs and parse the ordering and spacing diagram */  nmotifs = get_motifs(xlate_dna, nlogodds, alphabet, blast_alphabet, perm,    shuffle, min_ev, umotifs, mlist, motifs, los, back, RANGE);  /* get list of motifs that should be removed */  nbad = get_bad_motifs(MAXCORR, nmotifs, los, bad_list);  /* remove highly correlated motifs from query */  if (rem_corr && nbad) {    for (i=j=k=0; i<nmotifs; i++) {      if (los[i]->name < bad_list[k] || k >= nbad) {        los[j++] = los[i];      } else {        k++;      }    }    nmotifs -= nbad;  }  /*    Set current sequence alphabet to DNA if translating.  */  if (xlate_dna) setalph(0);  /* determine the type of database being searched */  dna = (xlate_dna || los[0]->dna);  /* get the type of handling of DNA strands */  if (!dna) {					/* protein database */    if (stype == Separate || stype == Norc) {      fprintf(stderr,         "You may not specify -sep or -norc with a protein database.\n");      exit(1);    }    stype = Protein;  } /* database type */  /* read in (optional) known motif occurrences */  if (motif_file) kmotifs = read_motifs(fdata, motif_file, motif, 0, &dummy);  /*    calculate p-values of all integer score values in range [0...w*RANGE]   */  Resize(pv, nmotifs, double *);  for (i=0, error=FALSE; i<nmotifs; i++) {    /*pv[i] = calc_cdf(los[i], RANGE, back);*/    pv[i] = calc_pssm_cdf(los[i]->w, los[i]->alen, RANGE, los[i]->logodds, back);    if (!pv[i]) {      fprintf(stderr, "There is something wrong with motif %d\n", los[i]->name);      error = TRUE;    }  }  if (error) exit(1);			/* error with motifs */  /*      get the scores and p-values for all sequences in the database and     sort them by p-value in ascending order   */  seqs = get_scores(dna, stype, xlate_dna, back, fdata, fsave, los, nmotifs,     RANGE, pv, sonly, lump, use_seq_comp, e_max,    kmotifs, motif, status, min_seqs, &nseqs, &residues, &n_hits  );  /* print p-values and exit if debug on */  if (debug) {    for (i=1; i<nmotifs; i++) {                 /* from motif */      int j;      printf("CORR  %2d ", los[i]->name);      for (j=0; j<i; j++) {                     /* to motif */        printf(" %5.2f", los[i]->corr[j]);      } /* to motif */      printf("\n");    } /* from motif */    for (i=0; i<n_hits; i++) {      printf("NORM %g\n", seqs[i].Pvalue);    } /* hit */    exit(0);  }  /* calculate the ROC of the motifs and exit if doing ROC */  if (roc_num > 0) {    printf("\n\nROC %d = %f\n", roc_num,       calc_roc(seqs, n_hits, motif[0].pos, roc_num, los[0]->thresh));    exit(0);  }  /* create the three output tables in temporary files */  if (!hit_list) {			/* files if -hit_list; prints directly */    hit_file = tmpfile();    diag_file = tmpfile();    note_file = print_seq ? tmpfile() : NULL;  }  if (fsave) fdata = fsave;			/* saved sequences */  n_hits = make_mast_tables(dna, stype, xlate_dna, use_seq_comp, best_motifs,     kmotifs, motif, motif_file, fdata, nseqs, seqs, n_hits, hit_file,     diag_file, note_file, rank, smax, e_max, m_thresh, w_thresh, use_seq_p,     hit_list, los, nmotifs  );  /* print the results */  if (!hit_list) {    print_mast_results(      read_stdin, alphabet, back, bfile, dna, stype, xlate_dna, use_seq_comp,      hit_file, diag_file, note_file, doc,       rank, e_max, m_thresh, w_thresh, use_seq_p,      n_hits, los, bad_list, nbad, rem_corr, mfname, nmotifs, database, dfname,      nseqs, residues    );  }  /* print the command line to the script which was saved in argv[1] */  if (hit_list) {    printf("# %s\n", arguments);  } else {    printf("%s\n", arguments);  }  /* finish status report */  if (status) fprintf(stderr, "\n");  return 0;} /* main *//**********************************************************************//*        calc_roc         Calculate the ROC (receiver operating characteristic) of        a sorted list of score values with known positives flagged.*//**********************************************************************/static double calc_roc(  SORT_SEQ *seqs,			/* array of score values */  int nseqs,                            /* number of sequences */  int pos,				/* number of known positives */  int roc_num,				/* number of fp's to truncate ROC at */  double thresh				/* threshold for error calculation */){  int i;  double roc = 0;			/* receiver operating characteristic */  double tpp = 0; 			/* true positive proportions */  double fpp = 0;			/* false positive proportions */  double tp = 0;			/* true positives so far */  double fp = 0;			/* false positives so far */  double newtpp, newfpp;  int neg = nseqs - pos;		/* number of known negatives */  printf("thresh = %f\n", thresh);  /* loop over sequences, best score first */  for (i=0; i<nseqs; i++) {		/* sequence */    /* update tp and fp since score has changed (or last score reached) */    /* known positive? */    if (seqs[i].known_pos) {      tp++;    } else {      fp++;    }    /* trapezoidal rule : (y2 - y1)/2 dx */    newtpp = MIN(1.0, tp / pos);	/* seqs marked "?" in prosite are					   marked but not counted in pos					   so newtpp could go over 1.0 */    newfpp = fp / neg;    roc += .5 * (newtpp + tpp) * (newfpp - fpp);     tpp = newtpp;    fpp = newfpp;     if (seqs[i].known_pos) { printf("+ "); } else { printf("- "); }    printf("%-*.*s %8.1e\n", MMSN, MMSN, seqs[i].id, seqs[i].Pvalue);    /* reached limiting number of false positives? */    if (fp >= roc_num) break;  } /* sequence */  /* normalize by fpp to get ROC <roc_num> */  if (fpp == 0) {    roc = 1.0;  } else {    roc /= fpp;  }  return roc;} /* calc_roc *//**********************************************************************//*        p_compare         Compare two p-values in ascending order.  Return <0 >0        if the second is >, < the first.  If they are equal,	resolves ties by returning <0 if the second has smaller fp.*//**********************************************************************/static int p_compare(  const void *v1,  const void *v2){  const SORT_SEQ * s1 = (const SORT_SEQ *) v1;  const SORT_SEQ * s2 = (const SORT_SEQ *) v2;  double diff = s2->Pvalue - s1->Pvalue;  if (diff == 0) diff = (double) (s1->fp - s2->fp);  return ((diff > 0) ? -1 : ( (diff < 0) ? 1 : 0) );} /* p_compare *//**********************************************************************//*	calc_p_value   	Calculate the p-value of the product of the individual motif 	p-values < observed	Returns the combined p-value.*//**********************************************************************/static double calc_p_value(  char *sample_name,	/* name of sample */  long length,           /* length of the sequence */  STYPE stype,		/* handling of DNA strands */  int nmotifs,          /* number of motifs */  LO *los[],            /* array of pointers to log-odds matrices */  double *best_score,   /* array of best score for each motif */  int *best_loc,        /* position of best match for each motif */  double range,		/* number of different score values */  double **pv,		/* p-value tables for each motif */  BOOLEAN sonly,	/* calculate p-value of observed spacings only */  BOOLEAN lump,		/* combine spacings into one p-value */  int norder,		/* number of motifs in diag */  BOOLEAN debug 	/* turn on debugging features */){  int i;  double pvalue;  int nspaces;			/* number of motif pairs spacings given for */  double k_scores;		/* product of motif score p-values */  double k_spacing;		/* product of spacing p-values */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?