mast.c

来自「EM算法的改进」· C语言 代码 · 共 1,794 行 · 第 1/5 页

C
1,794
字号
  int smotifs;			/* number of motifs that got scored */  /* calculate the product of the motif score p-values */  k_scores = 1.0;  for (i=smotifs=0; i<nmotifs; i++) {    int ws = los[i]->ws;	/* width of motif in sequence */    int x = best_score[i];	/* observed EV (extreme value) */    long n = length - ws + 1;	/* number of samples in EV */    double p;			/* p-value of EV */    if (best_score[i] == LITTLE) continue;	/* motif wasn't scored */    if (stype == Combine) n*=2;	/* combining both DNA strands */    EV(pv[i][x], n, p);		/* compute sequence p-value of x */    k_scores *= p;		/* product of p-values */    smotifs++;			/* number of motifs scored */  }  /* multiply by p-values of motif spacings if provided (norder > 1) */  k_spacing = 1.0;  nspaces = 0;			/* number of spaces p-valued */  for (i=1; i<norder; i++) {	/* space to previous motif */    if (space[i] >= 0) {	/* don't ignore this space */      double p;			/* to hold p-value of obs. spacing */      int err;			/* error in pos ith and i-1th motifs */      int mi = order[i];	/* current motif */      int mim1 = order[i-1];	/* previous motif */       long npos;		/* number of positions for motif */      int ws_avg;		/* average width of adjacent motifs */      long mind;		/* minimum spacing that fits seq */      long maxd;		/* maximum spacing that fits seq */      /* get absolute error: diffence between observed and nominal spacing */      err = abs(space[i] - (best_loc[mi] - best_loc[mim1]));      /* get the average motif width and maximum number of placements */      ws_avg = (int) (los[mim1]->ws + los[mi]->ws)/2.0;	/* round down */      npos = length - ws_avg + 1;      mind = MAX(space[i] - err, ws_avg - length);      maxd = MIN(space[i] + err, length - ws_avg);      if (mind >= 0) {	p = (maxd-mind+1) * (npos - (maxd+mind)/2.0);      } else {	p = -mind * (npos - (1-mind)/2.0) + (maxd+1) * (npos - maxd/2.0);      }      p /= npos * npos;      if (p > 1.0 || p < 0.0) fprintf(stderr, 	"\nerror in spacing p-value:%8.8s L=%8ld mind=%8ld E=%4d %-10g\n", 	 sample_name, length, mind, err, p);      /* skip if no possible spacing */      if (maxd < mind) {	continue;      }      k_spacing *= p;		/* product of p-values */      nspaces++;		/* number of spacings used */    } /* don't ignore */  } /* space to previous motif */  /* finish the calculation */  if (sonly) {					/* spacings only */    pvalue = qfast(nspaces, k_spacing);  } else if (lump && nspaces) {			/* lump spacings */    pvalue = qfast(nspaces, k_spacing);		/* spacing pvalue */    pvalue = qfast(smotifs+1, k_scores*pvalue); /* spacing and motif p-value */  } else {					/* spacings and motif scores */    pvalue = qfast(smotifs+nspaces, k_scores*k_spacing);  }  return pvalue;} /* calc_p_value *//**********************************************************************//*	print_mast_results	Print MAST results.*//**********************************************************************/static void print_mast_results(  BOOLEAN read_stdin,			/* read database on stdin */  char *alphabet,			/* alphabet */  double *back,				/* letter probability distribution */  char *bfile,				/* no background file */  BOOLEAN dna,				/* database is DNA */  STYPE stype,				/* handling of different strands */  BOOLEAN xlate_dna,			/* database is DNA and motifs protein */  BOOLEAN use_seq_comp,			/* compensate for seq composition */  FILE* hit_file,			/* table of hits */  FILE* diag_file,			/* table of motif diagrams */  FILE* note_file,			/* table of annotated sequences */  BOOLEAN doc,				/* print documentation */  int rank,				/* rank of first result to print */  double e_max, 			/* maximum sequence E-value to print */  double m_thresh,			/* maximum motif p-value to print */  double w_thresh,	 		/* maximum motif p-value--weak hits */  BOOLEAN use_seq_p,			/* use sequence not position p-values */  int n_hits,				/* sequences less than e_max */  LO *los[],				/* array of pointers to lo matrices */  int bad_list[], 			/* list of highly correlated motifs */  int nbad,				/* number of bad motifs */  BOOLEAN rem_corr,			/* remove correlated motifs */  char *mfname, 			/* motif file name */  int nmotifs,				/* number motifs read */   char *database,			/* name of database */  char *dfname,				/* name of database to print */  int nseqs, 				/* number of sequences in database */  double res 				/* total number of residues in seqs */){  int i, j;  char *database_date;			/* creation/modif. date of database */  struct stat stbuf;			/* buffer for stat call */  char *stars = "********************************************************************************";  char *mtype = (xlate_dna || !dna) ? "peptide" : "nucleotide";  char *dbtype = (!dna) ? "peptide" : "nucleotide";  /*     announce the program   */  i = strlen(ARCHIVE_DATE) - 9;  printf("%s\n", stars);  printf("MAST - Motif Alignment and Search Tool\n");  printf("%s\n", stars);  printf("\tMAST version %s (Release date: %*.*s)\n\n""\tFor further information on how to interpret these results or to get\n""\ta copy of the MAST software please access http://meme.nbcr.net.\n",    VERSION, i, i, ARCHIVE_DATE+7);  printf("%s\n", stars);  /*     print reference citation   */  printf("\n\n%s\n", stars);  printf("REFERENCE\n");  printf("%s\n", stars);  printf("\tIf you use this program in your research, please cite:\n""\n""\tTimothy L. Bailey and Michael Gribskov,\n""\t\"Combining evidence using p-values: application to sequence homology\n""\tsearches\", Bioinformatics, 14(48-54), 1998.\n"  );  printf("%s\n", stars);  /*     print info on the database   */  printf("\n\n%s\n", stars);  printf("DATABASE AND MOTIFS\n");  printf("%s\n", stars);  printf("\tDATABASE %s (%s)\n", dfname ? dfname : database, dbtype);  /* print name (and date if running in UNIX) of database */#ifdef UNIX  if (!read_stdin) {		/* Get date of database if not reading stdin */    stat(database, &stbuf);    database_date = ctime(&stbuf.st_mtime);    printf("\tLast updated on %s", database_date);  }#endif  /* print number of sequences in database */  i = stype==Separate ? 2 : 1;			/* only count + strand */  printf("\tDatabase contains %d sequences, %.0f residues\n\n", nseqs/i, res/i);  /* print handling of DNA strands */  if (stype == Combine) {    printf("\tScores for positive and reverse complement strands are combined.\n\n");  } else if (stype == Separate) {    printf("\tPositive and reverse complement strands are scored separately.\n\n");  } else if (stype == Norc) {    printf("\tReverse complement strands are not scored.\n\n");  }  /*     print info on the motifs   */  if (mfname != NULL) printf("\tMOTIFS %s (%s)\n", mfname, mtype);  printf("\tMOTIF WIDTH BEST POSSIBLE MATCH\n");  printf("\t----- ----- -------------------\n");  for (i=0; i<nmotifs; i++) {    printf("\t%3d   %3d   %*.*s\n", los[i]->name, los[i]->w,       los[i]->w, los[i]->w, los[i]->best_seq);  }  if (diagram != NULL)     printf("\n\tNominal ordering and spacing of motifs:\n\t %s\n", diagram);  /*    print motif correlations   */  if (nmotifs > 1) {    printf("\n\tPAIRWISE MOTIF CORRELATIONS:\n");    printf("\tMOTIF");    for (i=0; i<nmotifs-1; i++) printf(" %5d", los[i]->name);    printf("\n");    printf("\t-----");    for (i=0; i<nmotifs-1; i++) printf(" -----");    printf("\n");    for (i=1; i<nmotifs; i++) {                 /* from motif */      printf("\t  %2d ", los[i]->name);      for (j=0; j<i; j++) {                     /* to motif */        printf(" %5.2f", los[i]->corr[j]);      } /* to motif */      printf("\n");    } /* from motif */    if (nbad) {      printf("\tCorrelations above %4.2f may cause some combined p-values and\n""\tE-values to be underestimates.\n", MAXCORR      );      if (rem_corr) {        printf("\tRemoved motif");      } else {        printf("\tRemoving motif");      }      if (nbad > 1) printf("s "); else printf(" ");      for (i=0; i<nbad; i++) {        if (i==nbad-1) {          if (i>0) printf(" and ");        } else {          if (i>0) printf(", ");        }        printf("%d", bad_list[i]);      }      if (rem_corr) {        printf(          " because they have correlation > %4.2f\n\twith the remaining motifs.\n",           MAXCORR);      } else {        printf(" from the query may be advisable.\n");      }    } else {      printf("\tNo overly similar pairs (correlation > %4.2f) found.\n",        MAXCORR);    }  } /* nmotifs > 1 */  /* print background model frequencies */  if (!use_seq_comp) {    int i, pcol;    char *c;    printf("\n\tRandom model letter frequencies (from %s):",      bfile ? bfile : "non-redundant database");    for (c=alphabet, i=0, pcol=80; *c; c++, i++) {      int lpos = xlate_dna ? protbhash(alphabet[i]) : hash(alphabet[i]);      pcol += 8;          			/* start of printed thing */      if (pcol >= 80) {pcol=15; printf("\n\t");}      printf("%c %5.3f ", *c, back[lpos]);    }    printf("\n");  } else {    printf("\n\tUsing random model based on each target sequence composition.\n");  }   /* end database and motif section */  printf("%s\n", stars);  /*     print table of hits documentation  */  printf("\n\n%s\nSECTION I: HIGH-SCORING SEQUENCES\n%s\n", stars, stars);  if (doc) {    char *fs1 = xlate_dna && stype==Separate ?       "\n\t- The strand and frame of the (best) motif match(es) is shown." :       xlate_dna ? "\n\t- The frame of the (best) motif match(es) is shown." : "";    char *fs2 = xlate_dna ?       "\n\t  Frames 1, 2, and 3 are labeled a, b c, respectively." : "";    printf("\t- Each of the following %d sequences has E-value less than %g.\n",      n_hits, e_max);    if (rank > 1) {      printf("\t- The %d best-matching sequences have been omitted.\n", rank-1);    }    printf("\t- The E-value of a sequence is the expected number of sequences\n""\t  in a random database of the same size that would match the motifs as\n""\t  well as the sequence does and is equal to the combined p-value of the\n""\t  sequence times the number of sequences in the database.\n""\t- The combined p-value of a sequence measures the strength of the\n""\t  match of the sequence to all the motifs and is calculated by\n""\t    o finding the score of the single best match of each motif\n""\t      to the sequence (best matches may overlap),\n");    printf("\t    o calculating the sequence p-value of each score,\n""\t    o forming the product of the p-values,\n");    if (norder > 1) {      printf("\t    o multiplying by the p-value of the observed spacing of\n""\t      pairs of adjacent motifs (given the nominal spacing),\n");    }    printf("\t    o taking the p-value of the product.\n");    printf("\t- The sequence p-value of a score is defined as the\n""\t  probability of a random sequence of the same length containing\n""\t  some match with as good or better a score.\n");    printf("\t- The score for the match of a position in a sequence to a motif\n""\t  is computed by by summing the appropriate entry from each column of\n""\t  the position-dependent scoring matrix that represents the motif.\n""\t- Sequences shorter than one or more of the motifs are skipped.%s%s\n""\t- The table is sorted by increasing E-value.\n", fs1, fs2);    printf("%s\n\n", stars);  } /* doc */  /*     print table of hits headings and data  */  {    int il = xlate_dna || stype==Separate ? MAXID-3 : MAXID+4;/* length of id */    char *st1 = xlate_dna ? "FRAME  " : stype==Separate ? "STRAND " : "";    char *st2 = xlate_dna ? "-----  " : stype==Separate ? "------ " : "";    char *f = "%-*s %-*s%s %8s %6s\n";    printf(f, MMSN, "SEQUENCE NAME", il, "DESCRIPTION", st1,"E-VALUE ","LENGTH");    printf(f, MMSN, "-------------", il, "-----------", st2,"--------","------");    copy_file(hit_file, stdout);    printf("\n%s\n\n", stars);  }  /*     print table of diagrams documentation  */  printf("\n\n%s\nSECTION II: MOTIF DIAGRAMS\n%s\n", stars, stars);  if (doc) {    char *fs0 = stype==Separate ? "the strand and\n\t  " : "";    char *fs1 = stype==Combine ? "s" : "";    char *fs2 = xlate_dna ? "f" : "";			    char *fs3 = xlate_dna ?       "\n\t\t    in frame f.  Frames 1, 2, and 3 are labeled a, b c." : ".";    char *fs4 = stype==Combine ? "\t\t    A minus sign indicates that the occurrence is on the\n\t\t    reverse complement strand.\n" : "";    char *ptype = use_seq_p ? "SEQUENCE" : "POSITION";    char *set = use_seq_p ?      "some random subsequence in a set of n,\n where n is the sequence length minus the motif width plus 1," :      "a single random subsequence of the length of the motif";    printf("\t- The ordering and spacing of all non-overlapping motif occurrences\n""\t  are shown for each high-scoring sequence listed in Section I.\n""\t- A motif occurrence is defined as a position in the sequence whose\n""\t  match to the motif has %s p-value less than %g.\n""\t- The %s p-value of a match is the probability of\n""\t  %s\n""\t  scoring at least as well as the observed match.\n""\t- For each sequence, all motif occurrences are shown unless there\n""\t  are overlaps.  In that case, a motif occurrence is shown only if its\n""\t  p-value is less than the product of the p-values of the other\n""\t  (lower-numbered) motif occurrences that it overlaps.\n""\t- The table also shows %sthe E-value of each sequence.\n""\t- Spacers and motif occurences are indicated by\n""\t   o -d-    `d' residues separate the end of the preceding motif \n""\t\t    occurrence and the start of the following motif occurrence\n",      ptype, w_thresh, ptype, set, fs0);    printf("\t   o [%sn%s]  occurrence of motif `n' with p-value less than %g%s\n%s",      fs1, fs2, m_thresh, fs3, fs4);    if (w_thresh != m_thresh) printf("\t   o <%sn%s>  occurrence of motif `n' with %g < p-value < %g%s\n%s",

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?