mast.c
来自「EM算法的改进」· C语言 代码 · 共 1,794 行 · 第 1/5 页
C
1,794 行
int smotifs; /* number of motifs that got scored */ /* calculate the product of the motif score p-values */ k_scores = 1.0; for (i=smotifs=0; i<nmotifs; i++) { int ws = los[i]->ws; /* width of motif in sequence */ int x = best_score[i]; /* observed EV (extreme value) */ long n = length - ws + 1; /* number of samples in EV */ double p; /* p-value of EV */ if (best_score[i] == LITTLE) continue; /* motif wasn't scored */ if (stype == Combine) n*=2; /* combining both DNA strands */ EV(pv[i][x], n, p); /* compute sequence p-value of x */ k_scores *= p; /* product of p-values */ smotifs++; /* number of motifs scored */ } /* multiply by p-values of motif spacings if provided (norder > 1) */ k_spacing = 1.0; nspaces = 0; /* number of spaces p-valued */ for (i=1; i<norder; i++) { /* space to previous motif */ if (space[i] >= 0) { /* don't ignore this space */ double p; /* to hold p-value of obs. spacing */ int err; /* error in pos ith and i-1th motifs */ int mi = order[i]; /* current motif */ int mim1 = order[i-1]; /* previous motif */ long npos; /* number of positions for motif */ int ws_avg; /* average width of adjacent motifs */ long mind; /* minimum spacing that fits seq */ long maxd; /* maximum spacing that fits seq */ /* get absolute error: diffence between observed and nominal spacing */ err = abs(space[i] - (best_loc[mi] - best_loc[mim1])); /* get the average motif width and maximum number of placements */ ws_avg = (int) (los[mim1]->ws + los[mi]->ws)/2.0; /* round down */ npos = length - ws_avg + 1; mind = MAX(space[i] - err, ws_avg - length); maxd = MIN(space[i] + err, length - ws_avg); if (mind >= 0) { p = (maxd-mind+1) * (npos - (maxd+mind)/2.0); } else { p = -mind * (npos - (1-mind)/2.0) + (maxd+1) * (npos - maxd/2.0); } p /= npos * npos; if (p > 1.0 || p < 0.0) fprintf(stderr, "\nerror in spacing p-value:%8.8s L=%8ld mind=%8ld E=%4d %-10g\n", sample_name, length, mind, err, p); /* skip if no possible spacing */ if (maxd < mind) { continue; } k_spacing *= p; /* product of p-values */ nspaces++; /* number of spacings used */ } /* don't ignore */ } /* space to previous motif */ /* finish the calculation */ if (sonly) { /* spacings only */ pvalue = qfast(nspaces, k_spacing); } else if (lump && nspaces) { /* lump spacings */ pvalue = qfast(nspaces, k_spacing); /* spacing pvalue */ pvalue = qfast(smotifs+1, k_scores*pvalue); /* spacing and motif p-value */ } else { /* spacings and motif scores */ pvalue = qfast(smotifs+nspaces, k_scores*k_spacing); } return pvalue;} /* calc_p_value *//**********************************************************************//* print_mast_results Print MAST results.*//**********************************************************************/static void print_mast_results( BOOLEAN read_stdin, /* read database on stdin */ char *alphabet, /* alphabet */ double *back, /* letter probability distribution */ char *bfile, /* no background file */ BOOLEAN dna, /* database is DNA */ STYPE stype, /* handling of different strands */ BOOLEAN xlate_dna, /* database is DNA and motifs protein */ BOOLEAN use_seq_comp, /* compensate for seq composition */ FILE* hit_file, /* table of hits */ FILE* diag_file, /* table of motif diagrams */ FILE* note_file, /* table of annotated sequences */ BOOLEAN doc, /* print documentation */ int rank, /* rank of first result to print */ double e_max, /* maximum sequence E-value to print */ double m_thresh, /* maximum motif p-value to print */ double w_thresh, /* maximum motif p-value--weak hits */ BOOLEAN use_seq_p, /* use sequence not position p-values */ int n_hits, /* sequences less than e_max */ LO *los[], /* array of pointers to lo matrices */ int bad_list[], /* list of highly correlated motifs */ int nbad, /* number of bad motifs */ BOOLEAN rem_corr, /* remove correlated motifs */ char *mfname, /* motif file name */ int nmotifs, /* number motifs read */ char *database, /* name of database */ char *dfname, /* name of database to print */ int nseqs, /* number of sequences in database */ double res /* total number of residues in seqs */){ int i, j; char *database_date; /* creation/modif. date of database */ struct stat stbuf; /* buffer for stat call */ char *stars = "********************************************************************************"; char *mtype = (xlate_dna || !dna) ? "peptide" : "nucleotide"; char *dbtype = (!dna) ? "peptide" : "nucleotide"; /* announce the program */ i = strlen(ARCHIVE_DATE) - 9; printf("%s\n", stars); printf("MAST - Motif Alignment and Search Tool\n"); printf("%s\n", stars); printf("\tMAST version %s (Release date: %*.*s)\n\n""\tFor further information on how to interpret these results or to get\n""\ta copy of the MAST software please access http://meme.nbcr.net.\n", VERSION, i, i, ARCHIVE_DATE+7); printf("%s\n", stars); /* print reference citation */ printf("\n\n%s\n", stars); printf("REFERENCE\n"); printf("%s\n", stars); printf("\tIf you use this program in your research, please cite:\n""\n""\tTimothy L. Bailey and Michael Gribskov,\n""\t\"Combining evidence using p-values: application to sequence homology\n""\tsearches\", Bioinformatics, 14(48-54), 1998.\n" ); printf("%s\n", stars); /* print info on the database */ printf("\n\n%s\n", stars); printf("DATABASE AND MOTIFS\n"); printf("%s\n", stars); printf("\tDATABASE %s (%s)\n", dfname ? dfname : database, dbtype); /* print name (and date if running in UNIX) of database */#ifdef UNIX if (!read_stdin) { /* Get date of database if not reading stdin */ stat(database, &stbuf); database_date = ctime(&stbuf.st_mtime); printf("\tLast updated on %s", database_date); }#endif /* print number of sequences in database */ i = stype==Separate ? 2 : 1; /* only count + strand */ printf("\tDatabase contains %d sequences, %.0f residues\n\n", nseqs/i, res/i); /* print handling of DNA strands */ if (stype == Combine) { printf("\tScores for positive and reverse complement strands are combined.\n\n"); } else if (stype == Separate) { printf("\tPositive and reverse complement strands are scored separately.\n\n"); } else if (stype == Norc) { printf("\tReverse complement strands are not scored.\n\n"); } /* print info on the motifs */ if (mfname != NULL) printf("\tMOTIFS %s (%s)\n", mfname, mtype); printf("\tMOTIF WIDTH BEST POSSIBLE MATCH\n"); printf("\t----- ----- -------------------\n"); for (i=0; i<nmotifs; i++) { printf("\t%3d %3d %*.*s\n", los[i]->name, los[i]->w, los[i]->w, los[i]->w, los[i]->best_seq); } if (diagram != NULL) printf("\n\tNominal ordering and spacing of motifs:\n\t %s\n", diagram); /* print motif correlations */ if (nmotifs > 1) { printf("\n\tPAIRWISE MOTIF CORRELATIONS:\n"); printf("\tMOTIF"); for (i=0; i<nmotifs-1; i++) printf(" %5d", los[i]->name); printf("\n"); printf("\t-----"); for (i=0; i<nmotifs-1; i++) printf(" -----"); printf("\n"); for (i=1; i<nmotifs; i++) { /* from motif */ printf("\t %2d ", los[i]->name); for (j=0; j<i; j++) { /* to motif */ printf(" %5.2f", los[i]->corr[j]); } /* to motif */ printf("\n"); } /* from motif */ if (nbad) { printf("\tCorrelations above %4.2f may cause some combined p-values and\n""\tE-values to be underestimates.\n", MAXCORR ); if (rem_corr) { printf("\tRemoved motif"); } else { printf("\tRemoving motif"); } if (nbad > 1) printf("s "); else printf(" "); for (i=0; i<nbad; i++) { if (i==nbad-1) { if (i>0) printf(" and "); } else { if (i>0) printf(", "); } printf("%d", bad_list[i]); } if (rem_corr) { printf( " because they have correlation > %4.2f\n\twith the remaining motifs.\n", MAXCORR); } else { printf(" from the query may be advisable.\n"); } } else { printf("\tNo overly similar pairs (correlation > %4.2f) found.\n", MAXCORR); } } /* nmotifs > 1 */ /* print background model frequencies */ if (!use_seq_comp) { int i, pcol; char *c; printf("\n\tRandom model letter frequencies (from %s):", bfile ? bfile : "non-redundant database"); for (c=alphabet, i=0, pcol=80; *c; c++, i++) { int lpos = xlate_dna ? protbhash(alphabet[i]) : hash(alphabet[i]); pcol += 8; /* start of printed thing */ if (pcol >= 80) {pcol=15; printf("\n\t");} printf("%c %5.3f ", *c, back[lpos]); } printf("\n"); } else { printf("\n\tUsing random model based on each target sequence composition.\n"); } /* end database and motif section */ printf("%s\n", stars); /* print table of hits documentation */ printf("\n\n%s\nSECTION I: HIGH-SCORING SEQUENCES\n%s\n", stars, stars); if (doc) { char *fs1 = xlate_dna && stype==Separate ? "\n\t- The strand and frame of the (best) motif match(es) is shown." : xlate_dna ? "\n\t- The frame of the (best) motif match(es) is shown." : ""; char *fs2 = xlate_dna ? "\n\t Frames 1, 2, and 3 are labeled a, b c, respectively." : ""; printf("\t- Each of the following %d sequences has E-value less than %g.\n", n_hits, e_max); if (rank > 1) { printf("\t- The %d best-matching sequences have been omitted.\n", rank-1); } printf("\t- The E-value of a sequence is the expected number of sequences\n""\t in a random database of the same size that would match the motifs as\n""\t well as the sequence does and is equal to the combined p-value of the\n""\t sequence times the number of sequences in the database.\n""\t- The combined p-value of a sequence measures the strength of the\n""\t match of the sequence to all the motifs and is calculated by\n""\t o finding the score of the single best match of each motif\n""\t to the sequence (best matches may overlap),\n"); printf("\t o calculating the sequence p-value of each score,\n""\t o forming the product of the p-values,\n"); if (norder > 1) { printf("\t o multiplying by the p-value of the observed spacing of\n""\t pairs of adjacent motifs (given the nominal spacing),\n"); } printf("\t o taking the p-value of the product.\n"); printf("\t- The sequence p-value of a score is defined as the\n""\t probability of a random sequence of the same length containing\n""\t some match with as good or better a score.\n"); printf("\t- The score for the match of a position in a sequence to a motif\n""\t is computed by by summing the appropriate entry from each column of\n""\t the position-dependent scoring matrix that represents the motif.\n""\t- Sequences shorter than one or more of the motifs are skipped.%s%s\n""\t- The table is sorted by increasing E-value.\n", fs1, fs2); printf("%s\n\n", stars); } /* doc */ /* print table of hits headings and data */ { int il = xlate_dna || stype==Separate ? MAXID-3 : MAXID+4;/* length of id */ char *st1 = xlate_dna ? "FRAME " : stype==Separate ? "STRAND " : ""; char *st2 = xlate_dna ? "----- " : stype==Separate ? "------ " : ""; char *f = "%-*s %-*s%s %8s %6s\n"; printf(f, MMSN, "SEQUENCE NAME", il, "DESCRIPTION", st1,"E-VALUE ","LENGTH"); printf(f, MMSN, "-------------", il, "-----------", st2,"--------","------"); copy_file(hit_file, stdout); printf("\n%s\n\n", stars); } /* print table of diagrams documentation */ printf("\n\n%s\nSECTION II: MOTIF DIAGRAMS\n%s\n", stars, stars); if (doc) { char *fs0 = stype==Separate ? "the strand and\n\t " : ""; char *fs1 = stype==Combine ? "s" : ""; char *fs2 = xlate_dna ? "f" : ""; char *fs3 = xlate_dna ? "\n\t\t in frame f. Frames 1, 2, and 3 are labeled a, b c." : "."; char *fs4 = stype==Combine ? "\t\t A minus sign indicates that the occurrence is on the\n\t\t reverse complement strand.\n" : ""; char *ptype = use_seq_p ? "SEQUENCE" : "POSITION"; char *set = use_seq_p ? "some random subsequence in a set of n,\n where n is the sequence length minus the motif width plus 1," : "a single random subsequence of the length of the motif"; printf("\t- The ordering and spacing of all non-overlapping motif occurrences\n""\t are shown for each high-scoring sequence listed in Section I.\n""\t- A motif occurrence is defined as a position in the sequence whose\n""\t match to the motif has %s p-value less than %g.\n""\t- The %s p-value of a match is the probability of\n""\t %s\n""\t scoring at least as well as the observed match.\n""\t- For each sequence, all motif occurrences are shown unless there\n""\t are overlaps. In that case, a motif occurrence is shown only if its\n""\t p-value is less than the product of the p-values of the other\n""\t (lower-numbered) motif occurrences that it overlaps.\n""\t- The table also shows %sthe E-value of each sequence.\n""\t- Spacers and motif occurences are indicated by\n""\t o -d- `d' residues separate the end of the preceding motif \n""\t\t occurrence and the start of the following motif occurrence\n", ptype, w_thresh, ptype, set, fs0); printf("\t o [%sn%s] occurrence of motif `n' with p-value less than %g%s\n%s", fs1, fs2, m_thresh, fs3, fs4); if (w_thresh != m_thresh) printf("\t o <%sn%s> occurrence of motif `n' with %g < p-value < %g%s\n%s",
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?