mast.c
来自「EM算法的改进」· C语言 代码 · 共 1,794 行 · 第 1/5 页
C
1,794 行
fs1, fs2, m_thresh, w_thresh, fs3, fs4); printf("%s\n\n", stars); } /* doc */ /* print table of diagrams headings and data */ { char *st1 = stype==Separate ? "STRAND" : ""; /* strand heading */ char *st2 = stype==Separate ? "------" : ""; /* strand underline */ int nl = (*st1=='\0') ? MMSN : MMSN-4; /* length of name */ char *f = "%-*s%s %8s %s\n"; /* format */ printf(f, nl, "SEQUENCE NAME", st1, "E-VALUE ", "MOTIF DIAGRAM"); printf(f, nl, "-------------", st2, "--------", "-------------"); copy_file(diag_file, stdout); printf("\n%s\n\n", stars); } /* print table of annotated sequences documentation and data */ if (note_file) { printf("\n\n%s\n", stars); printf("SECTION III: ANNOTATED SEQUENCES\n"); printf("%s\n", stars); if (doc) { char *fs1 = xlate_dna ? "/frame" : ""; /* frame string 3 */ char *fs2 = stype==Combine ? " (a minus sign indicates that\n\t the occurrence is on the reverse complement strand)" : ""; char *fs3 = xlate_dna && stype==Combine ? " (or its reverse)" : stype==Combine ? " (or its reverse complement)" : ""; char *fs4 = xlate_dna ? "," : ", and"; /* frame string 4 */ char *fs5 = xlate_dna && stype==Combine ? ", and\n\t o the protein translation of the match (or its reverse).\n" : xlate_dna ? ", and\n\t o the protein translation of the match.\n" : ".\n"; char *ptype = use_seq_p ? "SEQUENCE" : "POSITION"; printf("\t- The positions and p-values of the non-overlapping motif occurrences\n""\t are shown above the actual sequence for each of the high-scoring\n""\t sequences from Section I.\n""\t- A motif occurrence is defined as a position in the sequence whose\n""\t match to the motif has %s p-value less than %g as \n""\t defined in Section II.\n""\t- For each sequence, the first line specifies the name of the sequence.\n""\t- The second (and possibly more) lines give a description of the \n""\t sequence.\n""\t- Following the description line(s) is a line giving the length, \n""\t combined p-value, and E-value of the sequence as defined in Section I.\n""\t- The next line reproduces the motif diagram from Section II.\n""\t- The entire sequence is printed on the following lines.\n""\t- Motif occurrences are indicated directly above their positions in the\n""\t sequence on lines showing\n""\t o the motif number%s of the occurrence%s,\n""\t o the position p-value of the occurrence,\n""\t o the best possible match to the motif%s%s\n""\t o columns whose match to the motif has a positive score (indicated \n""\t by a plus sign)%s", ptype, w_thresh, fs1, fs2, fs3, fs4, fs5); printf("%s\n", stars); } /* doc */ copy_file(note_file, stdout); printf("\n%s\n\n", stars); } /* annotation table */ /* display elapsed time */ fflush(stdout); system("echo ''; echo CPU: `hostname`;"); printf("Time %f secs.\n\n", myclock()/1E6);} /* print_mast_results *//**********************************************************************//* make_mast_tables Make the three MAST tables in temporary files. Returns the number of sequences less than e_max.*//**********************************************************************/static int make_mast_tables( BOOLEAN dna, /* database is DNA */ STYPE stype, /* handling of different strands */ BOOLEAN xlate_dna, /* database is DNA and motifs protein */ BOOLEAN use_seq_comp, /* compensate for seq composition */ BOOLEAN best_motifs, /* diagrams have only best motif */ int kmotifs, /* number of known motifs */ MOTIF motif[], /* data returned by read_motifs */ char *motif_file, /* motif file name */ FILE *fdata, /* the database file */ int nseqs, /* number of sequences scored */ SORT_SEQ *seqs, /* sortable array of seq/score records*/ int n_hits, /* size of seqs array */ FILE* hit_file, /* table of hits */ FILE* diag_file, /* table of motif diagrams */ FILE* note_file, /* table of annotated sequences */ int rank, /* rank of first result to print */ int smax, /* maximum sequences to print*/ double e_max, /* maximum E-value to print */ double m_thresh, /* maximum motif p-value to print */ double w_thresh, /* maximum motif p-value-- weak hits */ BOOLEAN use_seq_p, /* use sequence not position p-values */ BOOLEAN hit_list, /* create hit list instead of diagram */ LO *los[], /* array of pointers to lo matrices */ int nmotifs /* number motifs read */ ){ int i; int seqno; /* seq. number in sorted sequences */ long length; /* length of sample */ char *sample_name; /* name of sample */ char *sequence; /* sequence of sample */ char *id; /* identifier text for sample */ int pseqs = 0; /* number of sequences printed */ if (hit_list) { printf("# All non-overlapping hits in all sequences with E-values <= %g.\n", e_max); printf("# sequence_name motif hit_start hit_end hit_p-value\n"); } for (seqno=0; seqno < n_hits; seqno++) { SORT_SEQ *seq = seqs + seqno; /* next seq */ TILING tiling; /* tiling of sequence */ int strand = seq->strand; /* -1 neg, 0 both, +1 pos */ double **pv = seq->pv; /* pvalue tables */ double pvalue = seq->Pvalue; /* combined p-value of seq */ double evalue = nseqs * pvalue; /* combined E-value of seq */ char name[1000]; /* name with gi|...| removed */ /* done if E-value above threshold; rest have larger p-values */ if (evalue > e_max) break; /* done if maximum number of sequences reached */ if (smax && pseqs >= smax) break; /* skip if rank not reached */ if (seqno+1 < rank) continue; pseqs++; fseek(fdata, seq->fp, 0); /* go to start of sequence */ read_sequence(fdata, &sample_name, &id, &sequence, &length); strncpy(name, sample_name, 999); /* convert to reverse complement if negative strand of DNA */ if (strand == -1) invcomp_dna(sequence, length); /* score, tile and diagram the sequence with each of the motifs */ tiling = score_tile_diagram(sequence, length, los, nmotifs, dna, stype, xlate_dna, best_motifs, FALSE, pv, m_thresh, w_thresh, use_seq_p, hit_list, name); /* Print the hit in the hits section. */ if (hit_file != NULL) { char *kp = !kmotifs ? "" : (seq->known_pos ? "+ " : "- "); /* +/- */ int nl = !kmotifs ? MMSN : MMSN - 2; /* length of name */ char *frame = xlate_dna ? best_frame(nmotifs, length, tiling) : ""; char *st = stype==Separate ? (strand==-1 ? " -" : " +") : ""; int il = (*st=='\0') ? MAXID : MAXID-2; /* length of id */ char *elipsis = strlen(id)>il ? "... " : " "; if (*frame!='\0') il -= 2; fprintf(hit_file, "%s%-*.*s %-*.*s%3s%s%s %8.2g %6ld\n", kp, nl, nl, name, il, il, id, elipsis, st, frame, evalue, length); } /* print hit */ /* Print the motif diagram in the diagrams section. */ if (diag_file != NULL) { char hdr[80]; /* header */ char *st = stype!=Separate ? "" : (strand==-1 ? " -" : " +"); sprintf(hdr, "%-*.*s%s %8.2g ", MMSN, MMSN, name, st, evalue); print_diagram(tiling.diagram, hdr, diag_file); } /* Print the hit list straight to standard output. */ if (hit_list) { /* print hits straight to stdout */ if (tiling.diagram) printf("%s", tiling.diagram); //print_diagram(tiling.diagram, "", stdout); } /* Print annotated sequence in the annotation section. */ if (note_file) print_annotation(dna, stype, xlate_dna, m_thresh, note_file, los, sequence, length, sample_name, id, strand, pvalue, evalue, tiling); /* free space */ myfree(sample_name); myfree(id); myfree(sequence); free_tiling(tiling); if (use_seq_comp) { for (i=0; i<nmotifs; i++) myfree(seq->pv[i]); myfree(seq->pv); } } /* sequence */ return(pseqs);} /* make_mast_tables *//**********************************************************************//* get_scores Calculate the score and p-value for each sequence in the database and sort them by p-value in ascending order. Returns sorted array of sequence-score records.*//**********************************************************************/static SORT_SEQ *get_scores( BOOLEAN dna, /* database is DNA */ STYPE stype, /* handling of DNA strands */ BOOLEAN xlate_dna, /* database is DNA and motifs protein */ double f[], /* null letter probability distribution */ FILE *fdata, /* the database */ FILE *fsave, /* good sequences if database on stdin */ LO *los[], /* array of pointers to log-odds matrices */ int nmotifs, /* number of log-odds matrices in los */ int range, /* set entries in matrices to [0..range] */ double **pv, /* p-value tables for each motif */ BOOLEAN sonly, /* calculate p-value of observed spacings only */ BOOLEAN lump, /* combine spacings into one p-value */ BOOLEAN use_seq_comp, /* adjust E-value for actual sequence composition */ double e_max, /* maximum sequence E-value to print */ int kmotifs, /* number of known motifs*/ MOTIF motif[], /* data returned by read_motifs */ BOOLEAN status, /* show progress */ int min_seqs, /* lower bound on nseqs */ int *nseqs, /* total sequences in database */ double *residues, /* total number of residues in seqs */ int *n_hits /* sequences less than e_max */){ int i; int imotif; long fp = 0; /* pointer in datafile */ char *sample_name; /* name of sample */ char *sequence; /* sequence of sample */ char *id; /* identifier text for sample */ long length; /* length of the sequence */ double *best_score = NULL; /* best score per motif */ int *best_loc = NULL; /* best location per motif */ SORT_SEQ *seqs = NULL; /* sortable array of seq/score records*/ double pvalue; /* expected number of sequences >= x */ SCORE **scores; /* scores for each motif vs seq. pos. */ int strand = (stype==Separate || stype==Norc) ? 1 : 0; /* current */ int alen = los[0]->alen; /* length of motif alphabet */ BOOLEAN saved = FALSE; /* sequence already saved */ *n_hits = *nseqs = *residues = 0; /* actual and saved sequences */ while (1) { /* read next sequence unless doing - strand; break on EOF or error */ if (strand != -1) { /* not - strand */ fp = fsave ? ftell(fsave) : ftell(fdata); /* save seq position */ if (!read_sequence(fdata, &sample_name, &id, &sequence, &length)) break; saved = FALSE; /* new sequence */ } /* create space for best scores and locations per motif unless they exist */ if (!best_score) Resize(best_score, nmotifs, double); if (!best_loc) Resize(best_loc, nmotifs, int); /* update size of database */ if (stype==Separate && strand==1) { /* treat each DNA sequence as two */ (*nseqs)+=2; /* number of sequences in database */ *residues += 2*length; /* number of residues in database */ } else if (stype!=Separate) { /* treat each sequence as one */ (*nseqs)++; /* number of sequences in database */ *residues += length; /* number of residues in database */ } /* convert DNA to negative strand if just did the positive strand */ if (strand == -1) invcomp_dna(sequence, length); /* compute the motif scores */ scores = score_sequence(stype, xlate_dna, sequence, length, nmotifs, los); /* find best scoring position for each motif for the sequence */ for (imotif=0; imotif<nmotifs; imotif++) { /* motif */ long seq_pos; int ws = los[imotif]->ws; /* width of motif in sequence */ best_score[imotif] = LITTLE; best_loc[imotif] = 0; if (ws > length) continue; /* skip if motif too long */ for (seq_pos=0; seq_pos<=length-ws; seq_pos++) { /* position in sequence */ double s = scores[imotif][seq_pos].score; if (s > best_score[imotif]) { best_score[imotif] = s; best_loc[imotif] = seq_pos; } } /* position */ } /* motif */ free_2array(scores, nmotifs); /* free scores array */ /* calculate the combined p-value for this sequence */ pvalue = calc_p_value(sample_name, length, stype, nmotifs, los, best_score, best_loc, range, pv, sonly, lump, norder, debug); /* save the sequence if its E-value may be under threshold */ if (pvalue*(MAX(min_seqs, *nseqs)) < e_max) { SORT_SEQ *seq; /* create sequence record for sorting */ if (*n_hits % RCHUNK == 0) Resize(seqs, *n_hits+RCHUNK, SORT_SEQ); seq = &seqs[(*n_hits)++]; /* record for sorting */ seq->id = sample_name; /* name of sequence */ seq->fp = fp; /* start of sequence record */ seq->length = length; /* length of sequence */ seq->Pvalue = pvalue; /* p-value of product of p-values */ seq->strand = strand; /* strand of DNA or 0 if both */ seq->score = best_score; /* best score per motif */ seq->loc = best_loc; /* locations of best scores */ seq->comp = use_seq_comp ? get_seq_comp(xlate_dna, sequence, alen) : NULL; seq->pv = pv; /* pvalue tables for each motif */ best_score = NULL; best_loc = NULL; /* don't reuse the space! */ /* write the sequence to the fsave file if reading standard input */ if (fdata == stdin && !saved) { fprintf(fsave, ">%s %s\n%s\n", sample_name, id, sequence); saved = TRUE; /* don't save this seq again */ }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?