mast.c
来自「EM算法的改进」· C语言 代码 · 共 1,794 行 · 第 1/5 页
C
1,794 行
/* * $Id: mast.c 1339 2006-09-21 19:46:28Z tbailey $ * * $Log$ * Revision 1.5 2006/03/08 20:50:11 nadya * merge chamges from v3_5_2 branch * * Revision 1.4.4.2 2006/01/31 20:51:38 nadya * rm initialization for 'name' * * Revision 1.4.4.1 2006/01/31 20:22:48 nadya * init name with zeros * * Revision 1.4 2005/10/25 19:02:26 nadya * change c++ style comment to proper c * * Revision 1.3 2005/10/20 00:20:52 tbailey * *** empty log message *** * * Revision 1.2 2005/10/01 23:58:04 nadya * update documentation comment * * Revision 1.1.1.1 2005/07/29 18:06:43 nadya * Importing from meme-3.0.14, and adding configure/make * *//************************************************************************* ** MAST ** Author: Timothy L. Bailey ** ** Copyright ** (1994 - 2001) The Regents of the University of California. ** All Rights Reserved. ** ** Permission to use, copy, modify, and distribute any part of ** this software for educational, research and non-profit purposes,** without fee, and without a written agreement is hereby granted, ** provided that the above copyright notice, this paragraph and ** the following three paragraphs appear in all copies. ** ** Those desiring to incorporate this software into commercial ** products or use for commercial purposes should contact the ** Technology Transfer Office, University of California, San Diego,** 9500 Gilman Drive, La Jolla, California, 92093-0910, ** Ph: (619) 534 5815. ** ** IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ** ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR ** CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF ** THE USE OF THIS SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA ** HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ** ** THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE ** UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE ** MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. ** THE UNIVERSITY OF CALIFORNIA MAKES NO REPRESENTATIONS AND ** EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, ** INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ** MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT ** THE USE OF THE MATERIAL WILL NOT INFRINGE ANY PATENT, ** TRADEMARK OR OTHER RIGHTS. *************************************************************************//**********************************************************************//* mast <logodds> <database> <alphabet> [optional arguments...] See <install-path>/bin/mast for documentation. See <install-path>/bin/make_logodds for format of logodds file.*//**********************************************************************/#define DEFINE_GLOBALS#include "mast.h"#include "diagram.h"#include "projrel.h"/* printing */BOOLEAN debug = FALSE; /* turn on debugging features */#ifndef EXP#define EXP 0#else#define EXP 1#endif/* size of memory chunks for sortable sequences */#define RCHUNK 100/* default e-thresh */#define EXPECT 10/* maximum pairwise motif correlation */#define MAXCORR 0.6/* constants for printing control */#define MMSN 34#define PAGEWIDTH 80#define MAXID (80 - MMSN - 7 - PVALUE - LENGTH)#define PVALUE 8#define LENGTH 6#define MAXIDLINES 10/* number of different integral score values */#define RANGE 100/* macro to copy one open file to another; noop if file NULL */#define copy_file(in_file, out_file) \ if (in_file != NULL) { \ int c; \ rewind(in_file); \ while ((c = getc(in_file)) != EOF) putc(c, out_file); \ }/* Data Definitions*//* sortable sequence score record */typedef struct { long fp; /* file pointer to beginning of sequence rec. */ long length; /* length of sequence */ double *score; /* motif scores */ int *loc; /* motif locations */ int pos; /* positive motif 0 score */ int neg; /* negative motif 0 score */ double Pvalue; /* p-value of product of p-values */ BOOLEAN known_pos; /* known positive if true (for ROC) */ char *id; /* name of sequence */ int strand; /* -1 neg. strand, 0 both/protein, +1 pos. strand */ double *comp; /* actual sequence composition */ double **pv; /* pvalue distribution for each motif */} SORT_SEQ;/* Local Procedures*/static int get_motifs( BOOLEAN xlate_dna, /* database is DNA and motifs protein */ char *nlogodds, /* name of log-odds matrix */ char *alphabet, /* alphabet of logodds matrices */ char *blast_alphabet, /* corresponding BLAST alphabet */ int *p[MAXASCII], /* alphabet permutation/substitution matrix */ BOOLEAN shuffle, /* shuffle the motif columns */ double min_ev, /* minimum E-value of motifs to use */ int umotifs, /* number motifs to be used */ int mlist[MAXLO], /* list of motifs given in -m option */ BOOLEAN motifs[MAXLO], /* list of motifs to output */ LO *los[MAXLO], /* array of logodds matrices */ double *f, /* array of null letter probabilities */ int range /* set logodds matrices to [0..range] */);static SORT_SEQ *get_scores( BOOLEAN dna, /* database is DNA */ STYPE stype, /* handling of DNA strands */ BOOLEAN xlate_dna, /* database is DNA and motifs protein */ double f[], /* null letter probability distribution */ FILE *fdata, /* the database */ FILE *fsave, /* good sequences if database on stdin */ LO *los[], /* array of pointers to log-odds matrices */ int nmotifs, /* number of log-odds matrices in los */ int range, /* set entries in matrices to [0..range] */ double **pv, /* p-value tables for each motif */ BOOLEAN sonly, /* calculate p-value of observed spacings only */ BOOLEAN lump, /* combine spacings into one p-value */ BOOLEAN use_seq_comp, /* adjust E-value for actual sequence composition */ double e_max, /* maximum sequence E-value to print */ int kmotifs, /* number of known motifs */ MOTIF motif[], /* data returned by read_motifs */ BOOLEAN status, /* show progress */ int min_seqs, /* lower bound on nseqs */ int *nseqs, /* total sequences in database */ double *residues, /* total number of residues in seqs */ int *n_hits /* sequences less than e_max */);static double calc_roc( SORT_SEQ *seq, /* array of score values */ int nseqs, /* number of sequences */ int pos, /* number of known positives */ int roc_num, /* number of fp's to truncate ROC at */ double thresh /* threshold for error calculation */);static int p_compare( const void *v1, const void *v2);static double calc_p_value( char *sample_name, /* name of sample */ long length, /* length of the sequence */ STYPE stype, /* handling of DNA strands */ int nmotifs, /* number of motifs */ LO *los[], /* array of pointers to log-odds matrices */ double *best_score, /* array of best score for each motif */ int *best_loc, /* position of best match for each motif */ double range, /* number of different score values */ double **pv, /* p-value tables for each motif */ BOOLEAN sonly, /* calculate p-value of observed spacings only */ BOOLEAN lump, /* combine spacings into one p-value */ int norder, /* number of motifs in diag */ BOOLEAN debug /* turn on debugging features */);static void print_mast_results( BOOLEAN read_stdin, /* read database on stdin */ char *alphabet, /* alphabet */ double *back, /* letter probability distribution */ char *bfile, /* no background file */ BOOLEAN dna, /* database is DNA */ STYPE stype, /* handling of different strands */ BOOLEAN xlate_dna, /* database is DNA and motifs protein */ BOOLEAN use_seq_comp, /* adjust E-value for sequence composition */ FILE* hit_file, /* table of hits */ FILE* diag_file, /* table of motif diagrams */ FILE* note_file, /* table of annotated sequences */ BOOLEAN doc, /* print documentation */ int rank, /* rank of first result to print */ double e_max, /* maximum sequence E-value to print */ double m_thresh, /* maximum motif p-value to print */ double w_thresh, /* maximum motif p-value--weak hits */ BOOLEAN use_seq_p, /* use sequence not position p-values */ int n_hits, /* sequences less than e_max */ LO *los[], /* array of pointers to lo matrices */ int bad_list[], /* list of highly correlated motifs */ int nbad, /* number of bad motifs */ BOOLEAN rem_corr, /* remove correlated motifs */ char *mfname, /* motif file name to print */ int nmotifs, /* number motifs read */ char *database, /* name of database */ char *dfname, /* name of database to print */ int nseqs, /* number of sequences in database */ double res /* total number of residues in seqs */);static int make_mast_tables( BOOLEAN dna, /* database is DNA */ STYPE stype, /* handling of DNA strands */ BOOLEAN xlate_dna, /* database is DNA and motifs protein */ BOOLEAN use_seq_comp, /* adjust E-value for sequence composition */ BOOLEAN best_motifs, /* diagrams have only best motif */ int kmotifs, /* number of known motifs */ MOTIF motif[], /* data returned by read_motifs */ char *motif_file, /* motif file name */ FILE *fdata, /* the database file */ int nseqs, /* number of sequences scored */ SORT_SEQ *seqs, /* sortable array of seq/score records*/ int n_hits, /* size of seqs array */ FILE* hit_file, /* table of hits */ FILE* diag_file, /* table of motif diagrams */ FILE* note_file, /* table of annotated sequences */ int rank, /* rank of first result to print */ int smax, /* maximum sequences to print */ double e_max, /* maximum E-value to print */ double m_thresh, /* maximum motif p-value to print */ double w_thresh, /* maximum motif p-value-- weak hits */ BOOLEAN use_seq_p, /* use sequence not position p-values */ BOOLEAN hit_list, /* create hit list instead of diagram */ LO *los[], /* array of pointers to lo matrices */ int nmotifs /* number motifs read */ );static void print_annotation( BOOLEAN dna, /* database is DNA */ STYPE stype, /* handling of DNA strands */ BOOLEAN xlate_dna, /* database is DNA and motifs protein */ double thresh, /* threshold for strong hits */ FILE *note_file, /* table of annotated sequences */ LO *los[], /* array of pointers to lo matrices */ char *sequence, /* sequence of sample */ long length, /* length of sample */ char *sample_name, /* name of sample */ char *id, /* identifier text for sample */ int strand, /* strand of DNA or 0 if both/protein */ double pvalue, /* combined p-value of sequence */ double evalue, /* combined E-value of sequence */ TILING tiling /* tiling and diagram of sequence */);static char *best_frame( int nmotifs, /* number of motifs */ long length, /* length of sequence */ TILING /* tiling of sequence */);static int get_bad_motifs( double maxcorr, /* maximum allowed corellation */ int nmotifs, /* number of motifs */ LO *los[], /* array of logodds matrices */ int bad_list[] /* list of highly correlated motifs */);/**********************************************************************//* main routine*//**********************************************************************/extern int main (int argc, char *argv[]){ /* defaults */ char *nlogodds = ""; char *database = ""; char *alphabet = ""; char *mfname = NULL; /* don't print the motif file name */ char *dfname = NULL; /* print actual name of database */ BOOLEAN print_seq = TRUE; /* print annotated sequence */ BOOLEAN weak = FALSE; /* print weak hits */ int rank = 1; /* print results starting with rank */ int smax = 0; /* print all results */ double e_max = EXPECT; /* maximum sequence p-value to print */ double m_thresh = 1e-4; /* maximum motif p-value to print */ double w_thresh; /* maximum motif p-value--weak hits default: m_thresh * 10 */ double min_ev = BIG; /* minimum E-value of motifs to use */ int min_seqs = 0; /* lower bound on number db sequences */ int roc_num = 0; /* don't compute ROC */ BOOLEAN use_seq_p = FALSE; /* use sequence p-value for m_thresh */ BOOLEAN shuffle = FALSE; /* don't shuffle the motif columns */ BOOLEAN sonly = FALSE; /* print product of spacing p-values */ BOOLEAN lump = FALSE; /* combine spacings into one p-value */ BOOLEAN status = TRUE; /* show progress */ BOOLEAN doc = TRUE; /* print documentation */ STYPE stype = Combine; /* handling of DNA strands */ BOOLEAN xlate_dna = FALSE; /* don't tranlate DNA sequences */ BOOLEAN best_motifs = FALSE; /* only print best motif in diagrams */ BOOLEAN use_seq_comp = FALSE; /* adjust E-values for actual compos. */ BOOLEAN rem_corr = FALSE; /* remove overly corellated motifs */ BOOLEAN read_stdin = FALSE; /* read database from stdin */ BOOLEAN hit_list = FALSE; /* print block diagrams */ /* locals */ int i, j, k; char *arguments; /* saved argv[1] */ FILE *fdata = NULL; /* the database */ FILE *fsave = NULL; /* good seqs if reading stdin */ FILE *hit_file = NULL, *diag_file=NULL, *note_file=NULL;/* temporary files for tables */ char *motif_file = NULL; /* motif file name */ char *bfile = NULL; /* no background file */ int nseqs = 0; /* total sequences in database */ double residues = 0; /* total number of residues in seqs */ int n_hits = 0; /* sequences less than e_max */ BOOLEAN dna; /* true if sequences are DNAB */ BOOLEAN error; /* true if something wrong */ /* motifs and logodds matrices */ int nmotifs; /* number motifs read */ int umotifs=0; /* number motifs to be used */ int kmotifs=0; /* number of known motifs */ LO *los[MAXLO]; /* array of logodds matrices */ BOOLEAN motifs[MAXLO]; /* list of motifs to output */ int bad_list[MAXLO]; /* list of highly correlated motifs */ int nbad; /* number of bad (correlated) motifs */ int mlist[MAXLO]; /* list of motifs given in -m option */ MOTIF motif[NMOTIFS]; /* data returned by read_motifs */ DATASET dummy; /* dummy arg for read_motifs */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?