mast.c

来自「EM算法的改进」· C语言 代码 · 共 1,794 行 · 第 1/5 页

C
1,794
字号
/* * $Id: mast.c 1339 2006-09-21 19:46:28Z tbailey $ *  * $Log$ * Revision 1.5  2006/03/08 20:50:11  nadya * merge chamges from v3_5_2 branch * * Revision 1.4.4.2  2006/01/31 20:51:38  nadya * rm initialization for 'name' * * Revision 1.4.4.1  2006/01/31 20:22:48  nadya * init name with zeros * * Revision 1.4  2005/10/25 19:02:26  nadya * change c++ style comment to proper c * * Revision 1.3  2005/10/20 00:20:52  tbailey * *** empty log message *** * * Revision 1.2  2005/10/01 23:58:04  nadya * update documentation comment * * Revision 1.1.1.1  2005/07/29 18:06:43  nadya * Importing from meme-3.0.14, and adding configure/make * *//*************************************************************************                                                                     	**       MAST                                                           	**       Author: Timothy L. Bailey                                      	**                                                                       **	Copyright							**	(1994 - 2001) The Regents of the University of California.	**	All Rights Reserved.						**									**	Permission to use, copy, modify, and distribute any part of 	**	this software for educational, research and non-profit purposes,**	without fee, and without a written agreement is hereby granted, **	provided that the above copyright notice, this paragraph and 	**	the following three paragraphs appear in all copies.		**									**	Those desiring to incorporate this software into commercial 	**	products or use for commercial purposes should contact the 	**	Technology Transfer Office, University of California, San Diego,**	9500 Gilman Drive, La Jolla, California, 92093-0910, 		**	Ph: (619) 534 5815.						**									**	IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO 	**	ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR 	**	CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF 	**	THE USE OF THIS SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA 	**	HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 		**									**	THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE **	UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE 		**	MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.  **	THE UNIVERSITY OF CALIFORNIA MAKES NO REPRESENTATIONS AND 	**	EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, **	INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 	**	MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT 	**	THE USE OF THE MATERIAL WILL NOT INFRINGE ANY PATENT, 		**	TRADEMARK OR OTHER RIGHTS.  					*************************************************************************//**********************************************************************//*  mast <logodds> <database> <alphabet> [optional arguments...]	See <install-path>/bin/mast for documentation.	See <install-path>/bin/make_logodds for format of logodds file.*//**********************************************************************/#define DEFINE_GLOBALS#include "mast.h"#include "diagram.h"#include "projrel.h"/* printing */BOOLEAN debug = FALSE;		/* turn on debugging features */#ifndef EXP#define EXP 0#else#define EXP 1#endif/* size of memory chunks for sortable sequences */#define RCHUNK 100/* default e-thresh */#define EXPECT 10/* maximum pairwise motif correlation */#define MAXCORR 0.6/* constants for printing control */#define MMSN 34#define PAGEWIDTH 80#define MAXID (80 - MMSN - 7 - PVALUE - LENGTH)#define PVALUE 8#define LENGTH 6#define MAXIDLINES 10/* number of different integral score values */#define RANGE 100/* macro to copy one open file to another; noop if file NULL */#define copy_file(in_file, out_file) \  if (in_file != NULL) { \    int c; \    rewind(in_file); \    while ((c = getc(in_file)) != EOF) putc(c, out_file); \  }/*  Data Definitions*//* sortable sequence score record */typedef struct {  long  fp;		/* file pointer to beginning of sequence rec. */  long length;		/* length of sequence */  double *score;	/* motif scores */  int *loc;		/* motif locations */  int pos;		/* positive motif 0 score */  int neg;		/* negative motif 0 score */  double Pvalue;	/* p-value of product of p-values */  BOOLEAN known_pos;	/* known positive if true (for ROC) */  char *id;		/* name of sequence */  int strand;		/* -1 neg. strand, 0 both/protein, +1 pos. strand */  double *comp;		/* actual sequence composition */  double **pv;		/* pvalue distribution for each motif */} SORT_SEQ;/*  Local Procedures*/static int get_motifs(  BOOLEAN xlate_dna,		/* database is DNA and motifs protein */  char *nlogodds,		/* name of log-odds matrix */  char *alphabet,		/* alphabet of logodds matrices */  char *blast_alphabet,		/* corresponding BLAST alphabet */  int *p[MAXASCII],		/* alphabet permutation/substitution matrix */  BOOLEAN shuffle,		/* shuffle the motif columns */  double min_ev,		/* minimum E-value of motifs to use */  int umotifs, 			/* number motifs to be used */  int mlist[MAXLO],		/* list of motifs given in -m option */  BOOLEAN motifs[MAXLO],	/* list of motifs to output */  LO *los[MAXLO],		/* array of logodds matrices */  double *f,			/* array of null letter probabilities */  int range 			/* set logodds matrices to [0..range] */);static SORT_SEQ *get_scores(  BOOLEAN dna,		/* database is DNA */  STYPE stype,		/* handling of DNA strands */  BOOLEAN xlate_dna,	/* database is DNA and motifs protein */  double f[],		/* null letter probability distribution */  FILE *fdata,		/* the database */  FILE *fsave,		/* good sequences if database on stdin */  LO *los[],		/* array of pointers to log-odds matrices */  int nmotifs,		/* number of log-odds matrices in los */  int range, 		/* set entries in matrices to [0..range] */  double **pv,		/* p-value tables for each motif */  BOOLEAN sonly,	/* calculate p-value of observed spacings only */  BOOLEAN lump,		/* combine spacings into one p-value */  BOOLEAN use_seq_comp,	/* adjust E-value for actual sequence composition */  double e_max,		/* maximum sequence E-value to print */  int kmotifs,		/* number of known motifs */  MOTIF motif[],	/* data returned by read_motifs */  BOOLEAN status,	/* show progress */  int min_seqs,		/* lower bound on nseqs */  int *nseqs,		/* total sequences in database */  double *residues,	/* total number of residues in seqs */  int *n_hits		/* sequences less than e_max */);static double calc_roc(  SORT_SEQ *seq,			/* array of score values */  int nseqs,				/* number of sequences */  int pos,				/* number of known positives */  int roc_num,				/* number of fp's to truncate ROC at */  double thresh				/* threshold for error calculation */);static int p_compare(  const void *v1,  const void *v2);static double calc_p_value(  char *sample_name,	/* name of sample */  long length,           /* length of the sequence */  STYPE stype,		/* handling of DNA strands */  int nmotifs,          /* number of motifs */  LO *los[],            /* array of pointers to log-odds matrices */  double *best_score,   /* array of best score for each motif */  int *best_loc,        /* position of best match for each motif */  double range,		/* number of different score values */  double **pv,		/* p-value tables for each motif */  BOOLEAN sonly,	/* calculate p-value of observed spacings only */  BOOLEAN lump,		/* combine spacings into one p-value */  int norder,		/* number of motifs in diag */  BOOLEAN debug 	/* turn on debugging features */);static void print_mast_results(  BOOLEAN read_stdin,			/* read database on stdin */  char *alphabet,			/* alphabet */  double *back,				/* letter probability distribution */  char *bfile,				/* no background file */  BOOLEAN dna,				/* database is DNA */  STYPE stype,				/* handling of different strands */  BOOLEAN xlate_dna,			/* database is DNA and motifs protein */  BOOLEAN use_seq_comp,			/* adjust E-value for sequence composition */  FILE* hit_file,			/* table of hits */  FILE* diag_file,			/* table of motif diagrams */  FILE* note_file,			/* table of annotated sequences */  BOOLEAN doc,				/* print documentation */  int rank,				/* rank of first result to print */  double e_max,				/* maximum sequence E-value to print */  double m_thresh,			/* maximum motif p-value to print */  double w_thresh,	 		/* maximum motif p-value--weak hits */  BOOLEAN use_seq_p,			/* use sequence not position p-values */  int n_hits,				/* sequences less than e_max */  LO *los[],				/* array of pointers to lo matrices */  int bad_list[], 			/* list of highly correlated motifs */  int nbad,				/* number of bad motifs */  BOOLEAN rem_corr,			/* remove correlated motifs */  char *mfname, 			/* motif file name to print */  int nmotifs,				/* number motifs read */   char *database,			/* name of database */  char *dfname,				/* name of database to print */  int nseqs, 				/* number of sequences in database */  double res				/* total number of residues in seqs */);static int make_mast_tables(  BOOLEAN dna,				/* database is DNA */  STYPE stype,				/* handling of DNA strands */  BOOLEAN xlate_dna,			/* database is DNA and motifs protein */  BOOLEAN use_seq_comp,			/* adjust E-value for sequence composition */  BOOLEAN best_motifs, 			/* diagrams have only best motif */  int kmotifs,	 			/* number of known motifs */  MOTIF motif[],			/* data returned by read_motifs */  char *motif_file,			/* motif file name */  FILE *fdata,				/* the database file */  int nseqs,				/* number of sequences scored */  SORT_SEQ *seqs,			/* sortable array of seq/score records*/  int n_hits,				/* size of seqs array */  FILE* hit_file,			/* table of hits */  FILE* diag_file,			/* table of motif diagrams */  FILE* note_file,			/* table of annotated sequences */  int rank,				/* rank of first result to print */  int smax,				/* maximum sequences to print */  double e_max,				/* maximum E-value to print */  double m_thresh,			/* maximum motif p-value to print */  double w_thresh,	 		/* maximum motif p-value-- weak hits */  BOOLEAN use_seq_p,			/* use sequence not position p-values */  BOOLEAN hit_list,                     /* create hit list instead of diagram */  LO *los[],				/* array of pointers to lo matrices */  int nmotifs 				/* number motifs read */ );static void print_annotation(  BOOLEAN dna,				/* database is DNA */  STYPE stype,				/* handling of DNA strands */  BOOLEAN xlate_dna,			/* database is DNA and motifs protein */  double thresh,			/* threshold for strong hits */  FILE *note_file,			/* table of annotated sequences */  LO *los[],				/* array of pointers to lo matrices */  char *sequence,			/* sequence of sample */  long length,				/* length of sample */  char *sample_name,			/* name of sample */  char *id,				/* identifier text for sample */  int strand,				/* strand of DNA or 0 if both/protein */  double pvalue,			/* combined p-value of sequence */  double evalue,			/* combined E-value of sequence */  TILING tiling				/* tiling and diagram of sequence */);static char *best_frame(  int nmotifs, 				/* number of motifs */  long length,				/* length of sequence */  TILING				/* tiling of sequence */);static int get_bad_motifs(  double maxcorr,			/* maximum allowed corellation */  int nmotifs,				/* number of motifs */   LO *los[],				/* array of logodds matrices */  int bad_list[] 			/* list of highly correlated motifs */);/**********************************************************************//*  main routine*//**********************************************************************/extern int main (int argc, char *argv[]){  /* defaults */  char *nlogodds = "";  char *database = "";  char *alphabet = "";  char *mfname = NULL;			/* don't print the motif file name */  char *dfname = NULL;			/* print actual name of database */  BOOLEAN print_seq = TRUE;		/* print annotated sequence */  BOOLEAN weak = FALSE;			/* print weak hits */  int rank = 1;				/* print results starting with rank */  int smax = 0;				/* print all results */  double e_max = EXPECT;		/* maximum sequence p-value to print */  double m_thresh = 1e-4;		/* maximum motif p-value to print */  double w_thresh;			/* maximum motif p-value--weak hits 					   default: m_thresh * 10 */  double min_ev = BIG;			/* minimum E-value of motifs to use */  int min_seqs = 0;			/* lower bound on number db sequences */  int roc_num = 0;			/* don't compute ROC */  BOOLEAN use_seq_p = FALSE;		/* use sequence p-value for m_thresh */  BOOLEAN shuffle = FALSE;		/* don't shuffle the motif columns */  BOOLEAN sonly = FALSE;		/* print product of spacing p-values */  BOOLEAN lump = FALSE;			/* combine spacings into one p-value */  BOOLEAN status = TRUE;		/* show progress */  BOOLEAN doc = TRUE;			/* print documentation */  STYPE stype = Combine;		/* handling of DNA strands */  BOOLEAN xlate_dna = FALSE;		/* don't tranlate DNA sequences */  BOOLEAN best_motifs = FALSE;          /* only print best motif in diagrams */  BOOLEAN use_seq_comp = FALSE;		/* adjust E-values for actual compos. */  BOOLEAN rem_corr = FALSE;		/* remove overly corellated motifs */   BOOLEAN read_stdin = FALSE;		/* read database from stdin */   BOOLEAN hit_list = FALSE;		/* print block diagrams */     /* locals */  int i, j, k;  char *arguments;			/* saved argv[1] */  FILE *fdata = NULL;			/* the database */  FILE *fsave = NULL;			/* good seqs if reading stdin */  FILE *hit_file = NULL, *diag_file=NULL, *note_file=NULL;/* temporary files for tables */  char *motif_file = NULL;		/* motif file name */  char *bfile = NULL;			/* no background file */  int nseqs = 0;			/* total sequences in database */  double residues = 0;			/* total number of residues in seqs */  int n_hits = 0;			/* sequences less than e_max */  BOOLEAN dna;				/* true if sequences are DNAB */  BOOLEAN error;			/* true if something wrong */  /* motifs and logodds matrices */  int nmotifs;				/* number motifs read */   int umotifs=0;			/* number motifs to be used */  int kmotifs=0;	 		/* number of known motifs */  LO *los[MAXLO];			/* array of logodds matrices */  BOOLEAN motifs[MAXLO];		/* list of motifs to output */  int bad_list[MAXLO]; 			/* list of highly correlated motifs */  int nbad;				/* number of bad (correlated) motifs */  int mlist[MAXLO];			/* list of motifs given in -m option */  MOTIF motif[NMOTIFS];			/* data returned by read_motifs */  DATASET dummy;			/* dummy arg for read_motifs */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?