mast.c
来自「EM算法的改进」· C语言 代码 · 共 1,794 行 · 第 1/5 页
C
1,794 行
char *blast_alphabet; /* corresponding BLAST alphabet */ int *perm[MAXASCII]; /* permutation/substitution matrix */ /* things for normalization */ SORT_SEQ *seqs; /* sortable array of seq/score records */ double **pv = NULL; /* p-value tables for each motif */ double *back; /* background letter probability distribution */#ifdef MALLOC_DEBUG int malloc_debug(int); malloc_debug(2);#endif (void) myclock(); /* record CPU time */ /* save script arguments, set command to blank and shift arguments */ arguments = argv[1]; argv[1] = ""; argv++; argc--; /* get the command line arguments */ i = 1; DO_STANDARD_COMMAND_LINE(1, NON_SWITCH(1, \r, switch (i++) { case 1: break; /* dummy */ default: COMMAND_LINE_ERROR; } ); DATA_OPTN(2, logodds, , , nlogodds = _OPTION_); DATA_OPTN(2, database, , , database = _OPTION_); DATA_OPTN(2, alphabet, , , alphabet = _OPTION_); FLAG_OPTN(2, stdin, , read_stdin = TRUE); FLAG_OPTN(1, sep, \t\tscore reverse complement DNA strand as a separate \n\t\t\tsequence, stype = Separate); FLAG_OPTN(1, norc, \t\tdo not score reverse complement DNA strand, stype = Norc); FLAG_OPTN(1, dna, \t\ttranslate DNA sequences to protein, xlate_dna = TRUE); FLAG_OPTN(1, comp, \t\tadjust p-values and E-values for sequence composition, use_seq_comp = TRUE); DATA_OPTN(1, rank, <rank>, \tprint results starting with <rank> best (default: 1), rank = atoi(_OPTION_)); DATA_OPTN(1, smax, <smax>, \tprint results for no more than <smax> sequences\n\t\t\t(default: all), smax = atoi(_OPTION_)); DATA_OPTN(1, ev, <ev>, \tprint results for sequences with E-value < <ev>\n\t\t\t(default: 10), e_max = atof(_OPTION_)); DATA_OPTN(1, mt, <mt>, \tshow motif matches with p-value < mt (default: 0.0001), m_thresh = atof(_OPTION_)); FLAG_OPTN(1, w, \t\tshow weak matches (mt<p-value<mt*10) in angle brackets, weak = TRUE); DATA_OPTN(1, bfile, <bfile>, \tread background frequencies from <bfile>, bfile = _OPTION_); FLAG_OPTN(1, seqp, \t\tuse SEQUENCE p-values for motif thresholds\n\t\t\t(default: use POSITION p-values), use_seq_p = TRUE); DATA_OPTN(1, mf, <mf>, \tprint <mf> as motif file name, mfname = _OPTION_); DATA_OPTN(1, df, <df>, \tprint <df> as database name, dfname = _OPTION_); DATA_OPTN(1, minseqs, <minseqs>, \tlower bound on number of sequences in db, min_seqs = atoi(_OPTION_)); DATA_OPTN(1, mev, <mev>,+\tuse only motifs with E-values less than <mev>, min_ev = atof(_OPTION_)); DATA_OPTN(1, m, <m>,+\tuse only motif(s) number <m> (overrides -mev), mlist[umotifs++] = atoi(_OPTION_)); DATA_OPTN(1, diag, <diag>, \tnominal order and spacing of motifs, diagram = _OPTION_); FLAG_OPTN(1, best, \t\tinclude only the best motif in diagrams, best_motifs = TRUE); FLAG_OPTN(1, remcorr, \tremove highly correlated motifs from query, rem_corr = TRUE); FLAG_OPTN(1, brief, \tbrief output--do not print documentation, doc = FALSE); FLAG_OPTN(1, b, \t\tprint only sections I and II, print_seq = FALSE); FLAG_OPTN(1, nostatus, \tdo not print progress report, status = FALSE); FLAG_OPTN(1, hit_list, \tprint only a list of non-overlapping hits; implies -text, hit_list = TRUE); /* DEBUG OPTIONS */ FLAG_OPTN(EXP, deb, \t\tprint p-values and exit; implies -text, debug = TRUE); DATA_OPTN(EXP, k, <k>, \tfile of known motifs to mark as "+" or for ROC if \n\t\t\t-r given, motif_file = _OPTION_); DATA_OPTN(EXP, r, <r>, \tcompute roc_<r> using <k>.tag file, roc_num = atoi(_OPTION_)); FLAG_OPTN(EXP, shuffle, \tshuffle columns of motifs, shuffle = TRUE); FLAG_OPTN(EXP, sonly, \tuse only spacing p-values in product, sonly = TRUE); FLAG_OPTN(EXP, lump, \t\tcombine spacings into one p-value, lump = TRUE); ); /* check input */ if (rank < 1) { fprintf(stderr, "<rank> must be at least 1.\n"); exit(1); } /* set e_max to BIG if doing ROC or printing all sequence p-values or printing hit-list */ if (roc_num > 0 || debug || hit_list) e_max = BIG; /* set maximum p-value for weak hits */ if (weak) { w_thresh = m_thresh * 10.0; /* 10 times less likely */ } else { w_thresh = m_thresh; /* no weak threshold */ } /* open the datafile or standard input */ if (!read_stdin) { if (!(fdata = fopen(database, "r"))) { fprintf(stderr, "Cannot open file `%s'.\n", database); exit(1); } } else { /* reading stdin */ fdata = stdin; fsave = tmpfile(); /* save good sequences here */ } /* open database or standard input */ /* initialize the background frequencies and alphabets */ back = init_mast_background(bfile, alphabet, stype, xlate_dna, &blast_alphabet, perm); /* get the motifs and parse the ordering and spacing diagram */ nmotifs = get_motifs(xlate_dna, nlogodds, alphabet, blast_alphabet, perm, shuffle, min_ev, umotifs, mlist, motifs, los, back, RANGE); /* get list of motifs that should be removed */ nbad = get_bad_motifs(MAXCORR, nmotifs, los, bad_list); /* remove highly correlated motifs from query */ if (rem_corr && nbad) { for (i=j=k=0; i<nmotifs; i++) { if (los[i]->name < bad_list[k] || k >= nbad) { los[j++] = los[i]; } else { k++; } } nmotifs -= nbad; } /* Set current sequence alphabet to DNA if translating. */ if (xlate_dna) setalph(0); /* determine the type of database being searched */ dna = (xlate_dna || los[0]->dna); /* get the type of handling of DNA strands */ if (!dna) { /* protein database */ if (stype == Separate || stype == Norc) { fprintf(stderr, "You may not specify -sep or -norc with a protein database.\n"); exit(1); } stype = Protein; } /* database type */ /* read in (optional) known motif occurrences */ if (motif_file) kmotifs = read_motifs(fdata, motif_file, motif, 0, &dummy); /* calculate p-values of all integer score values in range [0...w*RANGE] */ Resize(pv, nmotifs, double *); for (i=0, error=FALSE; i<nmotifs; i++) { /*pv[i] = calc_cdf(los[i], RANGE, back);*/ pv[i] = calc_pssm_cdf(los[i]->w, los[i]->alen, RANGE, los[i]->logodds, back); if (!pv[i]) { fprintf(stderr, "There is something wrong with motif %d\n", los[i]->name); error = TRUE; } } if (error) exit(1); /* error with motifs */ /* get the scores and p-values for all sequences in the database and sort them by p-value in ascending order */ seqs = get_scores(dna, stype, xlate_dna, back, fdata, fsave, los, nmotifs, RANGE, pv, sonly, lump, use_seq_comp, e_max, kmotifs, motif, status, min_seqs, &nseqs, &residues, &n_hits ); /* print p-values and exit if debug on */ if (debug) { for (i=1; i<nmotifs; i++) { /* from motif */ int j; printf("CORR %2d ", los[i]->name); for (j=0; j<i; j++) { /* to motif */ printf(" %5.2f", los[i]->corr[j]); } /* to motif */ printf("\n"); } /* from motif */ for (i=0; i<n_hits; i++) { printf("NORM %g\n", seqs[i].Pvalue); } /* hit */ exit(0); } /* calculate the ROC of the motifs and exit if doing ROC */ if (roc_num > 0) { printf("\n\nROC %d = %f\n", roc_num, calc_roc(seqs, n_hits, motif[0].pos, roc_num, los[0]->thresh)); exit(0); } /* create the three output tables in temporary files */ if (!hit_list) { /* files if -hit_list; prints directly */ hit_file = tmpfile(); diag_file = tmpfile(); note_file = print_seq ? tmpfile() : NULL; } if (fsave) fdata = fsave; /* saved sequences */ n_hits = make_mast_tables(dna, stype, xlate_dna, use_seq_comp, best_motifs, kmotifs, motif, motif_file, fdata, nseqs, seqs, n_hits, hit_file, diag_file, note_file, rank, smax, e_max, m_thresh, w_thresh, use_seq_p, hit_list, los, nmotifs ); /* print the results */ if (!hit_list) { print_mast_results( read_stdin, alphabet, back, bfile, dna, stype, xlate_dna, use_seq_comp, hit_file, diag_file, note_file, doc, rank, e_max, m_thresh, w_thresh, use_seq_p, n_hits, los, bad_list, nbad, rem_corr, mfname, nmotifs, database, dfname, nseqs, residues ); } /* print the command line to the script which was saved in argv[1] */ if (hit_list) { printf("# %s\n", arguments); } else { printf("%s\n", arguments); } /* finish status report */ if (status) fprintf(stderr, "\n"); return 0;} /* main *//**********************************************************************//* calc_roc Calculate the ROC (receiver operating characteristic) of a sorted list of score values with known positives flagged.*//**********************************************************************/static double calc_roc( SORT_SEQ *seqs, /* array of score values */ int nseqs, /* number of sequences */ int pos, /* number of known positives */ int roc_num, /* number of fp's to truncate ROC at */ double thresh /* threshold for error calculation */){ int i; double roc = 0; /* receiver operating characteristic */ double tpp = 0; /* true positive proportions */ double fpp = 0; /* false positive proportions */ double tp = 0; /* true positives so far */ double fp = 0; /* false positives so far */ double newtpp, newfpp; int neg = nseqs - pos; /* number of known negatives */ printf("thresh = %f\n", thresh); /* loop over sequences, best score first */ for (i=0; i<nseqs; i++) { /* sequence */ /* update tp and fp since score has changed (or last score reached) */ /* known positive? */ if (seqs[i].known_pos) { tp++; } else { fp++; } /* trapezoidal rule : (y2 - y1)/2 dx */ newtpp = MIN(1.0, tp / pos); /* seqs marked "?" in prosite are marked but not counted in pos so newtpp could go over 1.0 */ newfpp = fp / neg; roc += .5 * (newtpp + tpp) * (newfpp - fpp); tpp = newtpp; fpp = newfpp; if (seqs[i].known_pos) { printf("+ "); } else { printf("- "); } printf("%-*.*s %8.1e\n", MMSN, MMSN, seqs[i].id, seqs[i].Pvalue); /* reached limiting number of false positives? */ if (fp >= roc_num) break; } /* sequence */ /* normalize by fpp to get ROC <roc_num> */ if (fpp == 0) { roc = 1.0; } else { roc /= fpp; } return roc;} /* calc_roc *//**********************************************************************//* p_compare Compare two p-values in ascending order. Return <0 >0 if the second is >, < the first. If they are equal, resolves ties by returning <0 if the second has smaller fp.*//**********************************************************************/static int p_compare( const void *v1, const void *v2){ const SORT_SEQ * s1 = (const SORT_SEQ *) v1; const SORT_SEQ * s2 = (const SORT_SEQ *) v2; double diff = s2->Pvalue - s1->Pvalue; if (diff == 0) diff = (double) (s1->fp - s2->fp); return ((diff > 0) ? -1 : ( (diff < 0) ? 1 : 0) );} /* p_compare *//**********************************************************************//* calc_p_value Calculate the p-value of the product of the individual motif p-values < observed Returns the combined p-value.*//**********************************************************************/static double calc_p_value( char *sample_name, /* name of sample */ long length, /* length of the sequence */ STYPE stype, /* handling of DNA strands */ int nmotifs, /* number of motifs */ LO *los[], /* array of pointers to log-odds matrices */ double *best_score, /* array of best score for each motif */ int *best_loc, /* position of best match for each motif */ double range, /* number of different score values */ double **pv, /* p-value tables for each motif */ BOOLEAN sonly, /* calculate p-value of observed spacings only */ BOOLEAN lump, /* combine spacings into one p-value */ int norder, /* number of motifs in diag */ BOOLEAN debug /* turn on debugging features */){ int i; double pvalue; int nspaces; /* number of motif pairs spacings given for */ double k_scores; /* product of motif score p-values */ double k_spacing; /* product of spacing p-values */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?