getsize.c
来自「EM算法的改进」· C语言 代码 · 共 299 行
C
299 行
/* * $Id: getsize.c 1339 2006-09-21 19:46:28Z tbailey $ * * $Log$ * Revision 1.3 2005/12/15 05:32:10 tbailey * getsize.c: Change getsize behavior: unless -nd given, duplicate sequences are flagged * and NOT counted. * meme-cs.h: remove some Architecture checks to fix Cygwin version * * Revision 1.2 2005/10/25 19:06:39 nadya * rm old macro for Header, all info is taken care of by Id and Log. * * Revision 1.1.1.1 2005/07/29 00:19:49 nadya * Importing from meme-3.0.14, and adding configure/make * *//************************************************************************ ** MEME++ ** Copyright 1994, The Regents of the University of California ** Author: Timothy L. Bailey ** ************************************************************************//* getsize.c *//* getsize <datafile> [options] Reads in a sequence file (Pearson/FASTA format). Prints the number of sequences, min L, max L, mean L, total residues and letters in alphabet used. Type "getsize" to see options.*/ #define DEFINE_GLOBALS #include "meme.h"#include "general.h"#define DATA_HASH_SIZE 100000#define LDNAB 30/**********************************************************************//* main *//**********************************************************************/extern int main( int argc, char *argv[]){ long i, j, k; char *datafile = NULL; FILE *data_file; int n_samples; long max_slength, min_slength, length; char *sample_name, *id, *seq; int bfile = 0; /* do not print bfile format */ BOOLEAN l_only = FALSE; /* do not print lengths only */ BOOLEAN print_duplicates = TRUE; /* print names of duplicates */ BOOLEAN print_frequencies = FALSE; /* do not print letter frequencies */ BOOLEAN print_table = FALSE; /* do not print frequencies table */ BOOLEAN xlate_dna = FALSE; /* do not translate DNA */ BOOLEAN print_codons = FALSE; /* do not print codon usage */ HASH_TABLE ht_seq_names; /* hash of dataset seq names */ int codons[LDNAB][LDNAB][LDNAB]; /* counts of used codons */ int alphabet[6][MAXASCII]; /* counts of used letters (per frame) */ int total_res[6]; /* total letters per frame */ char *letters = NULL; /* string of used letters */ /* set name of command */ argv[0] = "getsize"; /* get the command line arguments */ i = 1; DO_STANDARD_COMMAND_LINE(1, USAGE(<datafile> [options]); NON_SWITCH(1, \n\t<datafile>\t\tfile containing sequences in FASTA format\n, switch (i++) { case 1: datafile = _OPTION_; break; default: COMMAND_LINE_ERROR; }); FLAG_OPTN(1, l, \t\t\tjust print the length of each sequence, l_only = TRUE); FLAG_OPTN(1, nd, \t\t\tdo not print warnings about duplicate sequences, print_duplicates = FALSE); FLAG_OPTN(1, dna, \t\t\tprint dna frequencies in bfile format, bfile = 1); FLAG_OPTN(1, prot, \t\t\tprint protein frequencies in bfile format, bfile = 2); FLAG_OPTN(1, f, \t\t\tprint letter frequencies, print_frequencies = TRUE); FLAG_OPTN(1, ft, \t\t\tprint letter frequencies as latex table, print_table = TRUE); FLAG_OPTN(1, x, \t\t\ttranslate DNA in six frames, xlate_dna = TRUE); FLAG_OPTN(1, codons, \t\tprint frame0 codon usage (implies -f xlate_dna), print_codons = xlate_dna = print_frequencies = TRUE); USAGE(\n\tMeasure statistics of a FASTA file.); ) /* Setup hashing function for encoding strings as integers. If translating from DNA to protein, input alphabet must be DNAB; otherwise it may be all 26 letters plus asterisk. */ if (xlate_dna) { setup_hash_alph(DNAB); /* DNAB to position hashing */ setup_hash_alph(PROTEINB); /* PROTEINB to position hash */ setup_hash_dnab2protb(); /* DNAB to PROTEINB hashing */ } else if (bfile == 1) { /* get DNA frequencies */ setup_hash_alph(DNAB); /* DNAB to position hashing */ } else if (bfile == 2) { /* get PROT frequencies */ setup_hash_alph(PROTEINB); /* PROTEINB to position hash */ } else { setup_hash_alph("ABCDEFGHIJKLMNOPQRSTUVWXYZ*"); /* full alphabet */ } Resize(letters, MAXASCII, char); /* create a hash table of sequence names */ ht_seq_names = hash_create(DATA_HASH_SIZE); /* open data file */ if (datafile == NULL) { fprintf(stderr, "You must specify a data file or 'stdin'\n"); exit(1); } else if (strcmp(datafile, "stdin")) { data_file = fopen(datafile, "r"); if (data_file == NULL) { fprintf(stderr, "Cannot open file '%s'\n", datafile); exit(1); } } else { data_file = stdin; } /* initialize counts of letters and codons used */ for (i=0; i<6; i++) for (j=0; j<MAXASCII; j++) alphabet[i][j] = 0; for (i=0; i<LDNAB; i++) for (j=0; j<LDNAB; j++) for (k=0; k<LDNAB; k++) codons[i][j][k] = 0; /* initialize maximum length of sequences */ max_slength = 0; min_slength = 10000000; n_samples = 0; /* no samples yet */ for (i=0; i<6; i++) total_res[i] = 0; /* total residues (per frame) */ while (read_sequence(data_file, &sample_name, &id, &seq, &length)) { /* Skip weights */ if (strcmp(sample_name, "WEIGHTS")==0) continue; if (print_duplicates) { /* ignore duplicate (same sample name) sequences */ if (hash_lookup(sample_name, 0, ht_seq_names)) { fprintf(stderr, "Duplicate sequence: %s.\n", sample_name); /* free up unneeded space */ myfree(sample_name); myfree(id); myfree(seq); continue; } hash_insert(sample_name, 0, ht_seq_names); /* put name in table */ } else { myfree(sample_name); } /* Count letters used in sequence. */ if (!l_only) { if (xlate_dna) { /* translate DNA in six frames */ for (i=0; i<2; i++) { /* positive then negative strands */ if (i) invcomp_dna(seq, length); /* negative strand */ for (j=0; j<3; j++) { /* frame */ int f = j + 3 * i; for (k=j; k<length-2; k+=3) { /* hash DNAB codon to PROTEINB*/ int index = chash(TRUE, FALSE, seq+k); alphabet[f][index]++; total_res[f]++; if (print_codons && f == 0) { /* frame 0 */ int i1 = dnabhash(seq[k]); int i2 = dnabhash(seq[k+1]); int i3 = dnabhash(seq[k+2]); codons[i1][i2][i3]++; } } } /* frame */ } /* strand */ } else { /* don't translate */ for (i=0; i<length; i++) alphabet[0][hash(seq[i])]++; total_res[0] += length; } /* xlate_dna */ } /* !l_only */ /* free up unneeded space */ myfree(id); myfree(seq); /* record maximum length of actual sequences */ max_slength = MAX(length, max_slength); min_slength = MIN(length, min_slength); if (l_only) printf("%ld\n", length); n_samples++; /* number of non-duplicate sequences */ if (print_frequencies && n_samples%1000 == 0) fprintf(stderr, "%d\r", n_samples); } /* read_sequence */ /* print results */ if (!l_only) { /* make string of letters used */ for (i=0, j=0; i<MAXASCII; i++) if (alphabet[0][i] || alphabet[1][i] || alphabet[2][i] || alphabet[3][i] || alphabet[4][i] || alphabet[5][i]) letters[j++] = unhash(i); letters[j] = '\0'; if (!bfile) { printf("%d %ld %ld %10.1f %d %s\n", n_samples, min_slength, max_slength, ((double) total_res[0])/n_samples, total_res[0], letters); } /* Print frequencies in bfile format */ if (bfile) { char *alph_0 = bfile==1 ? DNA0 : PROTEIN0; /* short alphabet */ double tot = 0; /* total legal let. */ printf("# 0-order Markov frequencies from file %s\n", datafile); for (i=0; alph_0[i]; i++) tot += alphabet[0][hash(alph_0[i])]; for (i=0; alph_0[i]; i++) { /* letter */ int k = hash(alph_0[i]); double freq = tot ? alphabet[0][k]/tot : 0; printf("%c %11.8f\n", alph_0[i], freq); } /* letters used */ } /* bfile */ /* Print letter frequencies as C arrays. */ if (print_frequencies) { int nframes = xlate_dna ? 6 : 1; /* number of frames */ for (i=0; i<nframes; i++) { /* frame */ if (nframes) { printf(" double frame%ld[] = {\n", i); } else { printf(" double freq[] = {\n"); } for (j=0; letters[j]; j++) { /* letters used */ char c = letters[j]; /* letter */ int k = hash(c); /* index */ printf(" %11.8f /* %c */,\n",(double)alphabet[i][k]/total_res[i], c); } /* letters used */ printf(" };\n"); } /* frame */ if (print_codons) { int alen = strlen(DNAB); /* use DNAB alphabet */ letters = DNAB; printf(" double fcodon[] = {\n"); /* start C array */ for (i=0; i<alen; i++) { /* pos 0 */ for (j=0; j<alen; j++) { /* pos 1 */ for (k=0; k<alen; k++) { /* pos 2 */ char c1 = letters[i]; char c2 = letters[j]; char c3 = letters[k]; printf(" %11.8f /* %c%c%c */,\n", (double)codons[i][j][k]/total_res[0], c1, c2, c3); } /* pos 2 */ } /* pos 1 */ } /* pos 0 */ printf(" };\n"); } /* print_codons */ } /* print_frequencies */ /* Print letter frequencies as latex table. */ if (print_table) { int nframes = xlate_dna ? 6 : 1; /* number of frames */ for (j=0; letters[j]; j++) { /* letters used */ char c = letters[j]; /* letter */ int k = hash(c); /* index */ if (xlate_dna) { printf("%c & %7.4f", c, nrfreq[k]); } else { printf("%c", c); } for (i=0; i<nframes; i++) { /* frame */ printf(" & %7.4f", (double)alphabet[i][k]/total_res[i]); } /* frame */ printf(" \\\\\n"); } /* letters used */ } /* print_table */ } return(0); } /* getsize */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?