📄 read_seq_file.c
字号:
/* * $Id: read_seq_file.c 1339 2006-09-21 19:46:28Z tbailey $ * * $Log$ * Revision 1.3 2006/03/08 20:50:11 nadya * merge chamges from v3_5_2 branch * * Revision 1.2.4.2 2006/01/31 20:30:46 nadya * Attemp to fix control-m in description * * Revision 1.2.4.1 2006/01/31 19:11:54 nadya * add '\r' character to look for when delimiting strings * this is a fix for cygwin newline representation * * Revision 1.2 2005/10/25 19:06:39 nadya * rm old macro for Header, all info is taken care of by Id and Log. * * Revision 1.1.1.1 2005/07/29 17:24:50 nadya * Importing from meme-3.0.14, and adding configure/make * *//************************************************************************ ** MEME++ ** Copyright 1994, The Regents of the University of California ** Author: Timothy L. Bailey ** ************************************************************************//* 9-30-99 tlb; add sequence to SAMPLE *//* 6-29-99 tlb; add resic *//* Supports FASTA and SWISSPROT formats.Note: Assumes SWISSPROT format if "ID" appears at beginning of first line. Ignores duplicate (same <id>) sequences.All formats: <id> := <alpha>+ The *unique* name of the sequence. May *not* contain whitespace! NOTE: if <id> is "WEIGHTS", the <description> field is parsed as a string of sequence weights and any <sequence> field (if any) is ignored. All weights are assigned in order to the sequences in the file. If there are more sequences than weights, the remainder are given weight one. Weights must be greater than zero and less than or equal to one. Weights may be specified by more than one "WEIGHT" entry which may appear anywhere in the file. <description> := <alpha>+ A description of the sequence; may contain whitespace. <sequence> := <alpha>+ The DNA or protein sequence data; may contain whitespace. <text> := <alpha>* Text which is ignored.Pearson FASTA format: INPUT DATA FILE FORMAT: [ <header> <sequence>+ ]* <header> := ">"<id>" "<description> Everything up to the first space is assumed to be the unique name of the sequence. SWISSPROT Format [ <ID> <DE>+ <SQ> <sequence>+ // ]* <ID> := "ID "<id>" "<text> <DE> := "DE "<description> <SQ> := "SQ "<text>*/#include <meme.h>/* size of memory chunks to allocate for sequence data */#define RCHUNK 100/* maximum size of sequence description text string */#define MAXDELEN 10000/* local types */typedef enum {FASTA, SWISSPROT} FORMAT_TYPE;/* local functions */static SAMPLE *create_sample( char *alpha, /* the alphabet */ long length, /* length of sequence */ char *name, /* name of sample */ char *sequence /* the sequence */);/* local functions */static long read_sequence_data( FILE *data_file, /* data file of sequences */ char **sequence, /* sequence data */ char *name /* name of sequence */);static int read_sequence_de( FILE *data_file, /* data file of sequences */ char **descriptor /* sequence descriptor */);/* local variables */#define DATA_HASH_SIZE 100000/* The hash table will be appended to if read_seq_file is called more than once. This is so that positive sequences the negative sequence file will be ignored.*/ static HASH_TABLE ht_seq_names = NULL; /* hash of dataset seq names *//**********************************************************************//* read_seq_file Open a sequence file and read in the sequences. Returns a dataset-> setup_hash_alphabet must have been called prior to first call to this.*//**********************************************************************/extern DATASET *read_seq_file( char *file_name, /* name of file to open */ char *alpha, /* alphabet used in sequences */ BOOLEAN use_comp, /* use complementary strands, too */ double seqfrac /* fraction of input sequences to use */){ int i, j; FILE *data_file; /* file with samples to read */ char *sample_name; /* name of sample read */ char *sample_de; /* descriptor text for sample */ char *sequence; /* sequence read */ long length; /* length of sequence */ BOOLEAN error=FALSE; /* none yet */ SAMPLE *sample; /* sample created */ DATASET *dataset; /* dataset created */ int n_samples=0; /* number of samples read */ double *seq_weights=NULL; /* sequence weights */ int n_wgts=0; /* number of sequence weights given */ /* create a hash table of sequence names */ if (!ht_seq_names) ht_seq_names = hash_create(DATA_HASH_SIZE); /* create a dataset */ dataset = (DATASET *) mymalloc(sizeof(DATASET)); dataset->alength = strlen(alpha); dataset->alphabet = alpha; /* open data file */ if (file_name == NULL) { fprintf(stderr, "You must specify a data file or `stdin'.\n"); exit(1); } else if (strcmp(file_name, "stdin")) { data_file = fopen(file_name, "r"); if (data_file == NULL) { fprintf(stderr, "Cannot open file `%s'.\n", file_name); exit(1); } } else { data_file = stdin; } /* initialize maximum length of sequences */ dataset->max_slength = 0; dataset->min_slength = 10000000; dataset->n_samples = 0; /* no samples yet */ dataset->samples = NULL; /* no samples */ while (read_sequence(data_file, &sample_name, &sample_de, &sequence, &length)) { /* skip sequence if an error occurred */ if (length < 0) continue; /* parse weights if given; make (more than enough) room in array */ if (strcmp(sample_name, "WEIGHTS")==0) { double wgt; char *wgt_str = sample_de; Resize(seq_weights, n_wgts+(int)strlen(wgt_str), double); while (sscanf(wgt_str, "%lf", &wgt) == 1) { if (wgt <= 0 || wgt > 1) { fprintf(stderr, "Weights must be larger than zero and no greater than 1.\n"); exit(1); } seq_weights[n_wgts++] = wgt; /* save weight */ wgt_str += strspn(wgt_str, " "); /* skip white */ wgt_str += strcspn(wgt_str, " "); /* skip token */ } myfree(sample_name); myfree(sample_de); myfree(sequence); continue; } /* ignore duplicate (same sample name) sequences */ if (hash_lookup(sample_name, 0, ht_seq_names)) { fprintf(stderr, "Skipping sequence '%s'.\n", sample_name); myfree(sample_name); myfree(sample_de); myfree(sequence); continue; } hash_insert(sample_name, 0, ht_seq_names); /* put name in hash table */ n_samples++; /* see if sequence will be used in random sample; store it if yes */ if (drand48() >= 1 - seqfrac) { /* create the sample */ sample = create_sample(alpha, length, sample_name, sequence); if (sample == NULL) {error = TRUE; continue;} /* record maximum length of actual sequences */ dataset->max_slength = MAX(sample->length, dataset->max_slength); dataset->min_slength = MIN(sample->length, dataset->min_slength); /* put the sample in the array of samples */ if ((dataset->n_samples % RCHUNK) == 0) { Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *); } dataset->samples[dataset->n_samples++] = sample; } } /* sequences */ if (length < 0) error = TRUE; /* read_sequence error */ /* resize the array of samples */ if (dataset->n_samples) Resize(dataset->samples, dataset->n_samples, SAMPLE*); /* check that datafile contained at least one sample */ if (!error) { if (n_samples == 0) { fprintf(stderr, "No sequences found in file `%s'. Check file format.\n", file_name); error = TRUE; } else if (dataset->n_samples == 0) { fprintf(stderr, "No sequences sampled. Use different seed or higher seqfrac.\n"); error = TRUE; } } /* exit if there was an error */ if (error) exit(1); /* calculate the prior residue frequencies and entropies and |D|, size of the dataset */ /* tlb; 5/9/97 wgt_total_res and weighted res_freq */ dataset->res_freq = NULL; Resize(dataset->res_freq, dataset->alength, double); for (i=0; i<dataset->alength; i++) { dataset->res_freq[i] = 0; } dataset->total_res = 0; dataset->wgt_total_res = 0; for (i=0; i<dataset->n_samples; i++) { /* sequence */ long slen = dataset->samples[i]->length; double sw = dataset->samples[i]->sw = (n_wgts > i ? seq_weights[i] : 1); dataset->total_res += slen; dataset->wgt_total_res += slen*sw; for (j=0; j<dataset->alength; j++) { if (use_comp) { /* using complementary strand as well */ dataset->res_freq[j] += sw * dataset->samples[i]->counts[j]/2.0; dataset->res_freq[hash(comp_dna(unhash(j)))] += sw * dataset->samples[i]->counts[j]/2.0; } else { /* not using complementary strands */ dataset->res_freq[j] += sw * dataset->samples[i]->counts[j]; } } } /* convert counts to frequencies */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -