read_seq_file.c

来自「EM算法的改进」· C语言代码 · 共 591 行 · 第 1/2 页
591 行
/* * $Id: read_seq_file.c 1339 2006-09-21 19:46:28Z tbailey $ *  * $Log$ * Revision 1.3  2006/03/08 20:50:11  nadya * merge chamges from v3_5_2 branch * * Revision 1.2.4.2  2006/01/31 20:30:46  nadya * Attemp to fix control-m in description * * Revision 1.2.4.1  2006/01/31 19:11:54  nadya * add '\r' character to look for when delimiting strings * this is a fix for cygwin newline representation * * Revision 1.2  2005/10/25 19:06:39  nadya * rm old macro for Header, all info is taken care of by Id and Log. * * Revision 1.1.1.1  2005/07/29 17:24:50  nadya * Importing from meme-3.0.14, and adding configure/make * *//************************************************************************								       **	MEME++							       **	Copyright 1994, The Regents of the University of California    **	Author: Timothy L. Bailey				       **								       ************************************************************************//* 9-30-99 tlb; add sequence to SAMPLE *//* 6-29-99 tlb; add resic *//*	Supports FASTA and SWISSPROT formats.Note:	Assumes SWISSPROT format if "ID" appears at beginning of first line.	Ignores duplicate (same <id>) sequences.All formats:	<id>		:= <alpha>+			   The *unique* name of the sequence.  May *not*			   contain whitespace!			   NOTE: if <id> is "WEIGHTS", the <description>			   field is parsed as a string of sequence weights 			   and any <sequence> field (if any) is ignored.			   All weights are assigned in order to the 			   sequences in the file. If there are more sequences			   than weights, the remainder are given weight one.			   Weights must be greater than zero and less than			   or equal to one.  Weights may be specified by			   more than one "WEIGHT" entry which may appear			   anywhere in the file.	<description>	:= <alpha>+			   A description of the sequence;		           may contain whitespace.	<sequence>	:= <alpha>+			   The DNA or protein sequence data;                            may contain whitespace.	<text>		:= <alpha>*			   Text which is ignored.Pearson FASTA format:	INPUT DATA FILE FORMAT:		[		<header>		<sequence>+		]*	<header>	:= ">"<id>" "<description>			    Everything up to the first space is 			    assumed to be the unique name of the sequence.	SWISSPROT Format	[	<ID>	<DE>+	<SQ>	<sequence>+	//	]*	<ID>		:= "ID   "<id>" "<text>	<DE>		:= "DE   "<description>	<SQ>		:= "SQ   "<text>*/#include <meme.h>/* size of memory chunks to allocate for sequence data */#define RCHUNK 100/* maximum size of sequence description text string */#define MAXDELEN 10000/* local types */typedef enum {FASTA, SWISSPROT} FORMAT_TYPE;/* local functions */static SAMPLE *create_sample(  char *alpha,		/* the alphabet */  long length,		/* length of sequence */  char *name,		/* name of sample */  char *sequence  	/* the sequence */);/* local functions */static long read_sequence_data(  FILE *data_file,		/* data file of sequences */  char **sequence, 		/* sequence data */  char *name			/* name of sequence */);static int read_sequence_de(  FILE *data_file,		/* data file of sequences */  char **descriptor		/* sequence descriptor */);/* local variables */#define DATA_HASH_SIZE 100000/*    The hash table will be appended to if read_seq_file is called more   than once.  This is so that positive sequences the negative sequence file   will be ignored.*/ static HASH_TABLE ht_seq_names = NULL;	/* hash of dataset seq names *//**********************************************************************//*	read_seq_file	Open a sequence file and read in the sequences. 	Returns a dataset->	setup_hash_alphabet must have been called prior to first call to this.*//**********************************************************************/extern DATASET *read_seq_file(  char *file_name,		/* name of file to open */  char *alpha,			/* alphabet used in sequences */  BOOLEAN use_comp,		/* use complementary strands, too */  double seqfrac		/* fraction of input sequences to use */){  int i, j;  FILE *data_file;		/* file with samples to read */  char *sample_name;		/* name of sample read */  char *sample_de;		/* descriptor text for sample */  char *sequence;		/* sequence read */  long length;			/* length of sequence */  BOOLEAN error=FALSE;		/* none yet */  SAMPLE *sample;		/* sample created */  DATASET *dataset;		/* dataset created */  int n_samples=0;		/* number of samples read */  double *seq_weights=NULL;	/* sequence weights */  int n_wgts=0;			/* number of sequence weights given */  /* create a hash table of sequence names */  if (!ht_seq_names) ht_seq_names = hash_create(DATA_HASH_SIZE);  /* create a dataset */  dataset = (DATASET *) mymalloc(sizeof(DATASET));  dataset->alength = strlen(alpha);  dataset->alphabet = alpha;  /* open data file */   if (file_name == NULL) {    fprintf(stderr, "You must specify a data file or `stdin'.\n");    exit(1);  } else if (strcmp(file_name, "stdin")) {    data_file = fopen(file_name, "r");     if (data_file == NULL) {      fprintf(stderr, "Cannot open file `%s'.\n", file_name);      exit(1);    }  } else {    data_file = stdin;  }  /* initialize maximum length of sequences */  dataset->max_slength = 0;  dataset->min_slength = 10000000;  dataset->n_samples = 0;	/* no samples yet */  dataset->samples = NULL;	/* no samples */  while (read_sequence(data_file, &sample_name, &sample_de, &sequence,     &length)) {    /* skip sequence if an error occurred */    if (length < 0) continue;    /* parse weights if given; make (more than enough) room in array */    if (strcmp(sample_name, "WEIGHTS")==0) {      double wgt;       char *wgt_str = sample_de;      Resize(seq_weights, n_wgts+(int)strlen(wgt_str), double);      while (sscanf(wgt_str, "%lf", &wgt) == 1) {        if (wgt <= 0 || wgt > 1) {	  fprintf(stderr,             "Weights must be larger than zero and no greater than 1.\n");	  exit(1);        }        seq_weights[n_wgts++] = wgt;			/* save weight */        wgt_str += strspn(wgt_str, "      ");		/* skip white */        wgt_str += strcspn(wgt_str, "     ");		/* skip token */      }      myfree(sample_name);      myfree(sample_de);      myfree(sequence);      continue;    }    /* ignore duplicate (same sample name) sequences */     if (hash_lookup(sample_name, 0, ht_seq_names)) {      fprintf(stderr, "Skipping sequence '%s'.\n", sample_name);      myfree(sample_name);      myfree(sample_de);      myfree(sequence);      continue;    }    hash_insert(sample_name, 0, ht_seq_names);  /* put name in hash table */    n_samples++;    /* see if sequence will be used in random sample; store it if yes */    if (drand48() >= 1 - seqfrac) {      /* create the sample */      sample = create_sample(alpha, length, sample_name, sequence);      if (sample == NULL) {error = TRUE; continue;}      /* record maximum length of actual sequences */      dataset->max_slength = MAX(sample->length, dataset->max_slength);      dataset->min_slength = MIN(sample->length, dataset->min_slength);      /* put the sample in the array of samples */      if ((dataset->n_samples % RCHUNK) == 0) {        Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *);      }      dataset->samples[dataset->n_samples++] = sample;    }  } /* sequences */  if (length < 0) error = TRUE;			/* read_sequence error */  /* resize the array of samples */  if (dataset->n_samples) Resize(dataset->samples, dataset->n_samples, SAMPLE*);  /* check that datafile contained at least one sample */  if (!error) {    if (n_samples == 0) {      fprintf(stderr, "No sequences found in file `%s'.  Check file format.\n",	file_name);      error = TRUE;    } else if (dataset->n_samples == 0) {      fprintf(stderr,         "No sequences sampled.  Use different seed or higher seqfrac.\n");      error = TRUE;    }  }  /* exit if there was an error */  if (error) exit(1);  /* calculate the prior residue frequencies and entropies      and |D|, size of the dataset */  /* tlb; 5/9/97 wgt_total_res and weighted res_freq */  dataset->res_freq = NULL;  Resize(dataset->res_freq, dataset->alength, double);  for (i=0; i<dataset->alength; i++) { dataset->res_freq[i] = 0; }  dataset->total_res = 0;  dataset->wgt_total_res = 0;  for (i=0; i<dataset->n_samples; i++) {		/* sequence */    long slen = dataset->samples[i]->length;    double sw = dataset->samples[i]->sw = (n_wgts > i ? seq_weights[i] : 1);    dataset->total_res += slen;    dataset->wgt_total_res += slen*sw;    for (j=0; j<dataset->alength; j++) {      if (use_comp) { 	/* using complementary strand as well */        dataset->res_freq[j] += sw * dataset->samples[i]->counts[j]/2.0;        dataset->res_freq[hash(comp_dna(unhash(j)))] +=           sw * dataset->samples[i]->counts[j]/2.0;      } else {		/* not using complementary strands */        dataset->res_freq[j] += sw * dataset->samples[i]->counts[j];      }    }  }  /* convert counts to frequencies */
read_seq_file.c - 源码说明

本页面展示了「EM算法的改进」中的 read_seq_file.c 源码文件，采用 C语言编程语言编写，共 591 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与法的改进相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?