⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 read_seq_file.c

📁 EM算法的改进
💻 C
📖 第 1 页 / 共 2 页
字号:
  for (i=0; i<dataset->alength; i++)      dataset->res_freq[i] /= dataset->wgt_total_res;  return dataset;} /* read_seq_file *//**********************************************************************//*	create_sample	Create a sample.	Returns the sample or NULL on error.*//**********************************************************************/static SAMPLE *create_sample(  char *alpha,		/* the alphabet */  long length,		/* length of sequence */  char *name,		/* name of sample */  char *sequence  	/* the sequence */){  long i, j;   SAMPLE *new1;  int alength = strlen(alpha);			/* length of alphabet */  /* disallow zero length samples */  if (length == 0) {    fprintf(stderr, "\nZero length sequences not allowed. (Sequence `%s').\n",      name);    return NULL;  }  /* create the record to hold the sample and its associated data */  new1 = (SAMPLE *) mymalloc(sizeof(SAMPLE));  /* assign the name and sequence data */  new1->sample_name = name;  new1->seq = strdup(sequence);  /* set up encoded version of sequence and weights */  new1->res = (char *) mymalloc(length * (int) sizeof(char));  new1->resic = (char *) mymalloc(length * (int) sizeof(char));  new1->pYic = (char *) mymalloc(length * (int) sizeof(char));  new1->weights = (double *) mymalloc(length * (int) sizeof(double));  new1->not_o = (double *) mymalloc(length * (int) sizeof(double));  new1->log_not_o = (int *) mymalloc(length * (int) sizeof(int));  new1->logcumback = (double *) mymalloc((length+1) * (int) sizeof(double));  new1->nsites = 0;  new1->sites = NULL;  new1->z = (double *) mymalloc(length * (int) sizeof(double));  create_2array(new1->sz, double, 2, length);  for (i=0; i<length; i++) {     int c = (int) sequence[i];    int e = hash(c);    new1->res[i] = e;    new1->resic[length-i-1] = hash(comp_dna(c));    new1->weights[i] = 1.0;  }  /* set up arrays to hold posterior probabilities */  create_2array(new1->pY, int, 3, length);  /* set up array to hold character counts) */  new1->counts = (double *) calloc(alength, (int) sizeof(double));  /* get character counts */  for (i=0; i<length; i++) {    int c = new1->res[i];    if (c<alength) {				/* normal letter */      new1->counts[c]++;    } else {					/* 'X' */      for (j=0; j<alength; j++) new1->counts[j] += 1.0/alength;    }  }  /* record length of sequence */  new1->length = length;  return new1;} /* create_sample *//**********************************************************************//*	read_sequence	Read a single sequence from the data file.	Returns FALSE on EOF or bad sequence data.	Supports FASTA and SWISSPROT formats.	Note:	setup_hash_alphabet must have been called prior to first call to this.*//**********************************************************************/extern BOOLEAN read_sequence(  FILE *data_file,		/* file containing sequences */  char **sample_name,		/* unique identifier of sequence */  char **sample_de,		/* descriptor of sequence */  char **sequence,		/* protein or DNA letters */  long *length			/* length of sequence */){  int i, c;  FORMAT_TYPE format = FASTA;		/* assume FASTA format */  char *name = NULL;  /* skip anything until first sample name */  c = ' ';   while(c != EOF) {     if((c=fgetc(data_file)) == '>') {	/* FASTA format */      break;    } else if (c == 'I') {  		/* swiss-prot format? */      if ((c = fgetc(data_file)) == 'D') {	format = SWISSPROT;        c = fgetc(data_file);        Skip_whi(c, data_file);		/* skip whitespace to name */	break;      }    }     Skip_eol(c, data_file);		/* go to end of line */  }  if (c==EOF) return FALSE;		/* no more sequences */  /* get the sample name */  /* read to first blank/tab/ or end of line/file */  for (i=0; (c=fgetc(data_file))!=EOF; ) {    if ((c==' ') && i==0) {	/* skip blanks until name starts */      continue;    } else if (c==' ' || c=='\t' || c=='\n' || c=='\r') {      break;				/* blank or nl ends name */    } else {      if ((i % RCHUNK) == 0) {        Resize(name, i+RCHUNK, char);      }      name[i++] = c;			/* non-blank: add to name */    }  }  name[i] = '\0';  /* read in description */  *sample_de = NULL;			/* in case no description found */  if (format == FASTA) {    if (c != '\n' && c != '\r' && c != EOF) {        Skip_whi(c, data_file);		/* skip whitespace to name */       (void) read_sequence_de(data_file, sample_de);    }  } else if (format == SWISSPROT) {    Skip_eol(c, data_file);				/* go to end of line */    while (c != EOF) {					/* read all DE lines */      if ((c=fgetc(data_file)) == 'D') {	 	/* start of DE line? */	if ((c=fgetc(data_file)) == 'E') {	  c=fgetc(data_file);          Skip_whi(c, data_file);			/* skip white space */	  (void) read_sequence_de(data_file, sample_de);/* read description */          c = '\n';					/* at end of line */        }      } else if (c == 'S') { 				/* start of SQ line? */	if ((c=fgetc(data_file)) == 'Q') {          Skip_eol(c, data_file);			/* go to end of line */          break;        }      } else {        Skip_eol(c, data_file);				/* go to end of line */      }    } /* read all DE lines */  } /* format */  /* read in the actual sequence data */  *length = read_sequence_data(data_file, sequence, name);  /* sequence had bad data */  if (*length < 0) {    myfree(name);    myfree(*sample_de);    return FALSE;  }  *sample_name = name;  /* insure that the description string exists */  if (*sample_de == NULL) {    Resize(*sample_de, 1, char);    (*sample_de)[0] = '\0';  }  return TRUE;} /* read_sequence *//**********************************************************************//*	read_sequence_data	Read the sequence data into a dynamic array.	Converts to upper case.	Checks for illegal sequence characters.	Returns the length of the sequence or -1 on error.*//**********************************************************************/static long read_sequence_data(  FILE *data_file,		/* data file of sequences */  char **sequence, 		/* sequence data */  char *name			/* name of sequence */){  long length;   int c;  char *seq = NULL;  /*     read sample sequence   */  for(length=0; (c=fgetc(data_file))!=EOF; ) {    if (c == '>') { 			/* end of FASTA sequence */      ungetc(c,data_file);       break;     } else if (c == '/') {		/* end of SWISSPROT sequence */      c = fgetc(data_file);      if (c != '/') {        fprintf(stderr, "\nError reading SWISSPROT database.\n");        exit(1);      }      while ((c=fgetc(data_file))!=EOF && c != '\n') ;	/* find end of line */      break;    } else {      if (isspace(c)) continue;			/* remove whitespace from seq */      if(islower(c)) c = toupper(c);		/* convert to uppercase */      if (hash(c) == -1) {			/* illegal character? */        if (c=='*' || c=='.') continue;		/* remove *'s and .'s for MEME*/        fprintf(stderr, "\nIllegal character `%c' in sequence %s.\n", c, name);        fprintf(stderr, "Change alphabet or fix data file.\n");        if (seq != NULL) myfree(seq);        return -1;      }      /* alocate space for ASCII version of sequence */      if ((length % RCHUNK) == 0) {        Resize(seq, length+RCHUNK, char);      }      seq[length++] = c;    } /* read sample sequence */  }  /* put NULL at the end of the sequence */  Resize(seq, length+1, char);  seq[length] = '\0';  *sequence = seq;  return length;} /* read_sequence_data *//**********************************************************************//*	read_sequence_de	Read the sequence descriptor into a dynamic array.*//**********************************************************************/static int read_sequence_de(  FILE *data_file,		/* data file of sequences */  char **descriptor		/* sequence descriptor; if not NULL,				   string will be appended */){  int length, c;  char *de = *descriptor;  /* find current length of descriptor */  length = 0;  if (de != NULL) {			/* continuation of prev. descriptor */    length = strlen(de);		/* old length */    /* alocate space for descriptor string */    Resize(de, (1+length/RCHUNK)*RCHUNK, char);    de[length++] = ' ';			/* put a blank at the end */  }  /* read in sample descriptor */  while ( (c=fgetc(data_file)) != EOF) {    if (c == '\n') {       break;     } else if (length <= MAXDELEN) {      /* alocate space for descriptor string */      if ((length % RCHUNK) == 0) Resize(de, length+RCHUNK, char);      de[length++] = c;			/* append character to de */    }  }  /* put a NULL at the end of the de */  Resize(de, length+1, char);  de[length] = '\0';  *descriptor = de;  return length;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -