motifs.c

来自「EM算法的改进」· C语言 代码 · 共 187 行

C
187
字号
/* * $Id: motifs.c 1339 2006-09-21 19:46:28Z tbailey $ *  * $Log$ * Revision 1.2  2005/10/25 19:06:39  nadya * rm old macro for Header, all info is taken care of by Id and Log. * * Revision 1.1.1.1  2005/07/29 17:23:18  nadya * Importing from meme-3.0.14, and adding configure/make * *//************************************************************************                                                                      **       MEME++                                                         **       Copyright 1994, The Regents of the University of California    **       Author: Timothy L. Bailey                                      **                                                                      ************************************************************************//**********************************************************************//*	read_motifs	Read in a motif file.	Put positives for motifs into hash tables.	If requested, the dataset 	is read in and sequences are encoded  as integers using hash().	Only, n_samples, sample_name, length, and res are valid in dataset.		Two formats are supported:		.motif file (known sites=positions in sequence)		MOTIF <motif_name> <motif_width>		[<seq#> <pos> <seq_name>]+		.tag file (known family members=sequences)		MOTIF <motif_name> 0		[<seq#> <seq#> <seq_name>]+		<seq#> = 0;	"?" in Prosite; ie, possible positive			<seq#> = 1;	"T/N" in Prosite; ie, tp or fn  	Returns number of motifs read in.*//**********************************************************************/#include "meme.h"#define MOTIF_HASH_SIZE 100000#define DATA_HASH_SIZE 100000#define RCHUNK 100extern int read_motifs (  FILE *fdata,				/* opened dataset file */  char *filename,			/* motif file */  MOTIF motif[NMOTIFS],			/* motif info */  BOOLEAN save_dataset,			/* return dataset in memory */  DATASET *dataset			/* integer-encoded dataset */){  int i, nmotifs;  FILE *fptr;  char seq_name[MSN+1];  HASH_TABLE ht_seq_names;		/* hash of dataset seq names */  char *sample_name;			/* name of sample from dataset */  char *id;				/* id of sample from dataset */  char *sequence;			/* sample from dataset */  long length;				/* length of sample from dataset */  int seq_no = 0;			/* number of sequences in dataset */  /* create a hash table of sequence names */  ht_seq_names = hash_create(DATA_HASH_SIZE);  /* hash the names in the dataset      so that motif files can be checked for bad ones; column is set to 0   */  while (read_sequence(fdata, &sample_name, &id, &sequence, &length)) {    /* skip sequence if there was an error */    if (length < 0) continue;    if (hash_lookup(sample_name, 0, ht_seq_names)) {      /* printf("Duplicate sequence: %s\n", sample_name); */      myfree(sample_name);      myfree(id);      myfree(sequence);      continue;     }    hash_insert(sample_name, 0, ht_seq_names);	/* put name in hash table */    if (save_dataset) {      /* create a sample and put the sample in the array of samples */      if ((seq_no % RCHUNK) == 0) {	Resize(dataset->samples,	  (seq_no + RCHUNK) * (int) sizeof(SAMPLE *), SAMPLE *);      }      dataset->samples[seq_no] = (SAMPLE *) mymalloc(sizeof(SAMPLE));      dataset->samples[seq_no]->sample_name = sample_name;      dataset->samples[seq_no]->length = length;      /* encode the sequence as integers */      for (i=0; i < length; ++i) {	int c = (int) sequence[i];	int e = hash(c);	if (e == -1) {	  printf("\nIllegal character %c in sequence %s.  ", c, sample_name);	  printf("Change alphabet or fix data file.\n");	  return 0;	}        sequence[i] = e;      }      dataset->samples[seq_no]->res = sequence;      dataset->n_samples = ++seq_no;      /*printf("\r%s", sample_name);*/    } else {      myfree(sample_name);      myfree(id);      myfree(sequence);    }  }  rewind(fdata);  /* open motif file */  fptr = fopen(filename, "r");  if (fptr == NULL) {    fprintf(stderr, "Cannot open file '%s'\n", filename);    exit(1);  }  /* clear number of positives for each motif and best ROC */  for (i=0; i<NMOTIFS; i++) {    motif[i].pos = 0;		/* known motif positives */    motif[i].roc = 0;		/* this motif not found yet */    motif[i].recall = 0;	/* this motif not found yet */  }  /* read in the motifs and store them */  nmotifs = 0;  while ( (i = fscanf(fptr, " MOTIF %s %d", motif[nmotifs].name,    &motif[nmotifs].width) ) == 2) {    int seq, off;			/* positon of positive occurrence */    motif[nmotifs].ht = hash_create(MOTIF_HASH_SIZE);    if (nmotifs == NMOTIFS) {      fprintf(stderr,         "Too many known motifs (> NMOTIFS) in file %s!\n", filename);      fprintf(stderr, "Recompile with larger value of NMOTIFS.\n");      exit(1);    }    while ((i = fgetc(fptr)) != '\n'); /* ignore anything else on line */    while ( (i = fscanf(fptr, "%d %d ", &seq, &off) ) == 2) {      int c;      /* get the name of the sequence from the motif file */      for (i=0; (c=fgetc(fptr)) != EOF; ) {        if (c=='\n') {ungetc(c, fptr); break;}        else if (i < MSN) seq_name[i++] = c;      }      if (i == 0) printf("motif file must include sequence names!\n");      seq_name[i] = '\0';			/* null terminate name */           /* check that name is valid */      if (!hash_lookup(seq_name, 0, ht_seq_names)) {         if (seq != 0) {		/* don't worry about possibles */          printf("sample `%s' in motif %d not in dataset\n", seq_name, nmotifs);        }      } else  {					/* store name and offset/tag */	/* put in table only if found in dataset */        hash_insert(seq_name, off, motif[nmotifs].ht);	/* site in hash table */        /* if seq == 0, it is a tag file and this is NOT a real positive 	   ie, it is marked as "?" in Prosite         */        if (seq != 0) motif[nmotifs].pos++;		/* number of pos's */      }      while ((i = fgetc(fptr)) != '\n'); /* ignore anything else on line */    }    nmotifs++;				/* number of motifs */    if (i == EOF) break;  }  if (i != EOF) {    fprintf(stderr, "Error reading motif file %s\n", filename);    exit(1);  }  return nmotifs;}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?