ureadseq.c

来自「EM算法的改进」· C语言 代码 · 共 1,911 行 · 第 1/4 页

C
1,911
字号
/* * $Id: ureadseq.c 1339 2006-09-21 19:46:28Z tbailey $ *  * $Log$ * Revision 1.2  2006/03/08 20:50:11  nadya * merge chamges from v3_5_2 branch * * Revision 1.1.1.1.4.1  2006/01/26 08:34:26  tbailey * Renamed local function getline() to getline1() to avoid conflict * with system function defined in stdio.h * * Revision 1.1.1.1  2005/07/29 17:19:22  nadya * Importing from meme-3.0.14, and adding configure/make * *//* File: ureadseq.c * * Reads and writes nucleic/protein sequence in various * formats. Data files may have multiple sequences. * * Copyright 1990 by d.g.gilbert * biology dept., indiana university, bloomington, in 47405 * e-mail: gilbertd@bio.indiana.edu * * This program may be freely copied and used by anyone. * Developers are encourged to incorporate parts in their * programs, rather than devise their own private sequence * format. * * This should compile and run with any ANSI C compiler. * */#define UREADSEQ_G#include "ureadseq.h"/* strlcpy is missing from some LINUX */#if defined(Linux)static size_t strlcpy(char *dst, const char *src, size_t dstsize){  int i;  for (i=0; src[i] != '\0'; i++) {    if (i<dstsize) dst[i] = src[i];  }  if (i<dstsize) dst[i] = '\0'; else dst[dstsize-1] = '\0';  return(i); }#endifint Strcasecmp(const char *a, const char *b)  /* from Nlm_StrICmp */{  int diff, done;  if (a == b)  return 0;  done = 0;  while (! done) {    diff = to_upper(*a) - to_upper(*b);    if (diff) return diff;    if (*a == '\0') done = 1;    else { a++; b++; }    }  return 0;}int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */{  int diff, done;  if (a == b)  return 0;  done = 0;  while (! done) {    diff = to_upper(*a) - to_upper(*b);    if (diff) return diff;    if (*a == '\0') done = 1;    else {      a++; b++; maxn--;      if (! maxn) done = 1;      }    }  return 0;}#ifndef Local# define Local      static    /* local functions */#endif#define kStartLength  500const char *aminos      = "ABCDEFGHIKLMNPQRSTVWXYZ*";const char *primenuc    = "ACGTU";const char *protonly    = "EFIPQZ";const char kNocountsymbols[5]  = "_.-?";const char stdsymbols[6]  = "_.-*?";const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\";static const char *seqsymbols   = allsymbols;const char nummask[11]   = "0123456789";const char nonummask[11] = "~!@#$%^&*(";/*    use general form of isseqchar -- all chars + symbols.    no formats except nbrf (?) use symbols in data area as    anything other than sequence chars.*/                          /* Local variables for readSeq: */struct ReadSeqVars {  short choice, err, nseq;  long  seqlen, maxseq, seqlencount;  short topnseq;  long  topseqlen;  const char *fname;  char *seq, *seqid, matchchar;  boolean allDone, done, filestart, addit;  FILE  *f;  long  linestart;  char  s[MAXLINE], *sp;  int  (*isseqchar)(int c);	/* tlb 3/1/96 */  /*int (*isseqchar)();*/  /* int  (*isseqchar)(int c);  << sgi cc hates (int c) */};int isSeqChar(int c){  return (isalpha((int)c) || strchr(seqsymbols,c));}int isSeqNumChar(int c){  return (isalnum((int)c) || strchr(seqsymbols,c));}int isAnyChar(int c){  return isascii(c); /* wrap in case isascii is macro */}Local void readline(FILE *f, char *s, long *linestart){  char  *cp;  *linestart= ftell(f);  if (NULL == fgets(s, MAXLINE, f))    *s = 0;  else {    cp = strchr(s, '\n');    if (cp != NULL) *cp = 0;    }}Local void getline1(struct ReadSeqVars *V){  readline(V->f, V->s, &V->linestart);}Local void ungetline(struct ReadSeqVars *V){  fseek(V->f, V->linestart, 0);}Local void addseq(char *s, struct ReadSeqVars *V){  char  *ptr;  if (V->addit) while (*s != 0) {    if ((V->isseqchar)(*s)) {      if (V->seqlen >= V->maxseq) {        V->maxseq += kStartLength;        ptr = (char*) realloc(V->seq, V->maxseq+1);        if (ptr==NULL) {          V->err = eMemFull;          return;          }        else V->seq = ptr;        }      V->seq[(V->seqlen)++] = *s;      }    s++;    }}Local void countseq(char *s, struct ReadSeqVars *V) /* this must count all valid seq chars, for some formats (paup-sequential) even    if we are skipping seq... */{  while (*s != 0) {    if ((V->isseqchar)(*s)) {      (V->seqlencount)++;      }    s++;    }}Local void addinfo(char *s, struct ReadSeqVars *V){  char s2[256], *si;  boolean saveadd;  si = s2;  while (*s == ' ') s++;  sprintf(si, " %d)  %s\n", V->nseq, s);  saveadd = V->addit;  V->addit = true;  V->isseqchar = isAnyChar;  addseq( si, V);  V->addit = saveadd;  V->isseqchar = isSeqChar;}Local void readLoop(short margin, boolean addfirst,            boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V),            struct ReadSeqVars *V){  boolean addend = false;  boolean ungetend = false;  V->nseq++;  if (V->choice == kListSequences) V->addit = false;  else V->addit = (V->nseq == V->choice);  if (V->addit) V->seqlen = 0;  if (addfirst) addseq(V->s, V);  do {    getline1(V);    V->done = feof(V->f);    V->done |= (*endTest)( &addend, &ungetend, V);    if (V->addit && (addend || !V->done) && ((int) strlen(V->s) > margin)) {      addseq( (V->s)+margin, V);    }  } while (!V->done);  if (V->choice == kListSequences) addinfo(V->seqid, V);  else {    V->allDone = (V->nseq >= V->choice);    if (V->allDone && ungetend) ungetline(V);    }}Local boolean endIG( boolean *addend, boolean *ungetend, struct ReadSeqVars *V){  *addend = true; /* 1 or 2 occur in line w/ bases */  *ungetend= false;  return((strchr(V->s,'1')!=NULL) || (strchr(V->s,'2')!=NULL));}Local void readIG(struct ReadSeqVars *V){/* 18Aug92: new IG format -- ^L between sequences in place of ";" */  char  *si;  while (!V->allDone) {    do {      getline1(V);      for (si= V->s; *si != 0 && *si < ' '; si++) *si= ' '; /* drop controls */      if (*si == 0) *V->s= 0; /* chop line to empty */    } while (! (feof(V->f) || ((*V->s != 0) && (*V->s != ';') ) ));    if (feof(V->f))      V->allDone = true;    else {      strcpy(V->seqid, V->s);      readLoop(0, false, endIG, V);      }  }}Local boolean endStrider( boolean *addend, boolean *ungetend, struct ReadSeqVars *V){  *addend = false;  *ungetend= false;  return (strstr( V->s, "//") != NULL);}Local void readStrider(struct ReadSeqVars *V){ /* ? only 1 seq/file ? */  while (!V->allDone) {    getline1(V);    if (strstr(V->s,"; DNA sequence  ") == V->s)      strcpy(V->seqid, (V->s)+16);    else      strcpy(V->seqid, (V->s)+1);    while ((!feof(V->f)) && (*V->s == ';')) {      getline1(V);      }    if (feof(V->f)) V->allDone = true;    else readLoop(0, true, endStrider, V);  }}Local boolean endPIR( boolean *addend, boolean *ungetend, struct ReadSeqVars *V){  *addend = false;  *ungetend= (strstr(V->s,"ENTRY") == V->s);  return ((strstr(V->s,"///") != NULL) || *ungetend);}Local void readPIR(struct ReadSeqVars *V){ /*PIR -- many seqs/file */  while (!V->allDone) {    while (! (feof(V->f) || strstr(V->s,"ENTRY")  || strstr(V->s,"SEQUENCE")) )      getline1(V);    strcpy(V->seqid, (V->s)+16);    while (! (feof(V->f) || strstr(V->s,"SEQUENCE") == V->s))      getline1(V);    readLoop(0, false, endPIR, V);    if (!V->allDone) {     while (! (feof(V->f) || ((*V->s != 0)       && (strstr( V->s,"ENTRY") == V->s))))        getline1(V);      }    if (feof(V->f)) V->allDone = true;  }}Local boolean endGB( boolean *addend, boolean *ungetend, struct ReadSeqVars *V){  *addend = false;  *ungetend= (strstr(V->s,"LOCUS") == V->s);  return ((strstr(V->s,"//") != NULL) || *ungetend);}Local void readGenBank(struct ReadSeqVars *V){ /*GenBank -- many seqs/file */  while (!V->allDone) {    strcpy(V->seqid, (V->s)+12);    while (! (feof(V->f) || strstr(V->s,"ORIGIN") == V->s))      getline1(V);    readLoop(0, false, endGB, V);    if (!V->allDone) {     while (! (feof(V->f) || ((*V->s != 0)       && (strstr( V->s,"LOCUS") == V->s))))        getline1(V);      }    if (feof(V->f)) V->allDone = true;  }}Local boolean endNBRF( boolean *addend, boolean *ungetend, struct ReadSeqVars *V){  char  *a;  if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */    /* "*" can be valid base symbol, drop it here */    *a = 0;    *addend = true;    *ungetend= false;    return(true);    }  else if (*V->s == '>') { /* start of next seq */    *addend = false;    *ungetend= true;    return(true);    }  else    return(false);}Local void readNBRF(struct ReadSeqVars *V){  while (!V->allDone) {    strcpy(V->seqid, (V->s)+4);    getline1(V);   /*skip title-junk line*/    readLoop(0, false, endNBRF, V);    if (!V->allDone) {     while (!(feof(V->f) || (*V->s != 0 && *V->s == '>')))        getline1(V);      }    if (feof(V->f)) V->allDone = true;  }}Local boolean endPearson( boolean *addend, boolean *ungetend, struct ReadSeqVars *V){  *addend = false;  *ungetend= true;  return(*V->s == '>');}Local void readPearson(struct ReadSeqVars *V){  while (!V->allDone) {    strlcpy(V->seqid, (V->s)+1, MAXLINE);    readLoop(0, false, endPearson, V);    if (!V->allDone) {     while (!(feof(V->f) || ((*V->s != 0) && (*V->s == '>'))))        getline1(V);      }    if (feof(V->f)) V->allDone = true;  }}Local boolean endEMBL( boolean *addend, boolean *ungetend, struct ReadSeqVars *V){  *addend = false;  *ungetend= (strstr(V->s,"ID   ") == V->s);  return ((strstr(V->s,"//") != NULL) || *ungetend);}Local void readEMBL(struct ReadSeqVars *V){  while (!V->allDone) {    strcpy(V->seqid, (V->s)+5);    do {      getline1(V);    } while (!(feof(V->f) | (strstr(V->s,"SQ   ") == V->s)));    readLoop(0, false, endEMBL, V);    if (!V->allDone) {      while (!(feof(V->f) |         ((*V->s != '\0') & (strstr(V->s,"ID   ") == V->s))))      getline1(V);    }    if (feof(V->f)) V->allDone = true;  }}Local boolean endZuker( boolean *addend, boolean *ungetend, struct ReadSeqVars *V){  *addend = false;  *ungetend= true;  return( *V->s == '(' );}Local void readZuker(struct ReadSeqVars *V){  /*! 1st string is Zuker's Fortran format */  while (!V->allDone) {    getline1(V);  /*s == "seqLen seqid string..."*/    strcpy(V->seqid, (V->s)+6);    readLoop(0, false, endZuker, V);    if (!V->allDone) {      while (!(feof(V->f) |        ((*V->s != '\0') & (*V->s == '('))))          getline1(V);      }    if (feof(V->f)) V->allDone = true;  }}Local boolean endFitch( boolean *addend, boolean *ungetend, struct ReadSeqVars *V){

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?