ureadseq.c

来自「EM算法的改进」· C语言 代码 · 共 1,911 行 · 第 1/4 页

C
1,911
字号
  if (V->f == NULL)    V->err = eFileNotFound;  else {    for (l = skiplines_; l > 0; l--) getline1( V);    do {      getline1( V);      for (l= strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ;    } while ((l == 0) && !feof(V->f));    if (feof(V->f)) V->err = eNoData;    else switch (format_) {      case kPlain : readPlain(V); break;      case kIG    : readIG(V); break;      case kStrider: readStrider(V); break;      case kGenBank: readGenBank(V); break;      case kPIR   : readPIR(V); break;      case kNBRF  : readNBRF(V); break;      case kPearson: readPearson(V); break;      case kEMBL  : readEMBL(V); break;      case kZuker : readZuker(V); break;      case kOlsen : readOlsen(V); break;      case kMSF   : readMSF(V); break;      case kPAUP    : {        boolean done= false;        boolean interleaved= false;        char  *cp;        /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */        while (!done) {          getline1( V);          tolowerstr( V->s);          if (strstr( V->s, "matrix")) done= true;          if (strstr( V->s, "interleav")) interleaved= true;          if (NULL != (cp=strstr( V->s, "ntax=")) )  V->topnseq= atoi(cp+5);          if (NULL != (cp=strstr( V->s, "nchar=")) )  V->topseqlen= atoi(cp+6);          if (NULL != (cp=strstr( V->s, "matchchar=")) )  {            cp += 10;            if (*cp=='\'') cp++;            else if (*cp=='"') cp++;            V->matchchar= *cp;            }          }        if (interleaved) readPAUPinterleaved(V);        else readPAUPsequential(V);        }        break;      /* kPhylip: ! can't determine in middle of file which type it is...*/      /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */      case kPhylip2:        readPhylipSequential(V);        break;      case kPhylip4: /* == kPhylip3 */        readPhylipInterleaved(V);        break;      default:        V->err = eUnknownFormat;        break;      case kFitch :        strcpy(V->seqid, V->s); getline1(V);        readFitch(V);        break;      case kGCG:        do {          gotuw = (strstr(V->s,"..") != NULL);          if (gotuw) readUWGCG(V);          getline1(V);        } while (!(feof(V->f) || V->allDone));        break;      }    }  V->filestart= false;  V->seq[V->seqlen] = 0; /* stick a string terminator on it */}char *readSeqFp(      const short whichEntry_,  /* index to sequence in file */      FILE  *fp_,   /* pointer to open seq file */      const long  skiplines_,      const short format_,      /* sequence file format */      long  *seqlen_,     /* return seq size */      short *nseq_,       /* number of seqs in file, for listSeqs() */      short *error_,      /* return error */      char  *seqid_)      /* return seq name/info */{  struct ReadSeqVars V;  if (format_ < kMinFormat || format_ > kMaxFormat) {    *error_ = eUnknownFormat;    *seqlen_ = 0;    return NULL;    }  V.choice = whichEntry_;  V.fname  = NULL;  /* don't know */  V.seq    = (char*) calloc(1, kStartLength+1);  V.maxseq = kStartLength;  V.seqlen = 0;  V.seqid  = seqid_;  V.f = fp_;  V.filestart= (ftell( fp_) == 0);  /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */  if (V.filestart)  V.nseq = 0;  else V.nseq= *nseq_;  /* track where we are in file...*/  *V.seqid = '\0';  V.err = 0;  V.nseq = 0;  V.isseqchar = isSeqChar;  if (V.choice == kListSequences) ; /* leave as is */  else if (V.choice <= 0) V.choice = 1; /* default ?? */  V.addit = (V.choice > 0);  V.allDone = false;  readSeqMain(&V, skiplines_, format_);  *error_ = V.err;  *seqlen_ = V.seqlen;  *nseq_ = V.nseq;  return V.seq;}char *readSeq(      const short whichEntry_,  /* index to sequence in file */      const char  *filename_,   /* file name */      const long  skiplines_,      const short format_,      /* sequence file format */      long  *seqlen_,     /* return seq size */      short *nseq_,       /* number of seqs in file, for listSeqs() */      short *error_,      /* return error */      char  *seqid_)      /* return seq name/info */{  struct ReadSeqVars V;  if (format_ < kMinFormat || format_ > kMaxFormat) {    *error_ = eUnknownFormat;    *seqlen_ = 0;    return NULL;    }  V.choice = whichEntry_;  V.fname  = filename_;  /* don't need to copy string, just ptr to it */  V.seq    = (char*) calloc(1, kStartLength+1);  V.maxseq = kStartLength;  V.seqlen = 0;  V.seqid  = seqid_;  V.f = NULL;  *V.seqid = '\0';  V.err = 0;  V.nseq = 0;  V.isseqchar = isSeqChar;  if (V.choice == kListSequences) ; /* leave as is */  else if (V.choice <= 0) V.choice = 1; /* default ?? */  V.addit = (V.choice > 0);  V.allDone = false;  V.f = fopen(V.fname, "r");  V.filestart= true;  readSeqMain(&V, skiplines_, format_);  if (V.f != NULL) fclose(V.f);  *error_ = V.err;  *seqlen_ = V.seqlen;  *nseq_ = V.nseq;  return V.seq;}char *listSeqs(      const char  *filename_,   /* file name */      const long skiplines_,      const short format_,      /* sequence file format */      short *nseq_,       /* number of seqs in file, for listSeqs() */      short *error_)      /* return error */{  char  seqid[MAXLINE];  long  seqlen;  return readSeq( kListSequences, filename_, skiplines_, format_,                  &seqlen, nseq_, error_, seqid);}short seqFileFormat(    /* return sequence format number, see ureadseq.h */    const char *filename,    long  *skiplines,   /* return #lines to skip any junk like mail header */    short *error)       /* return any error value or 0 */{  FILE      *fseq;  short      format;  fseq  = fopen(filename, "r");  format= seqFileFormatFp( fseq, skiplines, error);  if (fseq!=NULL) fclose(fseq);  return format;}short seqFileFormatFp(    FILE *fseq,    long  *skiplines,   /* return #lines to skip any junk like mail header */    short *error)       /* return any error value or 0 */{  boolean   foundIG= false, foundStrider= false,            foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false,            foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false,            gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false,            isfitch= false,  isphylip= false, done= false;  short     format= kUnknown;  int       nlines= 0, k=0, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0;  char      sp[MAXLINE];  long      linestart=0;  int     maxlines2check=500;#define ReadOneLine(sp)   \  { done |= (feof(fseq)); \    readline( fseq, sp, &linestart);  \    if (!done) { splen = strlen(sp); ++nlines; } }  *skiplines = 0;  *error = 0;  if (fseq == NULL) { *error = eFileNotFound;  return kNoformat; }  while ( !done ) {    ReadOneLine(sp);    /* check for mailer head & skip past if found */    if (nlines < 4 && !done) {      if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) {        do {          /* skip all lines until find one blank line */          ReadOneLine(sp);          if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ;          } while ((!done) && (k < splen));        *skiplines = nlines; /* !? do we want #lines or #bytes ?? */        }      }    if (sp==NULL || *sp==0)      ; /* nada */    /* high probability identities: */    else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") )      gotMSF= true;    else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL))      gotuw= true;    else if (strstr(sp,"identity:   Data:") != NULL)      gotolsen= true;    else if ( strstr(sp,"::=") &&      (strstr(sp,"Bioseq") ||       /* Bioseq or Bioseq-set */       strstr(sp,"Seq-entry") ||       strstr(sp,"Seq-submit") ) )  /* can we read submit format? */          gotasn1= true;    else if ( strstr(sp,"#NEXUS") == sp )      gotpaup= true;    /* uncertain identities: */    else if (*sp ==';') {      if (strstr(sp,"Strider") !=NULL) foundStrider= true;      else foundIG= true;      }    else if (strstr(sp,"LOCUS") == sp)      foundGB= true;    else if (strstr(sp,"ORIGIN") == sp)      foundGB= true;    else if (strstr(sp,"ENTRY   ") == sp)  /* ? also (strcmp(sp,"\\\\\\")==0) */      foundPIR= true;    else if (strstr(sp,"SEQUENCE") == sp)      foundPIR= true;    else if (*sp == '>') {      if (sp[3] == ';') foundNBRF= true;      else foundPearson= true;      }    else if (strstr(sp,"ID   ") == sp)      foundEMBL= true;    else if (strstr(sp,"SQ   ") == sp)      foundEMBL= true;    else if (*sp == '(')      foundZuker= true;    else {      if (nlines - *skiplines == 1) {        int ispp= 0, ilen= 0;        sscanf( sp, "%d%d", &ispp, &ilen);        if (ispp > 0 && ilen > 0) isphylip= true;        }      else if (isphylip && nlines - *skiplines == 2) {        int  tseq;        tseq= getseqtype(sp+10, strlen(sp+10));        if ( isalpha((int)*sp)     /* 1st letter in 2nd line must be of a name */         && (tseq != kOtherSeq))  /* sequence section must be okay */            foundPhylip= true;        }      for (k=0, isfitch= true; isfitch & (k < splen); k++) {        if (k % 4 == 0) isfitch &= (sp[k] == ' ');        else isfitch &= (sp[k] != ' ');        }      if (isfitch & (splen > 20)) foundFitch= true;      /* kRNA && kDNA are fairly certain...*/      switch (getseqtype( sp, splen)) {        case kOtherSeq: otherlines++; break;        case kAmino   : if (splen>20) aminolines++; break;        case kDNA     :        case kRNA     : if (splen>20) dnalines++; break;        case kNucleic : break; /* not much info ? */        }      }                    /* pretty certain */    if (gotolsen) {      format= kOlsen;      done= true;      }    else if (gotMSF) {      format= kMSF;      done= true;      }    else if (gotasn1) {      /* !! we need to look further and return  kASNseqentry | kASNseqset */      /*        seqentry key is Seq-entry ::=        seqset key is Bioseq-set ::=        ?? can't read these yet w/ ncbi tools ??          Seq-submit ::=          Bioseq ::=  << fails both bioseq-seq and seq-entry parsers !      */      if (strstr(sp,"Bioseq-set")) format= kASNseqset;      else if (strstr(sp,"Seq-entry")) format= kASNseqentry;      else format= kASN1;  /* other form, we can't yet read... */      done= true;      }    else if (gotpaup) {      format= kPAUP;      done= true;      }    else if (gotuw) {      if (foundIG) format= kIG;  /* a TOIG file from GCG for certain */      else format= kGCG;      done= true;      }    else if ((dnalines > 1) || done || (nlines > maxlines2check)) {          /* decide on most likely format */          /* multichar idents: */      if (foundStrider) format= kStrider;      else if (foundGB) format= kGenBank;      else if (foundPIR) format= kPIR;      else if (foundEMBL) format= kEMBL;      else if (foundNBRF) format= kNBRF;          /* single char idents: */      else if (foundIG) format= kIG;      else if (foundPearson) format= kPearson;      else if (foundZuker) format= kZuker;          /* digit ident: */      else if (foundPhylip) format= kPhylip;          /* spacing ident: */      else if (foundFitch) format= kFitch;          /* no format chars: */      else if (otherlines > 0) format= kUnknown;      else if (dnalines > 1) format= kPlain;      else if (aminolines > 1) format= kPlain;      else format= kUnknown;      done= true;      }    /* need this for possible long header in olsen format */     else if (strstr(sp,"): ") != NULL)       maxlines2check++;    }  if (format == kPhylip) {    /* check for interleaved or sequential -- really messy */    int tname, tseq;    long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0;    char  *ps;    rewind(fseq);    for (i=0; i < *skiplines; i++) ReadOneLine(sp);    nlines= 0;    ReadOneLine(sp);    sscanf( sp, "%ld%ld", &nspp, &nlen);    ReadOneLine(sp); /* 1st seq line */    for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint((int)*ps)) ilen++;    for (i= 1; i<nspp; i++) {      ReadOneLine(sp);      tseq= getseqtype(sp+10, strlen(sp+10));      tname= getseqtype(sp, 10);      for (j=0, ps= sp; isspace((int)*ps) && j<10; ps++, j++);      for (ps= sp; *ps!=0; ps++) if (isprint((int)*ps)) ilen++;      /* find probable interleaf or sequential ... */      if (j>=9) seq += 10; /* pretty certain not ileaf */      else {        if (tseq != tname) leaf++; else seq++;        if (tname == kDNA || tname == kRNA) seq++; else leaf++;        }      if (ilen <= nlen && j<9) {        if (tname == kOtherSeq) leaf += 10;        else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++;        }      else if (ilen > nlen) {        ilen= 0;        }      }    for ( nspp *= 2 ; i<nspp; i++) {  /* this should be only bases if interleaf */      ReadOneLine(sp);      tseq= getseqtype(sp+10, strlen(sp+10));      tname= getseqtype(sp, 10);      for (ps= sp; *ps!=0; ps++) if (isprint((int)*ps)) ilen++;      for (j=0, ps= sp; isspace((int)*ps) && j<10; ps++, j++);      if (j<9) {        if (tname == kOtherSeq) seq += 10;        if (tseq != tname) seq++; else leaf++;        if (tname == kDNA || tname == kRNA) leaf++; else seq++;        }      if (ilen > nlen) {        if (j>9) leaf += 10; /* must be a name here for sequent */        else if (tname == kOtherSeq) seq += 10;        ilen= 0;        }      }    if (leaf > seq) format= kPhylip4;    else format= kPhylip2;    }  return(format);#undef  ReadOneLine} /* SeqFileFormat */unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal)/* GCGchecksum */{  register long  i, check = 0, count = 0;  for (i = 0; i < seqlen; i++) {    count++;    check += count * to_upper(seq[i]);    if (count == 57) count = 0;    }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?