ureadseq.c
来自「EM算法的改进」· C语言 代码 · 共 1,911 行 · 第 1/4 页
C
1,911 行
if (V->f == NULL) V->err = eFileNotFound; else { for (l = skiplines_; l > 0; l--) getline1( V); do { getline1( V); for (l= strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ; } while ((l == 0) && !feof(V->f)); if (feof(V->f)) V->err = eNoData; else switch (format_) { case kPlain : readPlain(V); break; case kIG : readIG(V); break; case kStrider: readStrider(V); break; case kGenBank: readGenBank(V); break; case kPIR : readPIR(V); break; case kNBRF : readNBRF(V); break; case kPearson: readPearson(V); break; case kEMBL : readEMBL(V); break; case kZuker : readZuker(V); break; case kOlsen : readOlsen(V); break; case kMSF : readMSF(V); break; case kPAUP : { boolean done= false; boolean interleaved= false; char *cp; /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */ while (!done) { getline1( V); tolowerstr( V->s); if (strstr( V->s, "matrix")) done= true; if (strstr( V->s, "interleav")) interleaved= true; if (NULL != (cp=strstr( V->s, "ntax=")) ) V->topnseq= atoi(cp+5); if (NULL != (cp=strstr( V->s, "nchar=")) ) V->topseqlen= atoi(cp+6); if (NULL != (cp=strstr( V->s, "matchchar=")) ) { cp += 10; if (*cp=='\'') cp++; else if (*cp=='"') cp++; V->matchchar= *cp; } } if (interleaved) readPAUPinterleaved(V); else readPAUPsequential(V); } break; /* kPhylip: ! can't determine in middle of file which type it is...*/ /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */ case kPhylip2: readPhylipSequential(V); break; case kPhylip4: /* == kPhylip3 */ readPhylipInterleaved(V); break; default: V->err = eUnknownFormat; break; case kFitch : strcpy(V->seqid, V->s); getline1(V); readFitch(V); break; case kGCG: do { gotuw = (strstr(V->s,"..") != NULL); if (gotuw) readUWGCG(V); getline1(V); } while (!(feof(V->f) || V->allDone)); break; } } V->filestart= false; V->seq[V->seqlen] = 0; /* stick a string terminator on it */}char *readSeqFp( const short whichEntry_, /* index to sequence in file */ FILE *fp_, /* pointer to open seq file */ const long skiplines_, const short format_, /* sequence file format */ long *seqlen_, /* return seq size */ short *nseq_, /* number of seqs in file, for listSeqs() */ short *error_, /* return error */ char *seqid_) /* return seq name/info */{ struct ReadSeqVars V; if (format_ < kMinFormat || format_ > kMaxFormat) { *error_ = eUnknownFormat; *seqlen_ = 0; return NULL; } V.choice = whichEntry_; V.fname = NULL; /* don't know */ V.seq = (char*) calloc(1, kStartLength+1); V.maxseq = kStartLength; V.seqlen = 0; V.seqid = seqid_; V.f = fp_; V.filestart= (ftell( fp_) == 0); /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */ if (V.filestart) V.nseq = 0; else V.nseq= *nseq_; /* track where we are in file...*/ *V.seqid = '\0'; V.err = 0; V.nseq = 0; V.isseqchar = isSeqChar; if (V.choice == kListSequences) ; /* leave as is */ else if (V.choice <= 0) V.choice = 1; /* default ?? */ V.addit = (V.choice > 0); V.allDone = false; readSeqMain(&V, skiplines_, format_); *error_ = V.err; *seqlen_ = V.seqlen; *nseq_ = V.nseq; return V.seq;}char *readSeq( const short whichEntry_, /* index to sequence in file */ const char *filename_, /* file name */ const long skiplines_, const short format_, /* sequence file format */ long *seqlen_, /* return seq size */ short *nseq_, /* number of seqs in file, for listSeqs() */ short *error_, /* return error */ char *seqid_) /* return seq name/info */{ struct ReadSeqVars V; if (format_ < kMinFormat || format_ > kMaxFormat) { *error_ = eUnknownFormat; *seqlen_ = 0; return NULL; } V.choice = whichEntry_; V.fname = filename_; /* don't need to copy string, just ptr to it */ V.seq = (char*) calloc(1, kStartLength+1); V.maxseq = kStartLength; V.seqlen = 0; V.seqid = seqid_; V.f = NULL; *V.seqid = '\0'; V.err = 0; V.nseq = 0; V.isseqchar = isSeqChar; if (V.choice == kListSequences) ; /* leave as is */ else if (V.choice <= 0) V.choice = 1; /* default ?? */ V.addit = (V.choice > 0); V.allDone = false; V.f = fopen(V.fname, "r"); V.filestart= true; readSeqMain(&V, skiplines_, format_); if (V.f != NULL) fclose(V.f); *error_ = V.err; *seqlen_ = V.seqlen; *nseq_ = V.nseq; return V.seq;}char *listSeqs( const char *filename_, /* file name */ const long skiplines_, const short format_, /* sequence file format */ short *nseq_, /* number of seqs in file, for listSeqs() */ short *error_) /* return error */{ char seqid[MAXLINE]; long seqlen; return readSeq( kListSequences, filename_, skiplines_, format_, &seqlen, nseq_, error_, seqid);}short seqFileFormat( /* return sequence format number, see ureadseq.h */ const char *filename, long *skiplines, /* return #lines to skip any junk like mail header */ short *error) /* return any error value or 0 */{ FILE *fseq; short format; fseq = fopen(filename, "r"); format= seqFileFormatFp( fseq, skiplines, error); if (fseq!=NULL) fclose(fseq); return format;}short seqFileFormatFp( FILE *fseq, long *skiplines, /* return #lines to skip any junk like mail header */ short *error) /* return any error value or 0 */{ boolean foundIG= false, foundStrider= false, foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false, foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false, gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false, isfitch= false, isphylip= false, done= false; short format= kUnknown; int nlines= 0, k=0, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0; char sp[MAXLINE]; long linestart=0; int maxlines2check=500;#define ReadOneLine(sp) \ { done |= (feof(fseq)); \ readline( fseq, sp, &linestart); \ if (!done) { splen = strlen(sp); ++nlines; } } *skiplines = 0; *error = 0; if (fseq == NULL) { *error = eFileNotFound; return kNoformat; } while ( !done ) { ReadOneLine(sp); /* check for mailer head & skip past if found */ if (nlines < 4 && !done) { if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) { do { /* skip all lines until find one blank line */ ReadOneLine(sp); if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ; } while ((!done) && (k < splen)); *skiplines = nlines; /* !? do we want #lines or #bytes ?? */ } } if (sp==NULL || *sp==0) ; /* nada */ /* high probability identities: */ else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") ) gotMSF= true; else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL)) gotuw= true; else if (strstr(sp,"identity: Data:") != NULL) gotolsen= true; else if ( strstr(sp,"::=") && (strstr(sp,"Bioseq") || /* Bioseq or Bioseq-set */ strstr(sp,"Seq-entry") || strstr(sp,"Seq-submit") ) ) /* can we read submit format? */ gotasn1= true; else if ( strstr(sp,"#NEXUS") == sp ) gotpaup= true; /* uncertain identities: */ else if (*sp ==';') { if (strstr(sp,"Strider") !=NULL) foundStrider= true; else foundIG= true; } else if (strstr(sp,"LOCUS") == sp) foundGB= true; else if (strstr(sp,"ORIGIN") == sp) foundGB= true; else if (strstr(sp,"ENTRY ") == sp) /* ? also (strcmp(sp,"\\\\\\")==0) */ foundPIR= true; else if (strstr(sp,"SEQUENCE") == sp) foundPIR= true; else if (*sp == '>') { if (sp[3] == ';') foundNBRF= true; else foundPearson= true; } else if (strstr(sp,"ID ") == sp) foundEMBL= true; else if (strstr(sp,"SQ ") == sp) foundEMBL= true; else if (*sp == '(') foundZuker= true; else { if (nlines - *skiplines == 1) { int ispp= 0, ilen= 0; sscanf( sp, "%d%d", &ispp, &ilen); if (ispp > 0 && ilen > 0) isphylip= true; } else if (isphylip && nlines - *skiplines == 2) { int tseq; tseq= getseqtype(sp+10, strlen(sp+10)); if ( isalpha((int)*sp) /* 1st letter in 2nd line must be of a name */ && (tseq != kOtherSeq)) /* sequence section must be okay */ foundPhylip= true; } for (k=0, isfitch= true; isfitch & (k < splen); k++) { if (k % 4 == 0) isfitch &= (sp[k] == ' '); else isfitch &= (sp[k] != ' '); } if (isfitch & (splen > 20)) foundFitch= true; /* kRNA && kDNA are fairly certain...*/ switch (getseqtype( sp, splen)) { case kOtherSeq: otherlines++; break; case kAmino : if (splen>20) aminolines++; break; case kDNA : case kRNA : if (splen>20) dnalines++; break; case kNucleic : break; /* not much info ? */ } } /* pretty certain */ if (gotolsen) { format= kOlsen; done= true; } else if (gotMSF) { format= kMSF; done= true; } else if (gotasn1) { /* !! we need to look further and return kASNseqentry | kASNseqset */ /* seqentry key is Seq-entry ::= seqset key is Bioseq-set ::= ?? can't read these yet w/ ncbi tools ?? Seq-submit ::= Bioseq ::= << fails both bioseq-seq and seq-entry parsers ! */ if (strstr(sp,"Bioseq-set")) format= kASNseqset; else if (strstr(sp,"Seq-entry")) format= kASNseqentry; else format= kASN1; /* other form, we can't yet read... */ done= true; } else if (gotpaup) { format= kPAUP; done= true; } else if (gotuw) { if (foundIG) format= kIG; /* a TOIG file from GCG for certain */ else format= kGCG; done= true; } else if ((dnalines > 1) || done || (nlines > maxlines2check)) { /* decide on most likely format */ /* multichar idents: */ if (foundStrider) format= kStrider; else if (foundGB) format= kGenBank; else if (foundPIR) format= kPIR; else if (foundEMBL) format= kEMBL; else if (foundNBRF) format= kNBRF; /* single char idents: */ else if (foundIG) format= kIG; else if (foundPearson) format= kPearson; else if (foundZuker) format= kZuker; /* digit ident: */ else if (foundPhylip) format= kPhylip; /* spacing ident: */ else if (foundFitch) format= kFitch; /* no format chars: */ else if (otherlines > 0) format= kUnknown; else if (dnalines > 1) format= kPlain; else if (aminolines > 1) format= kPlain; else format= kUnknown; done= true; } /* need this for possible long header in olsen format */ else if (strstr(sp,"): ") != NULL) maxlines2check++; } if (format == kPhylip) { /* check for interleaved or sequential -- really messy */ int tname, tseq; long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0; char *ps; rewind(fseq); for (i=0; i < *skiplines; i++) ReadOneLine(sp); nlines= 0; ReadOneLine(sp); sscanf( sp, "%ld%ld", &nspp, &nlen); ReadOneLine(sp); /* 1st seq line */ for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint((int)*ps)) ilen++; for (i= 1; i<nspp; i++) { ReadOneLine(sp); tseq= getseqtype(sp+10, strlen(sp+10)); tname= getseqtype(sp, 10); for (j=0, ps= sp; isspace((int)*ps) && j<10; ps++, j++); for (ps= sp; *ps!=0; ps++) if (isprint((int)*ps)) ilen++; /* find probable interleaf or sequential ... */ if (j>=9) seq += 10; /* pretty certain not ileaf */ else { if (tseq != tname) leaf++; else seq++; if (tname == kDNA || tname == kRNA) seq++; else leaf++; } if (ilen <= nlen && j<9) { if (tname == kOtherSeq) leaf += 10; else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++; } else if (ilen > nlen) { ilen= 0; } } for ( nspp *= 2 ; i<nspp; i++) { /* this should be only bases if interleaf */ ReadOneLine(sp); tseq= getseqtype(sp+10, strlen(sp+10)); tname= getseqtype(sp, 10); for (ps= sp; *ps!=0; ps++) if (isprint((int)*ps)) ilen++; for (j=0, ps= sp; isspace((int)*ps) && j<10; ps++, j++); if (j<9) { if (tname == kOtherSeq) seq += 10; if (tseq != tname) seq++; else leaf++; if (tname == kDNA || tname == kRNA) leaf++; else seq++; } if (ilen > nlen) { if (j>9) leaf += 10; /* must be a name here for sequent */ else if (tname == kOtherSeq) seq += 10; ilen= 0; } } if (leaf > seq) format= kPhylip4; else format= kPhylip2; } return(format);#undef ReadOneLine} /* SeqFileFormat */unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal)/* GCGchecksum */{ register long i, check = 0, count = 0; for (i = 0; i < seqlen; i++) { count++; check += count * to_upper(seq[i]); if (count == 57) count = 0; }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?