📄 sqio.c
字号:
voidToIUPAC(char *seq) { for (; *seq != '\0'; seq++) if (strchr(NUCLEOTIDES, *seq) == NULL && ! isgap(*seq)) *seq = 'N';}/* Function: addseq() * * Purpose: Add a line of sequence to the growing string in V. * Skip all nonalphabetic characters in the input string: * in particular, spaces and digits (coordinates). This * allows us to generically read sequence data from most * any format. */static void addseq(char *s, struct ReadSeqVars *V){ char *s0; char *sq; int rpl; /* valid residues per line */ int bpl; /* characters per line */ if (V->ssimode == -1) { /* Normal mode: keeping the seq */ /* Make sure we have enough room. We know that s is <= buflen, * so just make sure we've got room for a whole new buflen worth * of sequence. */ if (V->seqlen + V->buflen > V->maxseq) { V->maxseq += MAX(V->buflen, kStartLength); V->seq = ReallocOrDie (V->seq, V->maxseq+1); } s0 = s; sq = V->seq + V->seqlen; while (*s != 0) { if (isalpha((int) *s)) { *sq = *s; sq++; } s++; } V->seqlen = sq - V->seq; } else /* else: indexing mode, discard the seq */ { s0 = s; rpl = 0; while (*s != 0) { if (isalpha((int) *s)) rpl++; s++; } V->seqlen += rpl; bpl = s - s0; /* Keep track of the global rpl, bpl for the file. * This is overly complicated because we have to * allow the last line of each record (e.g. the last addseq() call * on each sequence) to have a different length - and sometimes * we'll have one-line sequence records, too. Thus we only * do something with the global V->rpl when we have *passed over* * a line - we keep the last line's rpl in last_rpl. And because * a file might consist entirely of single-line records, we keep * a third guy, maxrpl, that tells us the maximum rpl of any line * in the file. If we reach the end of file and rpl is still unset, * we'll set it to maxrpl. If we reach eof and rpl is set, but is * less than maxrpl, that's a weird case where a last line in some * record is longer than every other line. */ if (V->rpl != 0) { /* 0 means we already know rpl is invalid */ if (V->lastrpl > 0) { /* we're on something that's not the first line */ if (V->rpl > 0 && V->lastrpl != V->rpl) V->rpl = 0; else if (V->rpl == -1) V->rpl = V->lastrpl; } V->lastrpl = rpl; if (rpl > V->maxrpl) V->maxrpl = rpl; /* make sure we check max length of final lines */ } if (V->bpl != 0) { /* 0 means we already know bpl is invalid */ if (V->lastbpl > 0) { /* we're on something that's not the first line */ if (V->bpl > 0 && V->lastbpl != V->bpl) V->bpl = 0; else if (V->bpl == -1) V->bpl = V->lastbpl; } V->lastbpl = bpl; if (bpl > V->maxbpl) V->maxbpl = bpl; /* make sure we check max length of final lines */ } } /* end of indexing mode of addseq(). */}static void readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V){ int addend = 0; int done = 0; V->seqlen = 0; V->lastrpl = V->lastbpl = 0; if (addfirst) { if (V->ssimode >= 0) V->d_off = V->ssioffset; addseq(V->buf, V); } else if (V->ssimode >= 0) if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->d_off))) Die("SSIGetFilePosition() failed"); do { SeqfileGetLine(V); /* feof() alone is a bug; files not necessarily \n terminated */ if (*(V->buf) == '\0' && feof(V->f)) done = TRUE; done |= (*endTest)(V->buf, &addend); if (addend || !done) addseq(V->buf, V); } while (!done);}static intendPIR(char *s, int *addend){ *addend = 0; if ((strncmp(s, "///", 3) == 0) || (strncmp(s, "ENTRY", 5) == 0)) return 1; else return 0;}static voidreadPIR(struct ReadSeqVars *V){ char *sptr; /* load first line of entry */ while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) { SeqfileGetLine(V); } if (feof(V->f)) return; if (V->ssimode >= 0) V->r_off = V->ssioffset; if ((sptr = strtok(V->buf + 15, "\n\t ")) != NULL) { SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); } do { SeqfileGetLine(V); if (!feof(V->f) && strncmp(V->buf, "TITLE", 5) == 0) SetSeqinfoString(V->sqinfo, V->buf+15, SQINFO_DESC); else if (!feof(V->f) && strncmp(V->buf, "ACCESSION", 9) == 0) { if ((sptr = strtok(V->buf+15, " \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); } } while (! feof(V->f) && (strncmp(V->buf,"SEQUENCE", 8) != 0)); SeqfileGetLine(V); /* skip next line, coords */ readLoop(0, endPIR, V); /* reading a real PIR-CODATA database file, we keep the source coords */ V->sqinfo->start = 1; V->sqinfo->stop = V->seqlen; V->sqinfo->olen = V->seqlen; V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; /* get next line */ while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) { SeqfileGetLine(V); }}static int endIG(char *s, int *addend){ *addend = 1; /* 1 or 2 occur in line w/ bases */ return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL));}static void readIG(struct ReadSeqVars *V){ char *nm; /* position past ';' comments */ do { SeqfileGetLine(V); } while (! (feof(V->f) || ((*V->buf != 0) && (*V->buf != ';')) )); if (!feof(V->f)) { if ((nm = strtok(V->buf, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); readLoop(0, endIG, V); } while (!(feof(V->f) || ((*V->buf != '\0') && (*V->buf == ';')))) SeqfileGetLine(V);}static int endStrider(char *s, int *addend){ *addend = 0; return (strstr( s, "//") != NULL);}static void readStrider(struct ReadSeqVars *V){ char *nm; while ((!feof(V->f)) && (*V->buf == ';')) { if (strncmp(V->buf,"; DNA sequence", 14) == 0) { if ((nm = strtok(V->buf+16, ",\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); } SeqfileGetLine(V); } if (! feof(V->f)) readLoop(1, endStrider, V); /* load next line */ while ((!feof(V->f)) && (*V->buf != ';')) SeqfileGetLine(V);}static int endGB(char *s, int *addend){ *addend = 0; return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s));}static void readGenBank(struct ReadSeqVars *V){ char *sptr; int in_definition; while (strncmp(V->buf, "LOCUS", 5) != 0) { SeqfileGetLine(V); } if (V->ssimode >= 0) V->r_off = V->ssioffset; if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL) { SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); } in_definition = FALSE; while (! feof(V->f)) { SeqfileGetLine(V); if (! feof(V->f) && strstr(V->buf, "DEFINITION") == V->buf) { if ((sptr = strtok(V->buf+12, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); in_definition = TRUE; } else if (! feof(V->f) && strstr(V->buf, "ACCESSION") == V->buf) { if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); in_definition = FALSE; } else if (strncmp(V->buf,"ORIGIN", 6) != 0) { if (in_definition) SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC); } else break; } readLoop(0, endGB, V); /* reading a real GenBank database file, we keep the source coords */ V->sqinfo->start = 1; V->sqinfo->stop = V->seqlen; V->sqinfo->olen = V->seqlen; V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; while (!(feof(V->f) || ((*V->buf!=0) && (strstr(V->buf,"LOCUS") == V->buf)))) SeqfileGetLine(V); /* SRE: V->s now holds "//", so sequential reads are wedged: fixed Tue Jul 13 1993 */ while (!feof(V->f) && strstr(V->buf, "LOCUS ") != V->buf) SeqfileGetLine(V);}static intendGCGdata(char *s, int *addend) { *addend = 0; return (*s == '>');}static voidreadGCGdata(struct ReadSeqVars *V){ int binary = FALSE; /* whether data are binary or not */ int blen = 0; /* length of binary sequence */ /* first line contains ">>>>" followed by name */ if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->buf, 2)) { binary = TRUE; SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); blen = atoi(sqd_parse[2]); } else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->buf, 1)) SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); else Die("bogus GCGdata format? %s", V->buf); /* second line contains free text description */ SeqfileGetLine(V); SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC); if (binary) { /* allocate for blen characters +3... (allow for 3 bytes of slop) */ if (blen >= V->maxseq) { V->maxseq = blen; if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL) Die("malloc failed"); } /* read (blen+3)/4 bytes from file */ if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4)) Die("fread failed"); V->seqlen = blen; /* convert binary code to seq */ GCGBinaryToSequence(V->seq, blen); } else readLoop(0, endGCGdata, V); while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>')))) SeqfileGetLine(V);}static intendPearson(char *s, int *addend){ *addend = 0; return(*s == '>');}static void readPearson(struct ReadSeqVars *V){ char *sptr; if (V->ssimode >= 0) V->r_off = V->ssioffset; if (*V->buf != '>') Die("\File %s does not appear to be in FASTA format at line %d.\n\You may want to invoke the Babelfish to autodetect your file's format.\n\Usually this is done with a -B option.\n", V->fname, V->linenumber); if ((sptr = strtok(V->buf+1, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); if ((sptr = strtok(NULL, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); readLoop(0, endPearson, V); while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>')))) { SeqfileGetLine(V); }}static intendEMBL(char *s, int *addend){ *addend = 0; /* Some people (Berlin 5S rRNA database, f'r instance) use * an extended EMBL format that attaches extra data after * the sequence -- watch out for that. We use the fact that * real EMBL sequence lines begin with five spaces. * * We can use this as the sole end test because readEMBL() will * advance to the next ID line before starting to read again. */ return (strncmp(s," ",5) != 0);/* return ((strstr(s,"//") != NULL) || (strstr(s,"ID ") == s)); */}static void readEMBL(struct ReadSeqVars *V){ char *sptr; /* make sure we have first line */ while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) { SeqfileGetLine(V); } if (V->ssimode >= 0) V->r_off = V->ssioffset; if ((sptr = strtok(V->buf+5, "\n\t ")) != NULL) { SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); } do { SeqfileGetLine(V); if (!feof(V->f) && strstr(V->buf, "AC ") == V->buf) { if ((sptr = strtok(V->buf+5, "; \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); } else if (!feof(V->f) && strstr(V->buf, "DE ") == V->buf) { if ((sptr = strtok(V->buf+5, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); } } while (! feof(V->f) && strncmp(V->buf,"SQ",2) != 0); readLoop(0, endEMBL, V); /* Hack for Staden experiment files: convert - to N */ if (V->ssimode == -1) /* if we're in ssi mode, we're not keeping the seq */ for (sptr = V->seq; *sptr != '\0'; sptr++) if (*sptr == '-') *sptr = 'N'; /* reading a real EMBL database file, we keep the source coords */ V->sqinfo->start = 1; V->sqinfo->stop = V->seqlen; V->sqinfo->olen = V->seqlen; V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; /* load next record's ID line */ while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) { SeqfileGetLine(V); } }static intendZuker(char *s, int *addend){ *addend = 0; return( *s == '(' );}static voidreadZuker(struct ReadSeqVars *V){ char *sptr; SeqfileGetLine(V); /*s == "seqLen seqid string..."*/ if ((sptr = strtok(V->buf+6, " \t\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); if ((sptr = strtok(NULL, "\n")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); readLoop(0, endZuker, V);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -