📄 sqio.c
字号:
while (!(feof(V->f) | ((*V->buf != '\0') & (*V->buf == '(')))) SeqfileGetLine(V);}static void readUWGCG(struct ReadSeqVars *V){ char *si; char *sptr; int done; V->seqlen = 0; /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ /*drop above or ".." from id*/ if ((si = strstr(V->buf," Length: ")) != NULL) *si = 0; else if ((si = strstr(V->buf,"..")) != NULL) *si = 0; if ((sptr = strtok(V->buf, "\n\t ")) != NULL) SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); do { done = feof(V->f); SeqfileGetLine(V); if (! done) addseq(V->buf, V); } while (!done);} /* Function: ReadSeq() * * Purpose: Read next sequence from an open database file. * Return the sequence and associated info. * * Args: fp - open sequence database file pointer * format - format of the file (previously determined * by call to SeqfileFormat()). * Currently unused, since we carry it in V. * ret_seq - RETURN: sequence * sqinfo - RETURN: filled in w/ other information * * Limitations: uses squid_errno, so it's not threadsafe. * * Return: 1 on success, 0 on failure. * ret_seq and some field of sqinfo are allocated here, * The preferred call mechanism to properly free the memory is: * * SQINFO sqinfo; * char *seq; * * ReadSeq(fp, format, &seq, &sqinfo); * ... do something... * FreeSequence(seq, &sqinfo); */intReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo){ int gotuw; squid_errno = SQERR_OK; /* Here's the hack for sequential access of sequences from * the multiple sequence alignment formats */ if (IsAlignmentFormat(V->format)) { if (V->msa->lastidx >= V->msa->nseq) { /* out of data. try to read another alignment */ MSAFree(V->msa); if ((V->msa = MSAFileRead(V->afp)) == NULL) return 0; V->msa->lastidx = 0; } /* copy and dealign the appropriate aligned seq */ MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, V->msa->aseq[V->msa->lastidx], &(V->seq)); V->seqlen = strlen(V->seq); /* Extract sqinfo stuff for this sequence from the msa. * Tedious; code that should be cleaned. */ sqinfo->flags = 0; if (V->msa->sqname[V->msa->lastidx] != NULL) SetSeqinfoString(sqinfo, V->msa->sqname[V->msa->lastidx], SQINFO_NAME); if (V->msa->sqacc != NULL && V->msa->sqacc[V->msa->lastidx] != NULL) SetSeqinfoString(sqinfo, V->msa->sqacc[V->msa->lastidx], SQINFO_ACC); if (V->msa->sqdesc != NULL && V->msa->sqdesc[V->msa->lastidx] != NULL) SetSeqinfoString(sqinfo, V->msa->sqdesc[V->msa->lastidx], SQINFO_DESC); if (V->msa->ss != NULL && V->msa->ss[V->msa->lastidx] != NULL) { MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, V->msa->ss[V->msa->lastidx], &(sqinfo->ss)); sqinfo->flags |= SQINFO_SS; } if (V->msa->sa != NULL && V->msa->sa[V->msa->lastidx] != NULL) { MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, V->msa->sa[V->msa->lastidx], &(sqinfo->sa)); sqinfo->flags |= SQINFO_SA; } V->msa->lastidx++; } else { if (feof(V->f)) return 0; if (V->ssimode == -1) { /* normal mode */ V->seq = (char*) calloc (kStartLength+1, sizeof(char)); V->maxseq = kStartLength; } else { /* index mode: discarding seq */ V->seq = NULL; V->maxseq = 0; } V->seqlen = 0; V->sqinfo = sqinfo; V->sqinfo->flags = 0; switch (V->format) { case SQFILE_IG : readIG(V); break; case SQFILE_STRIDER : readStrider(V); break; case SQFILE_GENBANK : readGenBank(V); break; case SQFILE_FASTA : readPearson(V); break; case SQFILE_EMBL : readEMBL(V); break; case SQFILE_ZUKER : readZuker(V); break; case SQFILE_PIR : readPIR(V); break; case SQFILE_GCGDATA : readGCGdata(V); break; case SQFILE_GCG : do { /* skip leading comments on GCG file */ gotuw = (strstr(V->buf,"..") != NULL); if (gotuw) readUWGCG(V); SeqfileGetLine(V); } while (! feof(V->f)); break; case SQFILE_IDRAW: /* SRE: no attempt to read idraw postscript */ default: squid_errno = SQERR_FORMAT; free(V->seq); return 0; } if (V->seq != NULL) /* (it can be NULL in indexing mode) */ V->seq[V->seqlen] = 0; /* stick a string terminator on it */ } /* Cleanup */ sqinfo->len = V->seqlen; sqinfo->flags |= SQINFO_LEN; *ret_seq = V->seq; if (squid_errno == SQERR_OK) return 1; else return 0;} /* Function: SeqfileFormat() * Date: SRE, Tue Jun 22 10:58:58 1999 [Sanger Centre] * * Purpose: Determine format of an open file. * Returns format code. * Rewinds the file. * * Autodetects the following unaligned formats: * SQFILE_FASTA * SQFILE_GENBANK * SQFILE_EMBL * SQFILE_GCG * SQFILE_GCGDATA * SQFILE_PIR * Also autodetects the following alignment formats: * MSAFILE_STOCKHOLM * MSAFILE_MSF * MSAFILE_CLUSTAL * MSAFILE_SELEX * MSAFILE_PHYLIP * * Can't autodetect MSAFILE_A2M, calls it SQFILE_FASTA. * MSAFileFormat() does the opposite. * * Args: sfp - open SQFILE * * Return: format code, or SQFILE_UNKNOWN if unrecognized */ intSeqfileFormat(FILE *fp){ char *buf; int len; int fmt = SQFILE_UNKNOWN; int ndataline; char *bufcpy, *s, *s1, *s2; int has_junk; buf = NULL; len = 0; ndataline = 0; has_junk = FALSE; while (sre_fgets(&buf, &len, fp) != NULL) { if (IsBlankline(buf)) continue; /* Well-behaved formats identify themselves in first nonblank line. */ if (ndataline == 0) { if (strncmp(buf, ">>>>", 4) == 0 && strstr(buf, "Len: ")) { fmt = SQFILE_GCGDATA; goto DONE; } if (buf[0] == '>') { fmt = SQFILE_FASTA; goto DONE; } if (strncmp(buf, "!!AA_SEQUENCE", 13) == 0 || strncmp(buf, "!!NA_SEQUENCE", 13) == 0) { fmt = SQFILE_GCG; goto DONE; } if (strncmp(buf, "# STOCKHOLM 1.", 14) == 0) { fmt = MSAFILE_STOCKHOLM; goto DONE; } if (strncmp(buf, "CLUSTAL", 7) == 0 && strstr(buf, "multiple sequence alignment") != NULL) { fmt = MSAFILE_CLUSTAL; goto DONE; } if (strncmp(buf, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0 || strncmp(buf, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) { fmt = MSAFILE_MSF; goto DONE; } /* PHYLIP id: also just a good bet */ bufcpy = sre_strdup(buf, -1); s = bufcpy; if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) != NULL && (s2 = sre_strtok(&s, WHITESPACE, NULL)) != NULL && IsInt(s1) && IsInt(s2)) { free(bufcpy); fmt = MSAFILE_PHYLIP; goto DONE; } free(bufcpy); } /* We trust that other formats identify themselves soon. */ /* dead giveaways for extended SELEX */ if (strncmp(buf, "#=AU", 4) == 0 || strncmp(buf, "#=ID", 4) == 0 || strncmp(buf, "#=AC", 4) == 0 || strncmp(buf, "#=DE", 4) == 0 || strncmp(buf, "#=GA", 4) == 0 || strncmp(buf, "#=TC", 4) == 0 || strncmp(buf, "#=NC", 4) == 0 || strncmp(buf, "#=SQ", 4) == 0 || strncmp(buf, "#=SS", 4) == 0 || strncmp(buf, "#=CS", 4) == 0 || strncmp(buf, "#=RF", 4) == 0) { fmt = MSAFILE_SELEX; goto DONE; } if (strncmp(buf, "///", 3) == 0 || strncmp(buf, "ENTRY ", 6) == 0) { fmt = SQFILE_PIR; goto DONE; } /* a ha, diagnostic of an (old) MSF file */ if ((strstr(buf, "..") != NULL) && (strstr(buf, "MSF:") != NULL) && (strstr(buf, "Check:")!= NULL)) { fmt = MSAFILE_MSF; goto DONE; } /* unaligned GCG (must follow MSF test!) */ if (strstr(buf, " Check: ") != NULL && strstr(buf, "..") != NULL) { fmt = SQFILE_GCG; goto DONE; } if (strncmp(buf,"LOCUS ",6) == 0 || strncmp(buf,"ORIGIN ",6) == 0) { fmt = SQFILE_GENBANK; goto DONE; } if (strncmp(buf,"ID ",5) == 0 || strncmp(buf,"SQ ",5) == 0) { fmt = SQFILE_EMBL; goto DONE; } /* But past here, we're being desperate. A simple SELEX file is * very difficult to detect; we can only try to disprove it. */ s = buf; if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; /* skip blank lines */ if (strchr("#%", *s1) != NULL) continue; /* skip comment lines */ /* Disproof 1. Noncomment, nonblank lines in a SELEX file * must have at least two space-delimited fields (name/seq) */ if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) has_junk = TRUE; /* Disproof 2. * The sequence field should look like a sequence. */ if (s2 != NULL && Seqtype(s2) == kOtherSeq) has_junk = TRUE; ndataline++; if (ndataline == 300) break; /* only look at first 300 lines */ } if (ndataline == 0) Die("Sequence file contains no data"); /* If we've made it this far, we've run out of data, but there * was at least one line of it; check if we've * disproven SELEX. If not, cross our fingers, pray, and guess SELEX. */ if (has_junk == TRUE) fmt = SQFILE_UNKNOWN; else fmt = MSAFILE_SELEX; DONE: if (buf != NULL) free(buf); rewind(fp); return fmt;}/* Function: GCGBinaryToSequence() * * Purpose: Convert a GCG 2BIT binary string to DNA sequence. * 0 = C 1 = T 2 = A 3 = G * 4 nts/byte * * Args: seq - binary sequence. Converted in place to DNA. * len - length of DNA. binary is (len+3)/4 bytes */intGCGBinaryToSequence(char *seq, int len){ int bpos; /* position in binary */ int spos; /* position in sequence */ char twobit; int i; for (bpos = (len-1)/4; bpos >= 0; bpos--) { twobit = seq[bpos]; spos = bpos*4; for (i = 3; i >= 0; i--) { switch (twobit & 0x3) { case 0: seq[spos+i] = 'C'; break; case 1: seq[spos+i] = 'T'; break; case 2: seq[spos+i] = 'A'; break; case 3: seq[spos+i] = 'G'; break; } twobit = twobit >> 2; } } seq[len] = '\0'; return 1;}/* Function: GCGchecksum() * Date: SRE, Mon May 31 11:13:21 1999 [St. Louis] * * Purpose: Calculate a GCG checksum for a sequence. * Code provided by Steve Smith of Genetics * Computer Group. * * Args: seq - sequence to calculate checksum for. * may contain gap symbols. * len - length of sequence (usually known, * so save a strlen() call) * * Returns: GCG checksum. */intGCGchecksum(char *seq, int len){ int i; /* position in sequence */ int chk = 0; /* calculated checksum */ for (i = 0; i < len; i++) chk = (chk + (i % 57 + 1) * (sre_toupper((int) seq[i]))) % 10000; return chk;}/* Function: GCGMultchecksum() * * Purpose: GCG checksum for a multiple alignment: sum of * individual sequence checksums (including their * gap characters) modulo 10000. * * Implemented using spec provided by Steve Smith of * Genetics Computer Group. * * Args: seqs - sequences to be checksummed; aligned or not * nseq - number of sequences * * Return: the checksum, a number between 0 and 9999 */ intGCGMultchecksum(char **seqs, int nseq){ int chk = 0; int idx; for (idx = 0; idx < nseq; idx++) chk = (chk + GCGchecksum(seqs[idx], strlen(seqs[idx]))) % 10000; return chk;}/* Function: Seqtype() * * Purpose: Returns a (very good) guess about type of sequence: * kDNA, kRNA, kAmino, or kOtherSeq. * * Modified from, and replaces, Gilbert getseqtype(). */intSeqtype(char *seq){ int saw; /* how many non-gap characters I saw */ char c; int po = 0; /* count of protein-only */ int nt = 0; /* count of t's */ int nu = 0; /* count of u's */ int na = 0; /* count of nucleotides */ int aa = 0; /* count of amino acids */ int no = 0; /* count of others */ /* Look at the first 300 non-gap characters */ for (saw = 0; *seq != '\0' && saw < 300; seq++) { c = sre_toupper((int) *seq); if (! isgap(c)) { if (strchr(protonly, c)) po++; else if (strchr(primenuc,c)) { na++; if (c == 'T') nt++; else if (c == 'U') nu++; } else if (strchr(aminos,c)) aa++; else if (isalpha((int) c)) no++; saw++; } } if (no > 0) return kOtherSeq; else if (po > 0) return kAmino; else if (na > aa) { if (nu > nt) return kRNA; else return kDNA; } else return kAmino; /* ooooh. risky. */}/* Function: GuessAlignmentSeqtype() * Date: SRE, Wed Jul 7 09:42:34 1999 [St. Louis] * * Purpose: Try to guess whether an alignment is protein * or nucleic acid; return a code for the * type (kRNA, kDNA, or kAmino). * * Args: aseq - array of aligned sequences. (Could also * be an rseq unaligned sequence array) * nseq - number of aseqs * * Returns: kRNA, kDNA, kAmino; * kOtherSeq if inconsistency is detected. */intGuessAlignmentSeqtype(char **aseq, int nseq){ int idx; int nrna = 0; int ndna = 0; int namino = 0; int nother = 0; for (idx = 0; idx < nseq; idx++) switch (Seqtype(aseq[idx])) { case kRNA: nrna++; break; case kDNA: ndna++; break; case kAmino: namino++; break; default: nother++;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -