📄 sqio.c
字号:
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. *****************************************************************//* File: sqio.c * From: ureadseq.c in Don Gilbert's sequence i/o package * * Reads and writes nucleic/protein sequence in various * formats. Data files may have multiple sequences. * * Heavily modified from READSEQ package * Copyright (C) 1990 by D.G. Gilbert * Biology Dept., Indiana University, Bloomington, IN 47405 * email: gilbertd@bio.indiana.edu * Thanks Don! * * SRE: Modifications as noted. Fri Jul 3 09:44:54 1992 * Packaged for squid, Thu Oct 1 10:07:11 1992 * ANSI conversion in full swing, Mon Jul 12 12:22:21 1993 * * CVS $Id: sqio.c,v 1.25 2001/06/21 23:43:49 eddy Exp $ * ***************************************************************** * Basic API for single sequence reading: * * SQFILE *sqfp; * char *seqfile; * int format; - see squid.h for formats; example: SQFILE_FASTA * char *seq; * SQINFO *sqinfo; * * if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL) * Die("Failed to open sequence database file %s\n%s\n", seqfile, usage); * while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) { * do_stuff; * FreeSequence(seq, &sqinfo); * } * SeqfileClose(sqfp); * ***************************************************************** */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#ifndef SEEK_SET#include <unistd.h> #endif#include "squid.h"#include "msa.h"#include "ssi.h"static void SeqfileGetLine(SQFILE *V);#define kStartLength 500static char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*";static char *primenuc = "ACGTUN";static char *protonly = "EFIPQZ";static SQFILE *seqfile_open(char *filename, int format, char *env, int ssimode);/* Function: SeqfileOpen() * * Purpose : Open a sequence database file and prepare for reading * sequentially. * * Args: filename - name of file to open * format - format of file * env - environment variable for path (e.g. BLASTDB) * ssimode - -1, SSI_OFFSET_I32, or SSI_OFFSET_I64 * * Returns opened SQFILE ptr, or NULL on failure. */SQFILE *SeqfileOpen(char *filename, int format, char *env){ return seqfile_open(filename, format, env, -1);}SQFILE *SeqfileOpenForIndexing(char *filename, int format, char *env, int ssimode){ return seqfile_open(filename, format, env, ssimode);}static SQFILE *seqfile_open(char *filename, int format, char *env, int ssimode){ SQFILE *dbfp; dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE)); dbfp->ssimode = ssimode; dbfp->rpl = -1; /* flag meaning "unset" */ dbfp->lastrpl = 0; dbfp->maxrpl = 0; dbfp->bpl = -1; /* flag meaning "unset" */ dbfp->lastbpl = 0; dbfp->maxbpl = 0; /* Open our file handle. * Three possibilities: * 1. normal file open * 2. filename = "-"; read from stdin * 3. filename = "*.gz"; read thru pipe from gzip * If we're reading from stdin or a pipe, we can't reliably * back up, so we can't do two-pass parsers like the interleaved alignment * formats. */ if (strcmp(filename, "-") == 0) { dbfp->f = stdin; dbfp->do_stdin = TRUE; dbfp->do_gzip = FALSE; dbfp->fname = sre_strdup("[STDIN]", -1); }#ifndef SRE_STRICT_ANSI /* popen(), pclose() aren't portable to non-POSIX systems; disable */ else if (Strparse("^.*\\.gz$", filename, 0)) { char cmd[256]; /* Note that popen() will return "successfully" * if file doesn't exist, because gzip works fine * and prints an error! So we have to check for * existence of file ourself. */ if (! FileExists(filename)) Die("%s: file does not exist", filename); if (strlen(filename) + strlen("gzip -dc ") >= 256) Die("filename > 255 char in SeqfileOpen()"); sprintf(cmd, "gzip -dc %s", filename); if ((dbfp->f = popen(cmd, "r")) == NULL) return NULL; dbfp->do_stdin = FALSE; dbfp->do_gzip = TRUE; dbfp->fname = sre_strdup(filename, -1); }#endif /*SRE_STRICT_ANSI*/ else { if ((dbfp->f = fopen(filename, "r")) == NULL && (dbfp->f = EnvFileOpen(filename, env, NULL)) == NULL) return NULL; dbfp->do_stdin = FALSE; dbfp->do_gzip = FALSE; dbfp->fname = sre_strdup(filename, -1); } /* Invoke autodetection if we haven't already been told what * to expect. */ if (format == SQFILE_UNKNOWN) { if (dbfp->do_stdin == TRUE || dbfp->do_gzip) Die("Can't autodetect sequence file format from a stdin or gzip pipe"); format = SeqfileFormat(dbfp->f); if (format == SQFILE_UNKNOWN) Die("Can't determine format of sequence file %s", dbfp->fname); } /* The hack for sequential access of an interleaved alignment file: * read the alignment in, we'll copy sequences out one at a time. */ dbfp->msa = NULL; dbfp->afp = NULL; dbfp->format = format; dbfp->linenumber = 0; dbfp->buf = NULL; dbfp->buflen = 0; if (IsAlignmentFormat(format)) { /* We'll be reading from the MSA interface. Copy our data * to the MSA afp's structure. */ dbfp->afp = MallocOrDie(sizeof(MSAFILE)); dbfp->afp->f = dbfp->f; /* just a ptr, don't close */ dbfp->afp->do_stdin = dbfp->do_stdin; dbfp->afp->do_gzip = dbfp->do_gzip; dbfp->afp->fname = dbfp->fname; /* just a ptr, don't free */ dbfp->afp->format = dbfp->format; /* e.g. format */ dbfp->afp->linenumber = dbfp->linenumber; /* e.g. 0 */ dbfp->afp->buf = NULL; dbfp->afp->buflen = 0; if ((dbfp->msa = MSAFileRead(dbfp->afp)) == NULL) Die("Failed to read any alignment data from file %s", dbfp->fname); /* hack: overload/reuse msa->lastidx; indicates next seq to return upon a ReadSeq() call */ dbfp->msa->lastidx = 0; return dbfp; } /* Load the first line. */ SeqfileGetLine(dbfp); return dbfp;}/* Function: SeqfilePosition() * * Purpose: Move to a particular offset in a seqfile. * Will not work on alignment files. */voidSeqfilePosition(SQFILE *sqfp, SSIOFFSET *offset){ if (sqfp->do_stdin || sqfp->do_gzip || IsAlignmentFormat(sqfp->format)) Die("SeqfilePosition() failed: in a nonrewindable data file or stream"); if (SSISetFilePosition(sqfp->f, offset) != 0) Die("SSISetFilePosition failed, but that shouldn't happen."); SeqfileGetLine(sqfp);}/* Function: SeqfileRewind() * * Purpose: Set a sequence file back to the first sequence. * * Won't work on alignment files. Although it would * seem that it could (just set msa->lastidx back to 0), * that'll fail on "multiple multiple" alignment file formats * (e.g. Stockholm). */voidSeqfileRewind(SQFILE *sqfp){ if (sqfp->do_stdin || sqfp->do_gzip) Die("SeqfileRewind() failed: in a nonrewindable data file or stream"); rewind(sqfp->f); SeqfileGetLine(sqfp);}/* Function: SeqfileLineParameters() * Date: SRE, Thu Feb 15 17:00:41 2001 [St. Louis] * * Purpose: After all the sequences have been read from the file, * but before closing it, retrieve overall bytes-per-line and * residues-per-line info. If non-zero, these mean that * the file contains homogeneous sequence line lengths (except * the last line in each record). * * If either of bpl or rpl is determined to be inhomogeneous, * both are returned as 0. * * Args: *sqfp - an open but fully read sequence file * ret_bpl - RETURN: bytes per line, or 0 if inhomogeneous * ret_rpl - RETURN: residues per line, or 0 if inhomogenous. * * Returns: void */voidSeqfileLineParameters(SQFILE *V, int *ret_bpl, int *ret_rpl){ if (V->rpl > 0 && V->maxrpl == V->rpl && V->bpl > 0 && V->maxbpl == V->bpl) { *ret_bpl = V->bpl; *ret_rpl = V->rpl; } else { *ret_bpl = 0; *ret_rpl = 0; }}voidSeqfileClose(SQFILE *sqfp){ /* note: don't test for sqfp->msa being NULL. Now that * we're holding afp open and allowing access to multi-MSA * databases (e.g. Stockholm format, Pfam), msa ends * up being NULL when we run out of alignments. */ if (sqfp->afp != NULL) { if (sqfp->msa != NULL) MSAFree(sqfp->msa); if (sqfp->afp->buf != NULL) free(sqfp->afp->buf); free(sqfp->afp); }#ifndef SRE_STRICT_ANSI /* gunzip functionality only on POSIX systems */ if (sqfp->do_gzip) pclose(sqfp->f);#endif else if (! sqfp->do_stdin) fclose(sqfp->f); if (sqfp->buf != NULL) free(sqfp->buf); if (sqfp->fname != NULL) free(sqfp->fname); free(sqfp);}/* Function: SeqfileGetLine() * Date: SRE, Tue Jun 22 09:15:49 1999 [Sanger Centre] * * Purpose: read a line from a sequence file into V->buf * If the fgets() is NULL, sets V->buf[0] to '\0'. * * Args: V * * Returns: void */static void SeqfileGetLine(SQFILE *V){ if (V->ssimode >= 0) if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->ssioffset))) Die("SSIGetFilePosition() failed"); if (sre_fgets(&(V->buf), &(V->buflen), V->f) == NULL) *(V->buf) = '\0'; V->linenumber++;}voidFreeSequence(char *seq, SQINFO *sqinfo){ if (seq != NULL) free(seq); if (sqinfo->flags & SQINFO_SS) free(sqinfo->ss); if (sqinfo->flags & SQINFO_SA) free(sqinfo->sa);}intSetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag){ int len; int pos; /* silently ignore NULL. */ if (sptr == NULL) return 1; while (*sptr == ' ') sptr++; /* ignore leading whitespace */ for (pos = strlen(sptr)-1; pos >= 0; pos--) if (! isspace((int) sptr[pos])) break; sptr[pos+1] = '\0'; /* ignore trailing whitespace */ switch (flag) { case SQINFO_NAME: if (*sptr != '-') { strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1); sqinfo->name[SQINFO_NAMELEN-1] = '\0'; sqinfo->flags |= SQINFO_NAME; } break; case SQINFO_ID: if (*sptr != '-') { strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1); sqinfo->id[SQINFO_NAMELEN-1] = '\0'; sqinfo->flags |= SQINFO_ID; } break; case SQINFO_ACC: if (*sptr != '-') { strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1); sqinfo->acc[SQINFO_NAMELEN-1] = '\0'; sqinfo->flags |= SQINFO_ACC; } break; case SQINFO_DESC: if (*sptr != '-') { if (sqinfo->flags & SQINFO_DESC) /* append? */ { len = strlen(sqinfo->desc); if (len < SQINFO_DESCLEN-2) /* is there room? */ { strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++; strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len); } } else /* else copy */ strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1); sqinfo->desc[SQINFO_DESCLEN-1] = '\0'; sqinfo->flags |= SQINFO_DESC; } break; case SQINFO_START: if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } sqinfo->start = atoi(sptr); if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START; break; case SQINFO_STOP: if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } sqinfo->stop = atoi(sptr); if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP; break; case SQINFO_OLEN: if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } sqinfo->olen = atoi(sptr); if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN; break; default: Die("Invalid flag %d to SetSeqinfoString()", flag); } return 1;}voidSeqinfoCopy(SQINFO *sq1, SQINFO *sq2){ sq1->flags = sq2->flags; if (sq2->flags & SQINFO_NAME) strcpy(sq1->name, sq2->name); if (sq2->flags & SQINFO_ID) strcpy(sq1->id, sq2->id); if (sq2->flags & SQINFO_ACC) strcpy(sq1->acc, sq2->acc); if (sq2->flags & SQINFO_DESC) strcpy(sq1->desc, sq2->desc); if (sq2->flags & SQINFO_LEN) sq1->len = sq2->len; if (sq2->flags & SQINFO_START) sq1->start = sq2->start; if (sq2->flags & SQINFO_STOP) sq1->stop = sq2->stop; if (sq2->flags & SQINFO_OLEN) sq1->olen = sq2->olen; if (sq2->flags & SQINFO_TYPE) sq1->type = sq2->type; if (sq2->flags & SQINFO_SS) sq1->ss = Strdup(sq2->ss); if (sq2->flags & SQINFO_SA) sq1->sa = Strdup(sq2->sa);}/* Function: ToDNA() * * Purpose: Convert a sequence to DNA. * U --> T */voidToDNA(char *seq){ for (; *seq != '\0'; seq++) { if (*seq == 'U') *seq = 'T'; else if (*seq == 'u') *seq = 't'; }}/* Function: ToRNA() * * Purpose: Convert a sequence to RNA. * T --> U */voidToRNA(char *seq){ for (; *seq != '\0'; seq++) { if (*seq == 'T') *seq = 'U'; else if (*seq == 't') *seq = 'u'; }}/* Function: ToIUPAC() * * Purpose: Convert X's, o's, other junk in a nucleic acid sequence to N's, * to comply with IUPAC code. Does allow gap characters * though, so we can call ToIUPAC() on aligned seqs. * * WU-BLAST's pressdb will * choke on X's, for instance, necessitating conversion * of certain genome centers' data. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -