⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sqio.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 4 页
字号:
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. *****************************************************************//* File: sqio.c * From: ureadseq.c in Don Gilbert's sequence i/o package * * Reads and writes nucleic/protein sequence in various * formats. Data files may have multiple sequences. * * Heavily modified from READSEQ package * Copyright (C) 1990 by D.G. Gilbert * Biology Dept., Indiana University, Bloomington, IN 47405 * email: gilbertd@bio.indiana.edu * Thanks Don! * * SRE: Modifications as noted. Fri Jul  3 09:44:54 1992 *      Packaged for squid, Thu Oct  1 10:07:11 1992 *      ANSI conversion in full swing, Mon Jul 12 12:22:21 1993 * * CVS $Id: sqio.c,v 1.25 2001/06/21 23:43:49 eddy Exp $ *  ***************************************************************** * Basic API for single sequence reading: *  * SQFILE *sqfp; * char   *seqfile; * int     format;        - see squid.h for formats; example: SQFILE_FASTA * char   *seq; * SQINFO *sqinfo; *  * if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL) *   Die("Failed to open sequence database file %s\n%s\n", seqfile, usage); * while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) { *   do_stuff; *   FreeSequence(seq, &sqinfo); * } * SeqfileClose(sqfp);   *  *****************************************************************   */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#ifndef SEEK_SET#include <unistd.h>	#endif#include "squid.h"#include "msa.h"#include "ssi.h"static void SeqfileGetLine(SQFILE *V);#define kStartLength  500static char *aminos      = "ABCDEFGHIKLMNPQRSTVWXYZ*";static char *primenuc    = "ACGTUN";static char *protonly    = "EFIPQZ";static SQFILE *seqfile_open(char *filename, int format, char *env, int ssimode);/* Function: SeqfileOpen() *  * Purpose : Open a sequence database file and prepare for reading *           sequentially. *            * Args:     filename - name of file to open *           format   - format of file *           env      - environment variable for path (e.g. BLASTDB)    *           ssimode  - -1, SSI_OFFSET_I32, or SSI_OFFSET_I64 * *           Returns opened SQFILE ptr, or NULL on failure. */SQFILE *SeqfileOpen(char *filename, int format, char *env){  return seqfile_open(filename, format, env, -1);}SQFILE *SeqfileOpenForIndexing(char *filename, int format, char *env, int ssimode){  return seqfile_open(filename, format, env, ssimode);}static SQFILE *seqfile_open(char *filename, int format, char *env, int ssimode){  SQFILE *dbfp;  dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE));  dbfp->ssimode  = ssimode;  dbfp->rpl      = -1;		/* flag meaning "unset" */  dbfp->lastrpl  = 0;  dbfp->maxrpl   = 0;  dbfp->bpl      = -1;		/* flag meaning "unset" */  dbfp->lastbpl  = 0;  dbfp->maxbpl   = 0;  /* Open our file handle.   * Three possibilities:   *    1. normal file open   *    2. filename = "-";    read from stdin   *    3. filename = "*.gz"; read thru pipe from gzip    * If we're reading from stdin or a pipe, we can't reliably   * back up, so we can't do two-pass parsers like the interleaved alignment      * formats.   */  if (strcmp(filename, "-") == 0)    {      dbfp->f         = stdin;      dbfp->do_stdin  = TRUE;       dbfp->do_gzip   = FALSE;      dbfp->fname     = sre_strdup("[STDIN]", -1);    }#ifndef SRE_STRICT_ANSI  /* popen(), pclose() aren't portable to non-POSIX systems; disable */  else if (Strparse("^.*\\.gz$", filename, 0))    {      char cmd[256];      /* Note that popen() will return "successfully"       * if file doesn't exist, because gzip works fine       * and prints an error! So we have to check for       * existence of file ourself.       */      if (! FileExists(filename))	Die("%s: file does not exist", filename);      if (strlen(filename) + strlen("gzip -dc ") >= 256)	Die("filename > 255 char in SeqfileOpen()");       sprintf(cmd, "gzip -dc %s", filename);      if ((dbfp->f = popen(cmd, "r")) == NULL)	return NULL;      dbfp->do_stdin = FALSE;      dbfp->do_gzip  = TRUE;      dbfp->fname    = sre_strdup(filename, -1);    }#endif /*SRE_STRICT_ANSI*/  else    {      if ((dbfp->f = fopen(filename, "r")) == NULL &&	  (dbfp->f = EnvFileOpen(filename, env, NULL)) == NULL)	return NULL;      dbfp->do_stdin = FALSE;      dbfp->do_gzip  = FALSE;      dbfp->fname    = sre_strdup(filename, -1);    }    /* Invoke autodetection if we haven't already been told what   * to expect.   */  if (format == SQFILE_UNKNOWN)    {      if (dbfp->do_stdin == TRUE || dbfp->do_gzip)	Die("Can't autodetect sequence file format from a stdin or gzip pipe");      format = SeqfileFormat(dbfp->f);      if (format == SQFILE_UNKNOWN)	Die("Can't determine format of sequence file %s", dbfp->fname);    }  /* The hack for sequential access of an interleaved alignment file:   * read the alignment in, we'll copy sequences out one at a time.   */  dbfp->msa        = NULL;  dbfp->afp        = NULL;  dbfp->format     = format;  dbfp->linenumber = 0;  dbfp->buf        = NULL;  dbfp->buflen     = 0;  if (IsAlignmentFormat(format))	    {      /* We'll be reading from the MSA interface. Copy our data       * to the MSA afp's structure.       */      dbfp->afp           = MallocOrDie(sizeof(MSAFILE));      dbfp->afp->f        = dbfp->f;            /* just a ptr, don't close */      dbfp->afp->do_stdin = dbfp->do_stdin;      dbfp->afp->do_gzip  = dbfp->do_gzip;      dbfp->afp->fname    = dbfp->fname;        /* just a ptr, don't free */      dbfp->afp->format   = dbfp->format;       /* e.g. format */      dbfp->afp->linenumber = dbfp->linenumber;	/* e.g. 0 */      dbfp->afp->buf      = NULL;      dbfp->afp->buflen   = 0;      if ((dbfp->msa = MSAFileRead(dbfp->afp)) == NULL)	Die("Failed to read any alignment data from file %s", dbfp->fname);				/* hack: overload/reuse msa->lastidx; indicates				   next seq to return upon a ReadSeq() call */      dbfp->msa->lastidx = 0;      return dbfp;    }  /* Load the first line.   */  SeqfileGetLine(dbfp);   return dbfp;}/* Function: SeqfilePosition() *  * Purpose:  Move to a particular offset in a seqfile. *           Will not work on alignment files. */voidSeqfilePosition(SQFILE *sqfp, SSIOFFSET *offset){  if (sqfp->do_stdin || sqfp->do_gzip || IsAlignmentFormat(sqfp->format))    Die("SeqfilePosition() failed: in a nonrewindable data file or stream");  if (SSISetFilePosition(sqfp->f, offset) != 0)    Die("SSISetFilePosition failed, but that shouldn't happen.");  SeqfileGetLine(sqfp);}/* Function: SeqfileRewind() *  * Purpose:  Set a sequence file back to the first sequence. *  *           Won't work on alignment files. Although it would *           seem that it could (just set msa->lastidx back to 0), *           that'll fail on "multiple multiple" alignment file formats *           (e.g. Stockholm). */voidSeqfileRewind(SQFILE *sqfp){  if (sqfp->do_stdin || sqfp->do_gzip)    Die("SeqfileRewind() failed: in a nonrewindable data file or stream");  rewind(sqfp->f);  SeqfileGetLine(sqfp);}/* Function: SeqfileLineParameters() * Date:     SRE, Thu Feb 15 17:00:41 2001 [St. Louis] * * Purpose:  After all the sequences have been read from the file, *           but before closing it, retrieve overall bytes-per-line and *           residues-per-line info. If non-zero, these mean that *           the file contains homogeneous sequence line lengths (except *           the last line in each record).   *            *           If either of bpl or rpl is determined to be inhomogeneous, *           both are returned as 0. * * Args:     *sqfp   - an open but fully read sequence file *           ret_bpl - RETURN: bytes per line, or 0 if inhomogeneous *           ret_rpl - RETURN: residues per line, or 0 if inhomogenous. * * Returns:  void */voidSeqfileLineParameters(SQFILE *V, int *ret_bpl, int *ret_rpl){  if (V->rpl > 0 && V->maxrpl == V->rpl &&      V->bpl > 0 && V->maxbpl == V->bpl) {    *ret_bpl = V->bpl;    *ret_rpl = V->rpl;  } else {    *ret_bpl = 0;    *ret_rpl = 0;  }}voidSeqfileClose(SQFILE *sqfp){  /* note: don't test for sqfp->msa being NULL. Now that   * we're holding afp open and allowing access to multi-MSA   * databases (e.g. Stockholm format, Pfam), msa ends   * up being NULL when we run out of alignments.   */  if (sqfp->afp != NULL) {    if (sqfp->msa      != NULL) MSAFree(sqfp->msa);    if (sqfp->afp->buf != NULL) free(sqfp->afp->buf);    free(sqfp->afp);  }#ifndef SRE_STRICT_ANSI	/* gunzip functionality only on POSIX systems */  if (sqfp->do_gzip)         pclose(sqfp->f);#endif    else if (! sqfp->do_stdin) fclose(sqfp->f);  if (sqfp->buf   != NULL) free(sqfp->buf);  if (sqfp->fname != NULL) free(sqfp->fname);  free(sqfp);}/* Function: SeqfileGetLine() * Date:     SRE, Tue Jun 22 09:15:49 1999 [Sanger Centre] * * Purpose:  read a line from a sequence file into V->buf *           If the fgets() is NULL, sets V->buf[0] to '\0'. * * Args:     V * * Returns:  void */static void SeqfileGetLine(SQFILE *V){  if (V->ssimode >= 0)     if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->ssioffset)))      Die("SSIGetFilePosition() failed");  if (sre_fgets(&(V->buf), &(V->buflen), V->f) == NULL)    *(V->buf) = '\0';  V->linenumber++;}voidFreeSequence(char *seq, SQINFO *sqinfo){  if (seq != NULL) free(seq);  if (sqinfo->flags & SQINFO_SS)   free(sqinfo->ss);  if (sqinfo->flags & SQINFO_SA)   free(sqinfo->sa);}intSetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag){  int len;  int pos;				/* silently ignore NULL. */  if (sptr == NULL) return 1;  while (*sptr == ' ') sptr++; /* ignore leading whitespace */  for (pos = strlen(sptr)-1; pos >= 0; pos--)    if (! isspace((int) sptr[pos])) break;  sptr[pos+1] = '\0';	       /* ignore trailing whitespace */  switch (flag) {  case SQINFO_NAME:    if (*sptr != '-')      { 	strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1);	sqinfo->name[SQINFO_NAMELEN-1] = '\0';	sqinfo->flags   |= SQINFO_NAME;      }    break;  case SQINFO_ID:    if (*sptr != '-')      { 	strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1);	sqinfo->id[SQINFO_NAMELEN-1] = '\0';	sqinfo->flags |= SQINFO_ID;      }    break;  case SQINFO_ACC:    if (*sptr != '-')      { 	strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1);	sqinfo->acc[SQINFO_NAMELEN-1] = '\0';	sqinfo->flags   |= SQINFO_ACC;      }    break;  case SQINFO_DESC:    if (*sptr != '-')      { 	if (sqinfo->flags & SQINFO_DESC) /* append? */	  {	    len = strlen(sqinfo->desc);	    if (len < SQINFO_DESCLEN-2)	/* is there room? */	      {		strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++;		strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len);	      }	  }	else			/* else copy */	  strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1);	sqinfo->desc[SQINFO_DESCLEN-1] = '\0';	sqinfo->flags   |= SQINFO_DESC;      }    break;  case SQINFO_START:    if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }    sqinfo->start = atoi(sptr);    if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START;    break;  case SQINFO_STOP:    if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }    sqinfo->stop = atoi(sptr);    if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP;    break;  case SQINFO_OLEN:    if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }    sqinfo->olen = atoi(sptr);    if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN;    break;  default:    Die("Invalid flag %d to SetSeqinfoString()", flag);  }  return 1;}voidSeqinfoCopy(SQINFO *sq1, SQINFO *sq2){  sq1->flags = sq2->flags;  if (sq2->flags & SQINFO_NAME)  strcpy(sq1->name, sq2->name);  if (sq2->flags & SQINFO_ID)    strcpy(sq1->id,   sq2->id);  if (sq2->flags & SQINFO_ACC)   strcpy(sq1->acc,  sq2->acc);  if (sq2->flags & SQINFO_DESC)  strcpy(sq1->desc, sq2->desc);  if (sq2->flags & SQINFO_LEN)   sq1->len    = sq2->len;  if (sq2->flags & SQINFO_START) sq1->start  = sq2->start;  if (sq2->flags & SQINFO_STOP)  sq1->stop   = sq2->stop;  if (sq2->flags & SQINFO_OLEN)  sq1->olen   = sq2->olen;  if (sq2->flags & SQINFO_TYPE)  sq1->type   = sq2->type;  if (sq2->flags & SQINFO_SS)    sq1->ss     = Strdup(sq2->ss);  if (sq2->flags & SQINFO_SA)    sq1->sa     = Strdup(sq2->sa);}/* Function: ToDNA() *  * Purpose:  Convert a sequence to DNA. *           U --> T */voidToDNA(char *seq){  for (; *seq != '\0'; seq++)    {      if      (*seq == 'U') *seq = 'T';      else if (*seq == 'u') *seq = 't';    }}/* Function: ToRNA() *  * Purpose:  Convert a sequence to RNA. *           T --> U */voidToRNA(char *seq){  for (; *seq != '\0'; seq++)    {      if      (*seq == 'T') *seq = 'U';      else if (*seq == 't') *seq = 'u';    }}/* Function: ToIUPAC() *  * Purpose:  Convert X's, o's, other junk in a nucleic acid sequence to N's, *           to comply with IUPAC code. Does allow gap characters *           though, so we can call ToIUPAC() on aligned seqs. * *           WU-BLAST's pressdb will *           choke on X's, for instance, necessitating conversion *           of certain genome centers' data.  */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -