📄 selex.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. *****************************************************************//* selex.c  *  * SRE, Mon Jun 14 11:08:38 1999 * SELEX  obsolete as the preferred HMMER/SQUID format * replaced by Stockholm format * selex support retained for backwards compatibility * kludged to use the MSA interface *  * SRE, Mon Jan 30 14:41:49 1995: * #=SA side chain % surface accessibility annotation supported *  * SRE, Tue Nov  9 17:40:50 1993:  * major revision. #= special comments and aliinfo_s optional * alignment info support added. Support for #=CS (consensus * secondary structure), #=SS (individual secondary structure), * #=RF (reference coordinate system), #=SQ (per-sequence header info), * and #=AU ("author") added. * * Fri Dec  4 17:43:24 1992, SRE: * Reading and writing aligned sequences to/from disk files. * Implements a new, broader specification of SELEX format * and supercedes alignio.c. * * SELEX format is documented in Docs/formats.tex. **************************************************************************** * RCS $Id: selex.c,v 1.10 2001/08/04 20:15:42 eddy Exp $ */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include <memory.h>#include "squid.h"#include "msa.h"static int  copy_alignment_line(char *aseq, int apos, int name_rcol, 				char *buffer, int lcol, int rcol, char gapsym);static void actually_write_selex(FILE *fp, MSA *msa, int cpl);static char commentsyms[] = "%#";/* Function: ReadSELEX() * Date:     SRE, Sun Jun  6 18:24:09 1999 [St. Louis] * * Purpose:  Parse an alignment read from an open SELEX format *           alignment file. (SELEX is a single alignment format). *           Return the alignment, or NULL if we've already read the *           alignment or there's no alignment data in the file. *            * Limitations: SELEX is the only remaining multipass parser for *           alignment files. It cannot read from gzip or from stdin. *           It Die()'s here if you try. The reason for this *           that SELEX allows space characters as gaps, so we don't *           know the borders of an alignment block until we've seen *           the whole block. I could rewrite to allow single-pass *           parsing (by storing the whole block in memory) but *           since SELEX is now legacy, why bother. *            *           Note that the interface is totally kludged: fastest *           possible adaptation of old ReadSELEX() to the new *           MSA interface.   * * Args:     afp  - open alignment file * * Returns:  MSA *  - an alignment object *                    caller responsible for an MSAFree() *           NULL if no alignment data.           */MSA *ReadSELEX(MSAFILE *afp){  MSA     *msa;                 /* RETURN: mult seq alignment   */  FILE    *fp;                  /* ptr to opened seqfile        */  char   **aseqs;               /* aligned seqs                 */  int      num = 0;		/* number of seqs read          */  char     buffer[LINEBUFLEN];	/* input buffer for lines       */  char     bufcpy[LINEBUFLEN];	/* strtok'able copy of buffer   */  struct block_struc {          /** alignment data for a block: */    int lcol;			/* furthest left aligned sym    */    int rcol;			/* furthest right aligned sym   */  } *blocks = NULL;  int      blocknum;		/* number of blocks in file     */  char    *nptr;                /* ptr to start of name on line */  char    *sptr;                /* ptr into sequence on line    */  int      currnum;		/* num. seqs in given block     */  int      currblock;		/* index for blocks             */  int      i;			/* loop counter                 */  int      seqidx;		/* counter for seqs             */  int      alen;                /* length of alignment          */  int      warn_names;          /* becomes TRUE if names don't match between blocks */  int      headnum;		/* seqidx in per-sequence header info */  int      currlen;  int      count;  int      have_cs = 0;  int      have_rf = 0;  AINFO    base_ainfo, *ainfo;	/* hack: used to be passed ptr to AINFO */  /* Convert from MSA interface to what old ReadSELEX() did:   *     - copy our open fp, rather than opening file   *     - verify that we're not reading a gzip or stdin   */  if (feof(afp->f)) return NULL;  if (afp->do_gzip || afp->do_stdin)    Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file");   fp    = afp->f;  ainfo = &base_ainfo;  /***************************************************   * First pass across file.    * Count seqs, get names, determine column info   * Determine what sorts of info are active in this file.   ***************************************************/  InitAinfo(ainfo);				/* get first line of the block 				 * (non-comment, non-blank) */  do    {      if (fgets(buffer, LINEBUFLEN, fp) == NULL)	{ squid_errno = SQERR_NODATA; return 0; }      strcpy(bufcpy, buffer);      if (*buffer == '#')	{	  if      (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;	  else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;	}    }  while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 	 (strchr(commentsyms, *nptr) != NULL));  blocknum   = 0;  warn_names = FALSE;  while (!feof(fp))    {				/* allocate for info about this block. */      if (blocknum == 0)	blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));      else 	blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));      blocks[blocknum].lcol = LINEBUFLEN+1;      blocks[blocknum].rcol = -1;	      currnum = 0;      while (nptr != NULL)	/* becomes NULL when this block ends. */      {				/* First block only: save names */	if (blocknum == 0)	  {	    if (currnum == 0)	      ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));	    else 	      ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));	    ainfo->sqinfo[currnum].flags = 0;	    SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);	  }	else			/* in each additional block: check names */	  {	    if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)	      warn_names = TRUE;	  }	currnum++;				/* check rcol, lcol */	if ((sptr = strtok(NULL, WHITESPACE)) != NULL)	  {				/* is this the furthest left we've				   seen word 2 in this block? */	    if (sptr - bufcpy < blocks[blocknum].lcol) 	      blocks[blocknum].lcol = sptr - bufcpy;				/* look for right side in buffer */	    for (sptr = buffer + strlen(buffer) - 1;  		 strchr(WHITESPACE, *sptr) != NULL;		 sptr --)	      /* do nothing */ ;	    if (sptr - buffer > blocks[blocknum].rcol)	      blocks[blocknum].rcol = sptr - buffer;	  }				/* get the next line; blank line means end of block */	do	  {	    if (fgets(buffer, LINEBUFLEN, fp) == NULL) 	      { nptr = NULL; break; }	    strcpy(bufcpy, buffer);	    if      (strncmp(buffer, "#=SS",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;	    else if (strncmp(buffer, "#=SA",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;	    else if (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;	    else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;	    if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) 	      break;	  } while (strchr(commentsyms, *nptr) != NULL);      }				/* check that number of sequences matches expected */      if (blocknum == 0)	num = currnum;      else if (currnum != num)	Die("Parse error in ReadSELEX()");      blocknum++;				/* get first line of next block 				 * (non-comment, non-blank) */      do	{	  if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }	  strcpy(bufcpy, buffer);	}      while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || 	     (strchr(commentsyms, *nptr) != NULL));    }    /***************************************************   * Get ready for second pass:   *   figure out the length of the alignment   *   malloc space   *   rewind the file   ***************************************************/  alen = 0;  for (currblock = 0; currblock < blocknum; currblock++)    alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;  rewind(fp);  /* allocations. we can't use AllocateAlignment because of   * the way we already used ainfo->sqinfo.   */  aseqs     = (char **) MallocOrDie (num * sizeof(char *));  if (have_cs)     ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));  if (have_rf)     ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));      for (i = 0; i < num; i++)    {      aseqs[i]     = (char *) MallocOrDie ((alen+1) * sizeof(char));      if (ainfo->sqinfo[i].flags & SQINFO_SS)	ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));      if (ainfo->sqinfo[i].flags & SQINFO_SA)	ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));    }    ainfo->alen = alen;  ainfo->nseq = num;   ainfo->wgt  = (float *) MallocOrDie (sizeof(float) * num);  FSet(ainfo->wgt, num, 1.0);  /***************************************************   * Second pass across file. Parse header; assemble sequences   ***************************************************/  /* We've now made a complete first pass over the file. We know how   * many blocks it contains, we know the number of seqs in the first   * block, and we know every block has the same number of blocks;   * so we can be a bit more cavalier about error-checking as we   * make the second pass.   */  /* Look for header   */  headnum = 0;  for (;;)    {      if (fgets(buffer, LINEBUFLEN, fp) == NULL)	Die("Parse error in ReadSELEX()");      strcpy(bufcpy, buffer);      if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */      if (strcmp(nptr, "#=AU") == 0  && (sptr = strtok(NULL, "\n")) != NULL)	ainfo->au = Strdup(sptr);      else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)	ainfo->name = Strdup(sptr);      else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)	ainfo->acc  = Strdup(sptr);      else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)	ainfo->desc = Strdup(sptr);      else if (strcmp(nptr, "#=GA") == 0)	{	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 	    Die("Parse error in #=GA line in ReadSELEX()");	  ainfo->ga1 = atof(sptr);	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 	    Die("Parse error in #=GA line in ReadSELEX()");	  ainfo->ga2 = atof(sptr);	  ainfo->flags |= AINFO_GA;	}      else if (strcmp(nptr, "#=TC") == 0)	{	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 	    Die("Parse error in #=TC line in ReadSELEX()");	  ainfo->tc1 = atof(sptr);	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 	    Die("Parse error in #=TC line in ReadSELEX()");	  ainfo->tc2 = atof(sptr);	  ainfo->flags |= AINFO_TC;	}      else if (strcmp(nptr, "#=NC") == 0)	{	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 	    Die("Parse error in #=NC line in ReadSELEX()");	  ainfo->nc1 = atof(sptr);	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL) 	    Die("Parse error in #=NC line in ReadSELEX()");	  ainfo->nc2 = atof(sptr);	  ainfo->flags |= AINFO_NC;	}      else if (strcmp(nptr, "#=SQ") == 0)      /* per-sequence header info */	{				/* first field is the name */	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)	    Die("Parse error in #=SQ line in ReadSELEX()");	  if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;				/* second field is the weight */	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)	    Die("Parse error in #=SQ line in ReadSELEX()");	  if (!IsReal(sptr)) 	    Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");	  ainfo->wgt[headnum] = atof(sptr);				/* third field is database source id */	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);				/* fourth field is database accession number */	  if ((sptr = strtok(NULL, WHITESPACE)) == NULL)	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);				/* fifth field is start..stop::olen */	  if ((sptr = strtok(NULL, ".:")) == NULL)	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);	  if ((sptr = strtok(NULL, ".:")) == NULL)	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);	  	  if ((sptr = strtok(NULL, ":\t ")) == NULL)	    Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");	  SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);				/* rest of line is optional description */	  if ((sptr = strtok(NULL, "\n")) != NULL)	    SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);	  	  headnum++;	}      else if (strcmp(nptr, "#=CS") == 0) break;      else if (strcmp(nptr, "#=RF") == 0) break;      else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */    }    currlen = 0;  for (currblock = 0 ; currblock < blocknum; currblock++)    {				/* parse the block */      seqidx = 0;      while (nptr != NULL)	{				/* Consensus structure */	  if (strcmp(nptr, "#=CS") == 0)	    {	      if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1, 					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))		Die("Parse error in #=CS line in ReadSELEX()");	    }				/* Reference coordinates */	  else if (strcmp(nptr, "#=RF") == 0)	    {	      if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1, 					buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))		Die("Parse error in #=RF line in ReadSELEX()");	    }				/* Individual secondary structure */	  else if (strcmp(nptr, "#=SS") == 0)	    {	      if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,					buffer, blocks[currblock].lcol, 					blocks[currblock].rcol, (char) '.'))		Die("Parse error in #=SS line in ReadSELEX()");
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -