📄 sqio.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 4 页
字号:
  while (!(feof(V->f) | ((*V->buf != '\0') & (*V->buf == '('))))    SeqfileGetLine(V);}static void readUWGCG(struct ReadSeqVars *V){  char  *si;  char  *sptr;  int    done;  V->seqlen = 0;  /*writeseq: "    %s  Length: %d  (today)  Check: %d  ..\n" */  /*drop above or ".." from id*/  if ((si = strstr(V->buf,"  Length: ")) != NULL) *si = 0;  else if ((si = strstr(V->buf,"..")) != NULL)    *si = 0;  if ((sptr = strtok(V->buf, "\n\t ")) != NULL)    SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);  do {    done = feof(V->f);    SeqfileGetLine(V);    if (! done) addseq(V->buf, V);  } while (!done);}    /* Function: ReadSeq() *  * Purpose:  Read next sequence from an open database file. *           Return the sequence and associated info. *            * Args:     fp      - open sequence database file pointer           *           format  - format of the file (previously determined *                      by call to SeqfileFormat()). *                      Currently unused, since we carry it in V.  *           ret_seq - RETURN: sequence *           sqinfo  - RETURN: filled in w/ other information   *                      * Limitations: uses squid_errno, so it's not threadsafe.                     *            * Return:   1 on success, 0 on failure. *           ret_seq and some field of sqinfo are allocated here, *           The preferred call mechanism to properly free the memory is: *            *           SQINFO sqinfo; *           char  *seq; *            *           ReadSeq(fp, format, &seq, &sqinfo); *           ... do something... *           FreeSequence(seq, &sqinfo); */intReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo){  int    gotuw;  squid_errno = SQERR_OK;  /* Here's the hack for sequential access of sequences from   * the multiple sequence alignment formats   */  if (IsAlignmentFormat(V->format))    {      if (V->msa->lastidx >= V->msa->nseq) 	{ /* out of data. try to read another alignment */					  MSAFree(V->msa);	  if ((V->msa = MSAFileRead(V->afp)) == NULL)	    return 0;	  V->msa->lastidx = 0;	}				/* copy and dealign the appropriate aligned seq */      MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, 			  V->msa->aseq[V->msa->lastidx], &(V->seq));      V->seqlen = strlen(V->seq);      /* Extract sqinfo stuff for this sequence from the msa.       * Tedious; code that should be cleaned.       */      sqinfo->flags = 0;      if (V->msa->sqname[V->msa->lastidx] != NULL) 	SetSeqinfoString(sqinfo, V->msa->sqname[V->msa->lastidx], SQINFO_NAME);      if (V->msa->sqacc != NULL && V->msa->sqacc[V->msa->lastidx] != NULL) 	SetSeqinfoString(sqinfo, V->msa->sqacc[V->msa->lastidx], SQINFO_ACC);      if (V->msa->sqdesc != NULL && V->msa->sqdesc[V->msa->lastidx] != NULL) 	SetSeqinfoString(sqinfo, V->msa->sqdesc[V->msa->lastidx], SQINFO_DESC);      if (V->msa->ss != NULL && V->msa->ss[V->msa->lastidx] != NULL) {	MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, 			    V->msa->ss[V->msa->lastidx], &(sqinfo->ss));	sqinfo->flags |= SQINFO_SS;      }      if (V->msa->sa != NULL && V->msa->sa[V->msa->lastidx] != NULL) {	MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, 			    V->msa->sa[V->msa->lastidx], &(sqinfo->sa));	sqinfo->flags |= SQINFO_SA;      }      V->msa->lastidx++;    }   else {    if (feof(V->f)) return 0;    if (V->ssimode == -1) {	/* normal mode */      V->seq           = (char*) calloc (kStartLength+1, sizeof(char));      V->maxseq        = kStartLength;    } else {			/* index mode: discarding seq */      V->seq           = NULL;      V->maxseq        = 0;    }    V->seqlen        = 0;    V->sqinfo        = sqinfo;    V->sqinfo->flags = 0;    switch (V->format) {    case SQFILE_IG      : readIG(V);      break;    case SQFILE_STRIDER : readStrider(V); break;    case SQFILE_GENBANK : readGenBank(V); break;    case SQFILE_FASTA   : readPearson(V); break;    case SQFILE_EMBL    : readEMBL(V);    break;    case SQFILE_ZUKER   : readZuker(V);   break;    case SQFILE_PIR     : readPIR(V);     break;    case SQFILE_GCGDATA : readGCGdata(V); break; 	    case SQFILE_GCG :      do {			/* skip leading comments on GCG file */	gotuw = (strstr(V->buf,"..") != NULL);	if (gotuw) readUWGCG(V);	SeqfileGetLine(V);      } while (! feof(V->f));      break;    case SQFILE_IDRAW:   /* SRE: no attempt to read idraw postscript */    default:      squid_errno = SQERR_FORMAT;      free(V->seq);      return 0;    }    if (V->seq != NULL)		/* (it can be NULL in indexing mode) */      V->seq[V->seqlen] = 0; /* stick a string terminator on it */  }  /* Cleanup   */  sqinfo->len    = V->seqlen;   sqinfo->flags |= SQINFO_LEN;  *ret_seq = V->seq;  if (squid_errno == SQERR_OK) return 1; else return 0;}  /* Function: SeqfileFormat() * Date:     SRE, Tue Jun 22 10:58:58 1999 [Sanger Centre] * * Purpose:  Determine format of an open file. *           Returns format code. *           Rewinds the file. *            *           Autodetects the following unaligned formats: *              SQFILE_FASTA *              SQFILE_GENBANK *              SQFILE_EMBL *              SQFILE_GCG *              SQFILE_GCGDATA *              SQFILE_PIR *           Also autodetects the following alignment formats: *              MSAFILE_STOCKHOLM *              MSAFILE_MSF *              MSAFILE_CLUSTAL *              MSAFILE_SELEX *              MSAFILE_PHYLIP * *           Can't autodetect MSAFILE_A2M, calls it SQFILE_FASTA. *           MSAFileFormat() does the opposite. * * Args:     sfp -  open SQFILE *            * Return:   format code, or SQFILE_UNKNOWN if unrecognized */          intSeqfileFormat(FILE *fp){  char *buf;  int   len;  int   fmt = SQFILE_UNKNOWN;  int   ndataline;  char *bufcpy, *s, *s1, *s2;  int   has_junk;  buf       = NULL;  len       = 0;  ndataline = 0;  has_junk  = FALSE;  while (sre_fgets(&buf, &len, fp) != NULL)    {      if (IsBlankline(buf)) continue;      /* Well-behaved formats identify themselves in first nonblank line.       */      if (ndataline == 0)	{	  if (strncmp(buf, ">>>>", 4) == 0 && strstr(buf, "Len: "))	    { fmt = SQFILE_GCGDATA; goto DONE; }	  if (buf[0] == '>')	    { fmt = SQFILE_FASTA; goto DONE; }	  if (strncmp(buf, "!!AA_SEQUENCE", 13) == 0 ||	      strncmp(buf, "!!NA_SEQUENCE", 13) == 0)	    { fmt = SQFILE_GCG; goto DONE; }	  if (strncmp(buf, "# STOCKHOLM 1.", 14) == 0)	    { fmt = MSAFILE_STOCKHOLM; goto DONE; }	  if (strncmp(buf, "CLUSTAL", 7) == 0 && 	      strstr(buf, "multiple sequence alignment") != NULL)	    { fmt = MSAFILE_CLUSTAL; goto DONE; }	  if (strncmp(buf, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0 ||	      strncmp(buf, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0)	    { fmt = MSAFILE_MSF; goto DONE; }	  			/* PHYLIP id: also just a good bet */	  bufcpy = sre_strdup(buf, -1);	  s = bufcpy;	  if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) != NULL &&	      (s2 = sre_strtok(&s, WHITESPACE, NULL)) != NULL &&	      IsInt(s1) && 	      IsInt(s2))	    { free(bufcpy); fmt = MSAFILE_PHYLIP; goto DONE; }	  free(bufcpy);	}      /* We trust that other formats identify themselves soon.       */     				/* dead giveaways for extended SELEX */      if (strncmp(buf, "#=AU", 4) == 0 ||          strncmp(buf, "#=ID", 4) == 0 ||	  strncmp(buf, "#=AC", 4) == 0 ||	  strncmp(buf, "#=DE", 4) == 0 ||	  strncmp(buf, "#=GA", 4) == 0 ||	  strncmp(buf, "#=TC", 4) == 0 ||	  strncmp(buf, "#=NC", 4) == 0 ||	  strncmp(buf, "#=SQ", 4) == 0 ||	  strncmp(buf, "#=SS", 4) == 0 ||	  strncmp(buf, "#=CS", 4) == 0 ||	  strncmp(buf, "#=RF", 4) == 0)	{ fmt = MSAFILE_SELEX; goto DONE; }	      if (strncmp(buf, "///", 3) == 0 || strncmp(buf, "ENTRY ", 6) == 0)	{ fmt = SQFILE_PIR; goto DONE; }				/* a ha, diagnostic of an (old) MSF file */      if ((strstr(buf, "..")    != NULL) && 	  (strstr(buf, "MSF:")  != NULL) &&	  (strstr(buf, "Check:")!= NULL))	{ fmt = MSAFILE_MSF; goto DONE; }				/* unaligned GCG (must follow MSF test!) */      if (strstr(buf, " Check: ") != NULL && strstr(buf, "..") != NULL)	{ fmt = SQFILE_GCG; goto DONE; }      if (strncmp(buf,"LOCUS ",6) == 0 || strncmp(buf,"ORIGIN ",6) == 0)	{ fmt = SQFILE_GENBANK; goto DONE; }      if (strncmp(buf,"ID   ",5) == 0 || strncmp(buf,"SQ   ",5) == 0)	{ fmt = SQFILE_EMBL; goto DONE; }      /* But past here, we're being desperate. A simple SELEX file is       * very difficult to detect; we can only try to disprove it.       */      s = buf;      if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; /* skip blank lines */      if (strchr("#%", *s1) != NULL) continue;   /* skip comment lines */      /* Disproof 1. Noncomment, nonblank lines in a SELEX file       * must have at least two space-delimited fields (name/seq)       */      if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) 	has_junk = TRUE;      /* Disproof 2.        * The sequence field should look like a sequence.       */      if (s2 != NULL && Seqtype(s2) == kOtherSeq) 	has_junk = TRUE;      ndataline++;      if (ndataline == 300) break; /* only look at first 300 lines */    }  if (ndataline == 0)    Die("Sequence file contains no data");  /* If we've made it this far, we've run out of data, but there   * was at least one line of it; check if we've   * disproven SELEX. If not, cross our fingers, pray, and guess SELEX.    */  if (has_junk == TRUE) fmt = SQFILE_UNKNOWN;  else                  fmt = MSAFILE_SELEX; DONE:  if (buf != NULL) free(buf);  rewind(fp);  return fmt;}/* Function: GCGBinaryToSequence() *  * Purpose:  Convert a GCG 2BIT binary string to DNA sequence. *           0 = C  1 = T  2 = A  3 = G *           4 nts/byte *            * Args:     seq  - binary sequence. Converted in place to DNA. *           len  - length of DNA. binary is (len+3)/4 bytes */intGCGBinaryToSequence(char *seq, int len){  int   bpos;			/* position in binary   */  int   spos;			/* position in sequence */  char  twobit;  int   i;  for (bpos = (len-1)/4; bpos >= 0; bpos--)     {      twobit = seq[bpos];      spos   = bpos*4;      for (i = 3; i >= 0; i--) 	{	  switch (twobit & 0x3) {	  case 0: seq[spos+i] = 'C'; break;	  case 1: seq[spos+i] = 'T'; break;	  case 2: seq[spos+i] = 'A'; break;	  case 3: seq[spos+i] = 'G'; break;	  }	  twobit = twobit >> 2;	}    }  seq[len] = '\0';  return 1;}/* Function: GCGchecksum() * Date:     SRE, Mon May 31 11:13:21 1999 [St. Louis] * * Purpose:  Calculate a GCG checksum for a sequence. *           Code provided by Steve Smith of Genetics *           Computer Group. * * Args:     seq -  sequence to calculate checksum for. *                  may contain gap symbols. *           len -  length of sequence (usually known, *                  so save a strlen() call)        * * Returns:  GCG checksum. */intGCGchecksum(char *seq, int len){  int i;			/* position in sequence */  int chk = 0;			/* calculated checksum  */  for (i = 0; i < len; i++)     chk = (chk + (i % 57 + 1) * (sre_toupper((int) seq[i]))) % 10000;  return chk;}/* Function: GCGMultchecksum() *  * Purpose:  GCG checksum for a multiple alignment: sum of *           individual sequence checksums (including their *           gap characters) modulo 10000. * *           Implemented using spec provided by Steve Smith of *           Genetics Computer Group. *            * Args:     seqs - sequences to be checksummed; aligned or not *           nseq - number of sequences *            * Return:   the checksum, a number between 0 and 9999 */                      intGCGMultchecksum(char **seqs, int nseq){  int chk = 0;  int idx;  for (idx = 0; idx < nseq; idx++)    chk = (chk + GCGchecksum(seqs[idx], strlen(seqs[idx]))) % 10000;  return chk;}/* Function: Seqtype() *  * Purpose:  Returns a (very good) guess about type of sequence: *           kDNA, kRNA, kAmino, or kOtherSeq. *            *           Modified from, and replaces, Gilbert getseqtype(). */intSeqtype(char *seq){  int  saw;			/* how many non-gap characters I saw */  char c;  int  po = 0;			/* count of protein-only */  int  nt = 0;			/* count of t's */  int  nu = 0;			/* count of u's */  int  na = 0;			/* count of nucleotides */  int  aa = 0;			/* count of amino acids */  int  no = 0;			/* count of others */    /* Look at the first 300 non-gap characters   */  for (saw = 0; *seq != '\0' && saw < 300; seq++)    {      c = sre_toupper((int) *seq);      if (! isgap(c)) 	{	  if (strchr(protonly, c)) po++;	  else if (strchr(primenuc,c)) {	    na++;	    if (c == 'T') nt++;	    else if (c == 'U') nu++;	  }	  else if (strchr(aminos,c)) aa++;	  else if (isalpha((int) c)) no++;	  saw++;	}    }  if (no > 0) return kOtherSeq;  else if (po > 0) return kAmino;  else if (na > aa) {    if (nu > nt) return kRNA;    else return kDNA;    }  else return kAmino;		/* ooooh. risky. */}/* Function: GuessAlignmentSeqtype() * Date:     SRE, Wed Jul  7 09:42:34 1999 [St. Louis] * * Purpose:  Try to guess whether an alignment is protein  *           or nucleic acid; return a code for the *           type (kRNA, kDNA, or kAmino). * * Args:     aseq  - array of aligned sequences. (Could also *                   be an rseq unaligned sequence array) *           nseq  - number of aseqs * * Returns:  kRNA, kDNA, kAmino; *           kOtherSeq if inconsistency is detected. */intGuessAlignmentSeqtype(char **aseq, int nseq){  int idx;  int nrna   = 0;  int ndna   = 0;  int namino = 0;  int nother = 0;  for (idx = 0; idx < nseq; idx++)    switch (Seqtype(aseq[idx])) {    case kRNA:   nrna++;   break;    case kDNA:   ndna++;   break;    case kAmino: namino++; break;    default:     nother++;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -