⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 seqstat_main.c

📁 hmmer源程序
💻 C
字号:
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. *****************************************************************//* seqstat_main.c * Wed Aug 10 15:47:14 1994 *  * Look at a sequence file, determine some simple statistics. * CVS $Id: seqstat_main.c,v 1.9 2001/08/04 20:26:14 eddy Exp $ */#include <stdio.h>#include <string.h>#include <limits.h>#include <ctype.h>#include "squid.h"#include "msa.h"static char banner[] = "seqstat - show some simple statistics on a sequence file";static char usage[]  = "\Usage: seqstat [-options] <seqfile>\n\  Available options:\n\  -a    : report per-sequence info, not just a summary\n\  -h    : help; display usage and version\n\";  static char experts[] = "\  --gccomp       : with -a, include GC composition in report (DNA/RNA only)\n\  --informat <s> : specify sequence file format <s>\n\  --quiet        : suppress verbose header (used in regression testing)\n\";struct opt_s OPTIONS[] = {  { "-a", TRUE, sqdARG_NONE },      { "-h", TRUE, sqdARG_NONE },      { "--gccomp",   FALSE, sqdARG_NONE },  { "--informat", FALSE, sqdARG_STRING },  { "--quiet",    FALSE, sqdARG_NONE   },};#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))static float gc_composition(char *seq);intmain(int argc, char **argv){  char     *seqfile;            /* name of sequence file     */  SQFILE   *dbfp;		/* open sequence file        */  int       fmt;		/* format of seqfile         */  char     *seq;		/* sequence                  */  SQINFO    sqinfo;             /* extra info about sequence */  int       nseqs;  long long small;		/* smallest length */  long long large;		/* largest length  */  long long total;              /* total length    */  int       type;		/* kAmino, kDNA, kRNA, or kOtherSeq */  int    allreport;		/* TRUE to do a short table for each sequence */  int    be_quiet;		/* TRUE to suppress header */  int    do_gccomp;		/* TRUE to include GC composition in per-seq report */  float  gc;			/* fractional gc composition, 0..1 */    char  *optname;  char  *optarg;  int    optind;  /***********************************************   * Parse command line   ***********************************************/  fmt       = SQFILE_UNKNOWN;	/* default: autodetect format  */  allreport = FALSE;		/* default: file summary only  */  be_quiet  = FALSE;		/* show header info by default */  type      = kOtherSeq;	/* just to silence gcc uninit warning */  do_gccomp = FALSE;  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, 		&optind, &optname, &optarg))    {      if      (strcmp(optname, "-a")       == 0)  allreport = TRUE;       else if (strcmp(optname, "--quiet")  == 0)  be_quiet  = TRUE;       else if (strcmp(optname, "--gccomp") == 0)  do_gccomp = TRUE;       else if (strcmp(optname, "--informat") == 0) {	fmt = String2SeqfileFormat(optarg);	if (fmt == SQFILE_UNKNOWN) 	  Die("unrecognized sequence file format \"%s\"", optarg);      }      else if (strcmp(optname, "-h") == 0) {	Banner(stdout, banner);	puts(usage);	puts(experts);        exit(EXIT_SUCCESS);      }    }  if (argc - optind != 1) Die("%s\n", usage);  seqfile = argv[argc-1];  if (! be_quiet) Banner(stdout, banner);  /***********************************************   * Read the file.   ***********************************************/  if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)    Die("Failed to open sequence file %s for reading", seqfile);    if (allreport) {    printf("  %-15s %-5s %s%s\n", "  NAME", "LEN",            do_gccomp? " f_GC " : "",	   "DESCRIPTION");    printf("  --------------- ----- %s-----------\n",	   do_gccomp ? "----- " : "");  }  nseqs = 0;  small = -1;  large = -1;  total = 0L;  while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo))    {      if (nseqs == 0) type = Seqtype(seq);      if (do_gccomp)  gc   = gc_composition(seq);      if (allreport) {	if (do_gccomp) {	  printf("* %-15s %5d %.3f %-50.50s\n", sqinfo.name, sqinfo.len, 	       gc, 	       sqinfo.flags & SQINFO_DESC ? sqinfo.desc : "");	} else {	  printf("* %-15s %5d %-50.50s\n", sqinfo.name, sqinfo.len, 	       sqinfo.flags & SQINFO_DESC ? sqinfo.desc : "");	}      }      if (small == -1 || sqinfo.len < small) small = (long long) sqinfo.len;      if (large == -1 || sqinfo.len > large) large = (long long) sqinfo.len;      total += (long long) sqinfo.len;      nseqs++;      FreeSequence(seq, &sqinfo);    }  if (allreport) puts("");  printf("Format:              %s\n", SeqfileFormat2String(dbfp->format));  printf("Type (of 1st seq):   ");  switch (type)     {    case kDNA:      puts("DNA");     break;    case kRNA:      puts("RNA");     break;    case kAmino:    puts("Protein"); break;    case kOtherSeq: puts("Unknown"); break;    default:        Die("oops.");    }  printf("Number of sequences: %d\n", nseqs);  printf("Total # residues:    %lld\n", total);  printf("Smallest:            %lld\n", small);  printf("Largest:             %lld\n", large);  printf("Average length:      %.1f\n", (float) total / (float) nseqs);  SeqfileClose(dbfp);  return 0;}/* Function: gc_composition() * Date:     SRE, Mon Apr 23 10:01:48 2001 [St. Louis] * * Purpose:  Calculate the fractional GC composition of *           an input RNA or DNA sequence. Deals appropriately *           with IUPAC degeneracy. Case-insensitive.  *           Ignores gap symbols. Other unexpected characters *           make it die with an error (protein, for instance). * * Args:     seq - the DNA or RNA sequence * * Returns:  fractional GC composition, 0-1 */static floatgc_composition(char *seq){  int   c;  float total;  float gc;    gc = total = 0.;  for (; *seq != '\0'; seq++)     {      if (isgap(c)) continue;      c = toupper((int) *seq);      total += 1.0;      switch (c) {      case 'C':      case 'G':       case 'S': gc += 1.0; break;      case 'A':      case 'T':      case 'U':       case 'W': gc += 0.0; break;      case 'N':       case 'R':      case 'Y':      case 'M':      case 'K': gc += 0.5; break;      case 'H':       case 'D': gc += 0.3333; break;      case 'B':       case 'V': gc += 0.6667; break;      default:	Die("unrecognized nucleic acid character %c in sequence", c);      }    }  return (gc/total);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -