📄 alistat_main.c

📁 hmmer源程序
💻 C
字号:
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. *****************************************************************//* alistat_main.c * Fri Jan 27 10:41:41 1995 * CVS $Id: alistat_main.c,v 1.7 2001/04/23 15:48:04 eddy Exp $ *  * Look at an alignment file, determine some simple statistics. */#include <stdio.h>#include <stdlib.h>#include <string.h>#include "squid.h"#include "msa.h"static char banner[] = "alistat - show some simple statistics on an alignment file";static char usage[]  = "\Usage: alistat [-options] <alignment file>\n\  Available options:\n\  -a    : report per-sequence info, not just a summary\n\  -f    : fast: estimate average %id by sampling (not compatible with -a)\n\  -h    : help: display usage and version\n\  -q    : quiet: suppress verbose header\n\";static char experts[] = "\  Expert options:\n\  --consensus <f>: write majority rule consensus sequence(s) in FASTA\n\                   format to file <f>\n\  --identmx <f>  : save a report on all NxN pairwise identities to file <f>\n\  --informat <s> : specify alignment file format <s>\n\                   allowed formats: SELEX, MSF, Clustal, a2m, PHYLIP\n\";struct opt_s OPTIONS[] = {  { "-a", TRUE, sqdARG_NONE },      { "-f", TRUE, sqdARG_NONE },      { "-h", TRUE, sqdARG_NONE },      { "-q", TRUE, sqdARG_NONE },      { "--consensus", FALSE, sqdARG_STRING },  { "--identmx",   FALSE, sqdARG_STRING },  { "--informat",  FALSE, sqdARG_STRING },};#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))intmain(int argc, char **argv){  char     *afile;              /* name of aligned sequence file */  MSAFILE  *afp;		/* pointer to open alignment file*/  MSA      *msa;                /* multiple sequence alignment   */    int       fmt;		/* format of afile               */  int       rlen;		/* raw sequence length           */  int       nres;		/* number of residues            */  float  **imx;                 /* identity matrix               */  int       i,j;  int       small, large;	  int       bestj, worstj;  float     sum, best, worst;  float     worst_worst, worst_best, best_best;  float     avgid;  int       nsample;  int    allreport;  int    do_fast;  int    be_quiet;  char  *consfile;  FILE  *consfp = NULL;  char  *identmx_report;	/* file to save identity matrix info to */  FILE  *identmx_fp = NULL;  char  *optname;  char  *optarg;  int    optind;  /* These inits are solely to silence gcc warnings about    * uninitialized variables   */  worst_worst = worst_best = best_best = 0.0;  bestj = worstj = -1;  /***********************************************   * Parse command line   ***********************************************/  fmt            = MSAFILE_UNKNOWN; /* by default, we autodetect file format */  allreport      = FALSE;  do_fast        = FALSE;  be_quiet       = FALSE;  consfile       = NULL;  identmx_report = NULL;  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, 		&optind, &optname, &optarg))    {      if      (strcmp(optname, "-a") == 0) { allreport = TRUE; }      else if (strcmp(optname, "-f") == 0) { do_fast   = TRUE; }      else if (strcmp(optname, "-q") == 0) { be_quiet  = TRUE; }      else if (strcmp(optname, "--consensus") == 0) { consfile        = optarg; }      else if (strcmp(optname, "--identmx")   == 0) { identmx_report  = optarg; }      else if (strcmp(optname, "--informat") == 0) {	fmt = String2SeqfileFormat(optarg);	if (fmt == MSAFILE_UNKNOWN) 	  Die("unrecognized sequence file format \"%s\"", optarg);	if (! IsAlignmentFormat(fmt))	  Die("%s is an unaligned format, can't read as an alignment", optarg);      }      else if (strcmp(optname, "-h") == 0) {	Banner(stdout, banner);	puts(usage);	puts(experts);        exit(EXIT_SUCCESS);      }    }  if (argc - optind != 1) Die("Incorrect number of arguments.\n%s\n", usage);  afile = argv[optind];  if (do_fast && allreport)    Die("Verbose reports (-a, --identmx) are incompatible with fast sampling (-f)");  if (do_fast && identmx_report != NULL)    Die("Verbose reports (-a, --identmx) are incompatible with fast sampling (-f)");  if (! be_quiet)    Banner(stdout, banner);  /***********************************************   * Loop over every alignment in the file.   ***********************************************/  if ((afp = MSAFileOpen(afile, fmt, NULL)) == NULL)    Die("Alignment file %s could not be opened for reading", afile);  if (consfile != NULL && (consfp = fopen(consfile, "w")) == NULL)    Die("Failed to open consensus sequence file %s for writing", consfile);  if (identmx_report != NULL && (identmx_fp = fopen(identmx_report, "w")) == NULL)    Die("Failed to open identity matrix report file %s for writing", identmx_report);  while ((msa = MSAFileRead(afp)) != NULL)    {      for (i = 0; i < msa->nseq; i++) s2upper(msa->aseq[i]);      /* Statistics we always collect:        *    unaligned sequence lengths; mean and range       */      nres = 0;      small = large = -1;      for (i = 0; i < msa->nseq; i++)	{	  rlen  = DealignedLength(msa->aseq[i]);	  nres +=  rlen;	  if (small == -1 || rlen < small) small = rlen;	  if (large == -1 || rlen > large) large = rlen;	}      /* Statistics we have to be careful about       * collecting, because of time constraints on NxN operations       */      if (do_fast)	{	  nsample = 1000;	  avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen, msa->nseq, 					      nsample);	}      else	{	  /* In a full report, for each sequence, find the best relative, 	   * and the worst relative. For overall statistics, save the            * worst best (most distant single seq) and the best best            * (most closely related pair) and the worst worst (most            * distantly related pair) and yes, I know it's confusing.	   */	  MakeIdentityMx(msa->aseq, msa->nseq, &imx);	  if (allreport) {	    printf("  %-15s %5s %7s %-15s %7s %-15s\n",		   "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)");	    printf("  --------------- ----- ------- --------------- ------- ---------------\n");	  }	  /* Print the identity matrix report: one line per pair of sequences.	   */	  if (identmx_report != NULL) 	    {	      for (i = 0; i < msa->nseq; i++)		for (j = i+1; j < msa->nseq; j++) 		  fprintf(identmx_fp, "%-4d %-4d %-15s %-15s %.3f\n",			  i, j, msa->sqname[i], msa->sqname[j], imx[i][j]);	    }	  sum = 0.0;	  worst_best  = 1.0;	  best_best   = 0.0;	  worst_worst = 1.0;	  for (i = 0; i < msa->nseq; i++)	    {	      worst = 1.0;	      best  = 0.0;	      for (j = 0; j < msa->nseq; j++)		{			/* closest seq to this one = best */		  if (i != j && imx[i][j] > best) 		    { best  = imx[i][j]; bestj = j; }		  if (imx[i][j] < worst)    		    { worst = imx[i][j]; worstj = j; }		}	      if (allreport) 		printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n",		       msa->sqname[i], DealignedLength(msa->aseq[i]),		       best * 100.,  msa->sqname[bestj],		       worst * 100., msa->sqname[worstj]);	  	      if (best > best_best)    best_best = best;	      if (best < worst_best)   worst_best = best;	      if (worst < worst_worst) worst_worst = worst;	      for (j = 0; j < i; j++)		sum += imx[i][j];	    }	  avgid = sum / (float) (msa->nseq * (msa->nseq-1)/2.0);	  if (allreport) puts("");	  FMX2Free(imx);	}      /* Print output.        * Some fields aren't available if -f (fast) was chosen.       */      if (msa->name != NULL)	printf("Alignment name:      %s\n",   msa->name);       printf("Format:              %s\n",     SeqfileFormat2String(afp->format));      printf("Number of sequences: %d\n",     msa->nseq);      printf("Total # residues:    %d\n",     nres);      printf("Smallest:            %d\n",     small);      printf("Largest:             %d\n",     large);      printf("Average length:      %.1f\n",   (float) nres / (float) msa->nseq);      printf("Alignment length:    %d\n",     msa->alen);      printf("Average identity:    %.0f%%\n", 100.*avgid);      if (! do_fast) {	printf("Most related pair:   %.0f%%\n", 100.*best_best);	printf("Most unrelated pair: %.0f%%\n", 100.*worst_worst);	printf("Most distant seq:    %.0f%%\n", 100.*worst_best);      }      /* Save majority rule consensus sequence if we were asked       */      if (consfile != NULL) {	char *cs;	cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen);	WriteSimpleFASTA(consfp, cs, 			 msa->name != NULL? msa->name : "consensus",			 msa->desc);	free(cs);	printf("Consensus:           written to %s\n", consfile);      }      puts("//");        MSAFree(msa);    }  MSAFileClose(afp);  if (consfile != NULL) fclose(consfp);  return 0;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -