seqsplit_main.c

来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 278 行

C
278
字号
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-2003 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. *****************************************************************//* seqsplit_main.c * SRE, Mon Sep 25 11:43:58 2000 *  * Split sequences into smaller chunks of defined size and overlap; * output a FASTA file. * * Limitations: *   still working in 32 bits -- no sequence can be more than 2 GB *   in size. * CVS $Id: seqsplit_main.c,v 1.7 2003/05/26 16:21:50 eddy Exp $ */#include "squidconf.h"#include <stdio.h>#include <string.h>#include "squid.h"#include "msa.h"static char banner[] = "seqsplit - split seqs into chunks of defined size and overlap";static char usage[]  = "\Usage: seqsplit [-options] <seqfile>\n\  Available options:\n\  -h        : help; display usage and version\n\  -o <file> : output the new FASTA file to <file>\n\";  static char experts[] = "\  --fragfile <f> : save one-line-per-frag coord summary file to <f>\n\  --informat <s> : specify sequence file format <s>\n\  --length <n>   : set max length of each unique seq frag to <n>\n\  --overlap <n>  : set overlap length to <n> (total frag size = length+overlap)\n\  --shortnames   : use short \"frag1\" names, not \"<src>/<from>-<to>\"\n\";static struct opt_s OPTIONS[] = {  { "-h", TRUE, sqdARG_NONE },      { "-o", TRUE, sqdARG_STRING },      { "--fragfile", FALSE, sqdARG_STRING },  { "--informat", FALSE, sqdARG_STRING },  { "--length",   FALSE, sqdARG_INT },  { "--overlap",  FALSE, sqdARG_INT },  { "--shortnames", FALSE, sqdARG_NONE },};#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))static char *set_description(char *source, int start, int end, char *origdesc);static char *set_name(char *origname, int start, int end, int do_shortnames, int fragnum);intmain(int argc, char **argv){  char     *seqfile;            /* name of sequence file     */  char     *outfile;		/* name of output file       */  SQFILE   *dbfp;		/* open sequence file        */  FILE     *ofp;		/* open output file          */  int       fmt;		/* format of seqfile         */  char     *seq;		/* sequence                  */  SQINFO    sqinfo;             /* extra info about sequence */  char     *seqfrag;		/* space for a seq fragment  */  int       fraglength;		/* length of unique seq per frag     */  int       overlap;            /* length of overlap. frags are fraglength+overlap*/  char     *sqname;	        /* renamed fragment, w/ coord info */  char     *desc;	        /* new desc line */  int       num;		/* number of this fragment   */  int       pos;		/* position in a sequence */  int       len;		/* length of a fragment   */    int       nseqs;		/* total number of input sequences */  int       nsplit;		/* number of seqs that get split */  int       nnewfrags;		/* total number of new fragments */  int       ntot;		/* total number of seqs in new file */  int       do_shortnames;	/* TRUE to do short code names */  char     *fragfile;           /* fragment summary out file, or NULL */  FILE     *fragfp;               char  *optname;  char  *optarg;  int    optind;  /***********************************************   * Parse command line   ***********************************************/  fmt           = SQFILE_UNKNOWN;	/* default: autodetect      */  fraglength    = 100000;  overlap       = 1000;  outfile       = NULL;  do_shortnames = FALSE;  fragfile      = NULL;  fragfp        = NULL;    while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, 		&optind, &optname, &optarg))    {      if      (strcmp(optname, "-o")         == 0)  outfile    = optarg;      else if (strcmp(optname, "--fragfile") == 0)  fragfile   = optarg;      else if (strcmp(optname, "--length")   == 0)  fraglength = atoi(optarg);      else if (strcmp(optname, "--overlap")  == 0)  overlap    = atoi(optarg);      else if (strcmp(optname, "--shortnames") == 0) do_shortnames = TRUE;      else if (strcmp(optname, "--informat") == 0) {	fmt = String2SeqfileFormat(optarg);	if (fmt == SQFILE_UNKNOWN) 	  Die("unrecognized sequence file format \"%s\"", optarg);      }      else if (strcmp(optname, "-h") == 0) {	SqdBanner(stdout, banner);	puts(usage);	puts(experts);        exit(EXIT_SUCCESS);      }    }  if (argc - optind != 1) Die("%s\n", usage);  seqfile = argv[argc-1];  seqfrag = MallocOrDie(sizeof(char) * (fraglength+overlap));  seqfrag[fraglength+overlap] = '\0';  /* Try to work around inability to autodetect from a pipe or .gz:   * assume FASTA format   */  if (fmt == SQFILE_UNKNOWN &&      (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0))    fmt = SQFILE_FASTA;  /***********************************************   * Read the file.   ***********************************************/  if (outfile == NULL)  ofp = stdout;   else {    if ((ofp = fopen(outfile, "w")) == NULL)      Die("Failed to open output sequence file %s for writing", outfile);  }  if (fragfile != NULL) {    if ((fragfp = fopen(fragfile, "w")) == NULL)      Die("Failed to open frag summary file %s for writing", fragfile);  }  if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)    Die("Failed to open sequence file %s for reading", seqfile);    nseqs = nsplit = nnewfrags = ntot = 0;  while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo))    {      nseqs++;      if (sqinfo.len <= fraglength+overlap) {	ntot++;	if (do_shortnames) {	    sqname = set_name(sqinfo.name, 1, sqinfo.len, do_shortnames, ntot);	    desc = set_description(sqinfo.name, 1, sqinfo.len, 				   sqinfo.flags & SQINFO_DESC ? sqinfo.desc : NULL);	} else {	  sqname = sre_strdup(sqinfo.name, -1);	  if (sqinfo.flags & SQINFO_DESC) desc = sre_strdup(sqinfo.desc, -1);	  else desc = NULL;	}	WriteSimpleFASTA(ofp, seq, sqname, desc);	if (fragfp != NULL) 	  fprintf(fragfp, "%s\t%s\t%d\t%d\n", sqname, sqinfo.name, 1, sqinfo.len);	if (desc != NULL) free(desc);	free(sqname);	continue;      }            num = 1;      nsplit++;      for (pos = 0; pos < sqinfo.len; pos += fraglength)	{	  if (sqinfo.len - pos <= overlap) continue;	  ntot++;	  strncpy(seqfrag, seq+pos, fraglength+overlap);	  len = strlen(seqfrag);	  if (do_shortnames) {	    sqname = set_name(sqinfo.name, pos+1, pos+len, do_shortnames, ntot);	    desc = set_description(sqinfo.name, pos+1, pos+len, 				   sqinfo.flags & SQINFO_DESC ? sqinfo.desc : NULL);	  } else {	    sqname = set_name(sqinfo.name, pos+1, pos+len, do_shortnames, num);	    if (sqinfo.flags & SQINFO_DESC) desc   = sre_strdup(sqinfo.desc, -1);	    else desc = NULL;	  }	  WriteSimpleFASTA(ofp, seqfrag, sqname, desc);	  if (fragfp != NULL) 	    fprintf(fragfp, "%s\t%s\t%d\t%d\n", sqname, sqinfo.name, pos+1,		    pos+len);	  if (desc != NULL) free(desc);	  free(sqname);	  nnewfrags++;	  num ++;	}      FreeSequence(seq, &sqinfo);    }  SeqfileClose(dbfp);  if (outfile   != NULL) fclose(ofp);  if (fragfile  != NULL) fclose(fragfp);  printf("Total # of seqs:         %d\n", nseqs);  printf("Affected by splitting:   %d\n", nsplit);  printf("New # of seqs:           %d\n", nseqs-nsplit + nnewfrags);  return 0;}static char *set_description(char *source, int start, int end, char *origdesc){  int   len;  char *new;    len = 7;			/* for [:..] \0 */  if (source != NULL) {    len += strlen(source);    len += start > 0 ? ceil(log10(start+1)) : 1; /* itoa length */    len += end   > 0 ? ceil(log10(end+1)) : 1;  }  if (origdesc != NULL) len += strlen(origdesc);  if (source != NULL) {    new = MallocOrDie(sizeof(char) * len);    sprintf(new, "[%s:%d..%d] %s", source, start, end, 	    origdesc == NULL ? "" : origdesc);  } else if (origdesc != NULL) {    new = sre_strdup(origdesc, -1);  } else     new = NULL;  return new;}static char *set_name(char *origname, int start, int end, int do_shortnames, int fragnum){  int   len;  char *new;  if (do_shortnames) {    len = 5;			/* frag \0 */    len += fragnum > 0 ? ceil(log10(fragnum+1)) : 1;    new = MallocOrDie(sizeof(char) * len);    sprintf(new, "frag%d", fragnum);  } else {    len = strlen(origname) + 8;    len += fragnum > 0 ? ceil(log10(fragnum+1)) : 1;    len += start > 0 ? ceil(log10(start+1)) : 1; /* itoa length */    len += end   > 0 ? ceil(log10(end+1)) : 1;    new = MallocOrDie(sizeof(char) * len);    sprintf(new, "%s/frag%d/%d-%d", origname, fragnum, start, end);  }  return new;}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?