⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 map_db.c

📁 序列对齐 Compare a protein sequence to a protein sequence database or a DNA sequence to a DNA sequence database
💻 C
字号:
/* map_db.c - read a FASTA or GCG format database and generate a list   of indices for rapid memory mapping *//* copyright (c) 1999 William R. Pearson *//* $Name: fa35_03_06 $ - $Id: map_db.c,v 1.9 2005/09/27 15:32:58 wrp Exp $ *//* input is a libtype 1,5, or 6 sequence database *//* output is a BLAST2 formatdb type index file *//* format of the index file:1)  map_db version number ["MP"+2 bytes]2)  number of sequences in database [4 bytes]3)  total length of database        [8 bytes]  (MP1, 4 bytes for MP0)4)  longest sequence in database    [8 bytes]  (MP1, 4 bytes for MP0)5) list of offsets to definitions  [num_seq+1] int*8 (MP1, 4 bytes for MP0)6) list of offsets to sequences    [num_seq+1] int*8 (MP1, 4 bytes for MP1)7) list of flag characters for sequences [num_seq+1]bytes    (used for GCG binary to encode 2bit or 4 bit representation)    sequence files will be as defined by their format*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <sys/types.h>#include <sys/stat.h>#include "uascii.h"#include "ncbl2_head.h"#define GCGBIN 6#define LASTLIB 6int (*get_entry) ();int a_get_ent(long *, long *);int v_get_ent(long *, long *);int gcg_get_ent(long *, long *);int gbf_get_ent(long *, long *);void src_int4_write(FILE *, int);void src_int4_read(FILE *, int *);void src_long4_write(FILE *, long);void src_long4_read(FILE *, long *);void src_long8_write(FILE *, long);void src_long8_read(FILE *, long *);void newname(char *nname, char *oname, char *suff, int maxn);int (*get_ent_arr[LASTLIB+1])()={a_get_ent, gbf_get_ent, NULL, NULL, NULL,				 v_get_ent, gcg_get_ent};long openlib(char *, int);static int *sascii;main(int argc, char **argv){  FILE *libi;  char lname[256];  char iname[256];  char format[4];  char *bp;  int i;  int nlib;	/* number of entries */  long max_len;	/* longest sequence */  long tot_len;	/* total sequence length */  int n1;    long f_size;	/* file size from fstat() */  int lib_size;	/* current space available - may be realloc'ed */  int 
lib_inc;  int lib_type; /* 1 for protein, 0 for DNA */  int lib_aa;	/* dna=1; prot=0; */  /* file offsets */  long d_pos;	/* start of description */  long s_pos;	/* start of sequence */  long *d_pos_arr;	/* array of description pointers */  long *s_pos_arr;	/* array of description pointers */  lib_type = 0;  lib_size = 200000;  lib_inc  = 100000;  lib_aa = 1;  while (argc > 1 && *argv[1]=='-') {    if (strcmp(argv[1],"-n")==0) lib_aa = 0;    argv++;    argc--;  }  /* open the database */  if (argc > 1) strncpy(lname, argv[1],sizeof(lname));  else {    fprintf(stderr," Entry library name: ");    fgets(lname,sizeof(lname),stdin);    if ((bp=strchr(lname,'\n'))!=NULL) *bp='\0';  }      if ((bp=strchr(lname,' '))!=NULL) {    lib_type = atoi(bp+1);    *bp='\0';  }  else lib_type = 0;  if (get_ent_arr[lib_type] == NULL) {    fprintf(stderr," cannot index file %s type %d\n",lname,lib_type);    exit(1);  }    if (lib_type == 6) lib_aa = 0;  if (lib_type == 1) lib_aa = 0;    if (lib_aa == 1) sascii = aascii;  else sascii = nascii;  if ((f_size=openlib(lname,lib_type))==0) {    fprintf(stderr," cannot open %s (type: %d)\n",lname,lib_type);    exit(1);  }  /* allocate array of description pointers */  if ((d_pos_arr=(long *)calloc(lib_size, sizeof(long)))==NULL) {    fprintf(stderr," cannot allocate %d for desc. array\n",lib_size);    exit(1);  }  /* allocate array of sequence pointers */  if ((s_pos_arr=(long *)calloc(lib_size, sizeof(long)))==NULL) {    fprintf(stderr," cannot allocate %d for seq. array\n",lib_size);    exit(1);  }  /* allocate array of sequence flags */  nlib = 0; tot_len=0; max_len=-1;  while ((n1=get_entry(&d_pos, &s_pos)) > 0) {    d_pos_arr[nlib] = d_pos;    s_pos_arr[nlib] = s_pos;    nlib++;    tot_len += n1;    if (n1 > max_len) max_len = n1;    if (nlib >= lib_size) { /* too many entries */      lib_size += lib_inc;      if ((d_pos_arr=(long *)realloc(d_pos_arr,lib_size*sizeof(long)))==NULL) {	fprintf(stderr," cannot realloc allocate %d for desc.. 
array\n",		lib_size);	exit(1);      }      if ((s_pos_arr=(long *)realloc(s_pos_arr,lib_size*sizeof(long)))==NULL) {	fprintf(stderr," cannot realloc allocate %d for seq. array\n",		lib_size);	exit(1);      }    }  }  d_pos_arr[nlib]= d_pos;	/* put in the end of the file */  s_pos_arr[nlib]=0;  /* all the information is in, write it out */    newname(iname,lname,"xin",sizeof(iname));  if ((libi=fopen(iname,"w"))==NULL) {    fprintf(stderr," cannot open %s for writing\n",iname);    exit(1);  }  /* write out format version */  format[0]='M';  format[1]='P';#ifdef BIG_LIB64  format[2]= 1;		/* format 1 for 8-byte offsets */#else  format[2]='\0';	/* format '\0' for original 4-byte */#endif  format[3]=lib_type;  fwrite(format,4,sizeof(char),libi);  /* write out sequence type */  src_int4_write(libi, lib_aa);  /* write out file fstat as integrity check */#ifdef BIG_LIB64  src_long8_write(libi, f_size);#else  src_int4_write(libi, f_size);#endif  /* write out num_seq */  src_int4_write(libi, nlib);#ifdef BIG_LIB64  /* write out tot_len, max_len */  src_long8_write(libi, tot_len);#else  src_int4_write(libi, tot_len);#endif  src_int4_write(libi, max_len);#ifdef BIG_LIB64  for (i=0; i<=nlib; i++) src_long8_write(libi,d_pos_arr[i]);  for (i=0; i<=nlib; i++) src_long8_write(libi,s_pos_arr[i]);#else  for (i=0; i<=nlib; i++) src_int4_write(libi,d_pos_arr[i]);  for (i=0; i<=nlib; i++) src_int4_write(libi,s_pos_arr[i]);#endif  fclose(libi);#ifdef BIG_LIB64  fprintf(stderr," wrote %d sequences (tot=%ld, max=%ld) to %s\n",	  nlib,tot_len,max_len,iname);#else  fprintf(stderr," wrote %d sequences (tot=%ld, max=%ld) to %s\n",	  nlib,tot_len,max_len,iname);#endif}FILE *libf=NULL;long lpos;#define MAXLINE 4096char lline[MAXLINE+1];longopenlib(char *lname, int lib_type){  long f_size;  struct stat stat_buf;  if (stat(lname,&stat_buf)<0) {    fprintf(stderr," cannot stat library: %s\n",lname);    return 0;  }  if ((libf=fopen(lname,"r"))==NULL) {    fprintf(stderr," cannot open library: %s 
(type: %d)\n",	    lname, lib_type);    return 0;  }    f_size = stat_buf.st_size;  get_entry = get_ent_arr[lib_type];  lpos = ftell(libf);  if (fgets(lline,MAXLINE,libf)==NULL) return 0;  return f_size;}inta_get_ent(long *d_pos, long *s_pos){  register char *cp;  register int *ap, n1;  ap = sascii;  while (lline[0]!='>' && lline[0]!=';') {    lpos = ftell(libf);    if (fgets(lline,sizeof(lline),libf)==NULL) {      *d_pos = lpos;      return 0;    }  }  *d_pos = lpos;  /* make certain we have the end of the line */  while (strchr((char *)lline,'\n')==NULL) {    if (fgets(lline,sizeof(lline),libf)==NULL) break;  }  *s_pos = ftell(libf);  lline[0]='\0';  n1 = 0;  while (fgets(lline,sizeof(lline),libf)!=NULL) {    if (lline[0]=='>') break;    if (lline[0]==';') {      if (strchr(lline,'\n')==NULL) {	fprintf(stderr," excessive continuation\n%s",lline);	return -1;      }    }    for (cp=lline; *cp; ) if (ap[*cp++]<NA) n1++;    lpos = ftell(libf);  }  return n1;}intv_get_ent(long *d_pos, long *s_pos){  register char *cp;  register int *ap;  int n1;  ap = sascii;  /* check for seq_id line */  while (lline[0]!='>' && lline[0]!=';') {    lpos = ftell(libf);    if (fgets(lline,sizeof(lline),libf)==NULL) {      *d_pos = lpos;      return 0;    }  }  *d_pos = lpos;  /* get the description line */  if (fgets(lline,sizeof(lline),libf)==NULL) return 0;  /* make certain we have the end of the line */  while (strchr((char *)lline,'\n')==NULL) {    if (fgets(lline,sizeof(lline),libf)==NULL) break;  }  *s_pos = ftell(libf);  lline[0]='\0';  n1 = 0;  while (fgets(lline,sizeof(lline),libf)!=NULL) {    if (lline[0]=='>') break;    for (cp=lline; *cp; ) if (ap[*cp++]<NA) n1++;    lpos = ftell(libf);  }  return n1;}static char gcg_type[10];static long gcg_len;static int gcg_bton[4]={2,4,1,3};intgcg_get_ent(long *d_pos, long *s_pos){  register char *cp;  register int *ap;  char libstr[20], dummy[20];  char gcg_date[6];  int r_block;  int n1;  /* check for seq_id line */  while 
(lline[0]!='>') {    lpos = ftell(libf);    if (fgets(lline,sizeof(lline),libf)==NULL) {      *d_pos = lpos;      return 0;    }  }  *d_pos = lpos;  /* get the encoding/sequence length info */  sscanf(&lline[4],"%s %s %s %s %ld",	 libstr,gcg_date,gcg_type,dummy,&gcg_len);  /* get the description line */  if (fgets(lline,MAXLINE,libf)==NULL) return;  *s_pos = ftell(libf);  /* seek to the end of the sequence; +1 to jump over newline */  if (gcg_type[0]=='2') {    r_block = (gcg_len+3)/4;    fseek(libf,r_block+1,SEEK_CUR);  }  else fseek(libf,gcg_len+1,SEEK_CUR);  lpos = ftell(libf);  fgets(lline,MAXLINE,libf);  return gcg_len;}intgbf_get_ent(long *d_pos, long *s_pos){  int n1;  char *cp;  register int *ap;#if !defined(TFAST)  ap = sascii;#else  ap = nascii;#endif  while (lline[0]!='L' || lline[1]!='O' || 	 strncmp(lline,"LOCUS",5)) { /* find LOCUS */    lpos = ftell(libf);    if (fgets(lline,MAXLINE,libf)==NULL) return (-1);  }  *d_pos=lpos;  while (lline[0]!='O' || lline[1]!='R' ||	 strncmp(lline,"ORIGIN",6)) { /* find ORIGIN */    if (fgets(lline,MAXLINE,libf)==NULL) return (-1);  }  *s_pos = ftell(libf);  lline[0]='\0';  n1=0;  while (fgets(lline,MAXLINE,libf)!=NULL) {    if (lline[0]=='/') break;    for (cp=lline; *cp; ) if (ap[*cp++]<NA) n1++;  }  lpos = ftell(libf);  fgets(lline,MAXLINE,libf);  return n1;}void src_int4_read(FILE *fd,  int *val){#ifdef IS_BIG_ENDIAN  fread((char *)val,(size_t)4,(size_t)1,fd);#else  unsigned char b[4];  fread((char *)&b[0],(size_t)1,(size_t)4,fd);  *val = 0;  *val = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)	  +(int)b[3];#endif}void src_int4_write(FILE *fd,  int val){#ifdef IS_BIG_ENDIAN  fwrite(&val,(size_t)4,(size_t)1,fd);#else  unsigned char b[4];  b[3] = val & 255;  b[2] = (val=val>>8)&255;  b[1] = (val=val>>8)&255;  b[0] = (val=val>>8)&255;  fwrite(b,(size_t)1,(size_t)4,fd);#endif}void src_long8_write(FILE *fd,  long val){#ifdef IS_BIG_ENDIAN  fwrite(&val,(size_t)8,(size_t)1,fd);#else  unsigned char b[8];  b[7] = 
val & 255;  b[6] = (val=val>>8)&255;  b[5] = (val=val>>8)&255;  b[4] = (val=val>>8)&255;  b[3] = (val=val>>8)&255;  b[2] = (val=val>>8)&255;  b[1] = (val=val>>8)&255;  b[0] = (val=val>>8)&255;  fwrite(b,(size_t)1,(size_t)8,fd);#endif}voidnewname(char *nname, char *oname, char *suff, int maxn){  strncpy(nname,oname,maxn-1);  strncat(nname,".",1);  strncat(nname,suff,maxn-strlen(nname));}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -