📄 ncbl2_mlib.c

📁 序列对齐 Compare a protein sequence to a protein sequence database or a DNA sequence to a DNA sequenc
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/*	ncbl2_lib.c	functions to read ncbi-blast format files from			formatdb (blast2.0 format files)		copyright (c) 1999 William R. Pearson*//* $Name: fa35_03_06 $ - $Id: ncbl2_mlib.c,v 1.64 2007/11/20 12:45:53 wrp Exp $ *//* to turn on mmap()ing for Blast2 files: */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#ifdef UNIX#include <unistd.h>#endif#include <errno.h>/* ****************************************************************17-May-2006Modified to read NCBI .[np]al and .msk files.  The .nal or .pal fileprovides a way to read sequences from a list of files.  The .msk fileprovides a compact way of indicating the subset of sequences in alarger database (typically nr or nt) that comprise a smaller database(e.g. swissprot or pdbaa).  A .pal file (e.g. swissprot.00.pal) thatuses a .msk file has the form:	# Alias file generated by genmask	# Date created: Mon Apr 10 11:24:05 2006	#	TITLE     Non-redundant SwissProt sequences	DBLIST    nr.00	OIDLIST   swissprot.00.msk	LENGTH    74351250	NSEQ      198346	MAXOID    2617347	MEMB_BIT 1	# end of the fileTo work with this file, we must first load the nr.00 file, and thenread the swissprot.00.msk file, and then scan all the entries in theswissprot.00.msk file (which are packed 32 mask-bit to an int) todetermine whether a specific libpos index entry is present in thesubset database.**************************************************************** *//* ****************************************************************This code reads NCBI Blast2 format databases from formatdb version 3 and 4(From NCBI) This section describes the format of the databases.Formatdb creates three main files for proteins containing indices,sequences, and headers with the extensions, respectively, of pin, psq,and phr (for nucleotides these are nin, nsq, and nhr).  A number ofother ISAM indices are created, but these are described elsewhere.FORMAT OF THE INDEX FILE------------------------1.) formatdb version number 	[4 bytes].2.) protein dump flag (1 for a protein database, 0 for a nucleotide    database) [4 bytes].3.) length of the database title in bytes	[4 bytes].4.) the database title		[length given in 3.)].5.) length of the date/time string	[4 bytes].6.) the date/time string	[length given in 5.)].7.) the number of sequences in the database	[4 bytes].8.) the total length of the database in residues/basepairs	[4 bytes].9.) the length of the longest sequence in the database 		[4 bytes].10.) a list of the offsets for definitions (one for each sequence) inthe header file.  There are num_of_seq+1 of these, where num_of_seq isthe number of sequences given in 7.).11.) a list of the offsets for sequences (one for each sequence) inthe sequence file.  There are num_of_seq+1 of these, where num_of_seqis the number of sequences given in 7.).12.) a list of the offsets for the ambiguity characters (one for eachsequence) in the sequence file.  This list is only present fornucleotide databases and, since the database is compressed 4/1 fornucleotides, allows the ambiguity characters to be restored when thesequence is generated.  There are num_of_seq+1 of these, wherenum_of_seq is the number of sequences given in 7.).FORMAT OF THE SEQUENCE FILE---------------------------There are different formats for the protein and nucleotide sequence files.The protein sequence files is quite simple.  The first byte in thefile is a NULL byte, followed by the sequence in ncbistdaa format(described in the NCBI Software Development Toolkit documentation).Following the sequence is another NULL byte, followed by the nextsequence.  The file ends with a NULL byte, following the lastsequence.The nucleotide sequence file contains the nucleotide sequence, withfour basepairs compressed into one byte.  The format used is NCBI2na,documented in the NCBI Software Development Toolkit manual.  Anyambiguity characters present in the original sequence are replaced atrandom by A, C, G or T.  The true value of ambiguity characters arestored at the end of each sequence to allow true reproduction of theoriginal sequence.FORMAT OF THE HEADER FILE  (formatdb version 3)-------------------------The format of the header file depends on whether or not the identifiers in theoriginal file were parsed or not.  For the case that they were not, then eachentry has the format:gnl|BL_ORD_ID|entry_number my favorite yeast sequence...Here entry_number gives the ordinal number of the sequence in thedatabase (with zero offset).  The identifiergnl|BL_ORD_ID|entry_number is used by the BLAST software to identifythe entry, if the user has not provided another identifier.  If theidentifier was parsed, then gnl|BL_ORD_ID|entry_number is replaced bythe correct identifier, as described inftp://ncbi.nlm.nih.gov/blast/db/README .There are no separators between these deflines.For formatdb version 4, the header file contains blast ASN.1 binarydeflines, which can parsed with parse_fastadl_asn().FORMAT OF THE .MSK FILE-----------------------The .msk file is simply a packed list of masks for formatdb "oids" forsome other file (typically nr).  The first value is the last oidavailable; the remainder are packed 32 oids/mask, so that the numberof masks is 1/32 the number of sequences in the file.**************************************************************** */#ifdef USE_MMAP#include <sys/types.h>#include <sys/stat.h>#include <sys/mman.h>#ifdef IBM_AIX#include <fcntl.h>#else#include <sys/fcntl.h>#endif#endif#ifdef USE_MMAP#ifndef MAP_FILE#define MAP_FILE 0#endif#endif#ifdef UNIX#define RBSTR "r"#else#define RBSTR "rb"#endif#ifdef WIN32#define SLASH_CHAR '\\'#define SLASH_STR "\\"#else#define SLASH_CHAR '/'#define SLASH_STR "/"#endif#define XTERNAL#include "uascii.h"#define XTERNAL#include "upam.h"#include "ncbl2_head.h"#include "defs.h"#include "mm_file.h"unsigned int bl2_uint4_cvt(unsigned int);unsigned int bl2_long4_cvt(long);int64_t bl2_long8_cvt(int64_t);void src_int4_read(FILE *fd,  int *valp);void src_uint4_read(FILE *fd,  unsigned int *valp);void src_long4_read(FILE *fd,  long *valp);void src_long8_read(FILE *fd,  int64_t *val);void ncbi_long8_read(FILE *fd,  int64_t *valp);void src_char_read(FILE *fd,  char *valp);unsigned char *parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,				 int *gi_p, int *db, char *acc,  char *name,				 char *title, int t_len, int *taxid);/* nt_btoa maps  from blast 2bit  format to ascii  characters */static char nt_btoa[5] = {"ACGT"};static char aa_b2toa[28]= {"-ABCDEFGHIKLMNPQRSTVWXYZU*OJ"};static int aa_btof[32];	/* maps to fasta alphabet */static int aa_btof_null = 0;static int dbtype, dbformat, amb_cnt;#define NCBIBL20 12int ncbl2_getliba(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);int ncbl2_getlibn(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);int ncbl2_getliba_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);int ncbl2_getlibn_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);void newname(char *, char *, char *, int);void parse_pal(char *, char *, int *, int *, FILE *);void ncbl2_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd);/* ncbl2_openlib() is used to open (and memory map) a BLAST2.0 format   file.  Ifdef USE_MMAP, then ncbl2_openlib returns a structure that can   be used to read the database. */   struct lmf_str *ncbl2_openlib(char *name, int ldnaseq){  char lname[256];  char dname[256];  char msk_name[256];  char hname[256];  char sname[256];  char tname[256];  char db_dir[256];  int pref_db= -1;  char *bp;  int title_len;  char *title_str=NULL;  int date_len;  char *date_str=NULL;  long ltmp;  int64_t l8tmp;  int oid_seqs, max_oid;  int oid_cnt, oid_len;  unsigned int *oid_list, o_max;  int tmp;  int i;#ifdef USE_MMAP  struct stat statbuf;#endif  FILE *ifile;	/* index offsets, also DB info */  unsigned int *f_pos_arr;  struct lmf_str *m_fptr;  if (ldnaseq==SEQT_PROT) {	/* read a protein database */    newname(lname,name,AA_LIST_EXT,(int)sizeof(lname));    newname(tname,name,AA_INDEX_EXT,(int)sizeof(tname));    newname(hname,name,AA_HEADER_EXT,(int)sizeof(hname));    newname(sname,name,AA_SEARCHSEQ_EXT,(int)sizeof(sname));    /* initialize map of BLAST2 amino acids to FASTA amino acids */    for (i=0; i<sizeof(aa_b2toa); i++) {      if ((tmp=aascii[aa_b2toa[i]])<NA) aa_btof[i]=tmp;      else if (aa_b2toa[i]=='*') aa_btof[i]=aascii['X'];      else aa_b2toa[i]=0;/*    else aa_btof[i]=aascii['X']; */    }    /* check to see if aa_btof[] actually does anything interesting */    aa_btof_null = 1;    for (i=0; i<sizeof(aa_b2toa); i++) {      if (i != aa_btof[i]) {#ifdef DEBUG	fprintf(stderr," difference at: i: %d aa_btof[i]: %d\n",i,aa_btof[i]);#endif	aa_btof_null = 0;      }    }  }  else {	/* reading DNA library */    newname(lname,name,NT_LIST_EXT,(int)sizeof(lname));    newname(tname,name,NT_INDEX_EXT,(int)sizeof(tname));    newname(hname,name,NT_HEADER_EXT,(int)sizeof(hname));    newname(sname,name,NT_SEARCHSEQ_EXT,(int)sizeof(sname));  }	  /* check first for list name */  max_oid = oid_seqs = 0;  oid_list = NULL;  if ((ifile = fopen(lname,"r"))!=NULL) {    if ((bp = strrchr(name,SLASH_CHAR))!=NULL) {      *bp = '\0';      strncpy(db_dir,name,sizeof(db_dir));      strncat(db_dir,SLASH_STR,sizeof(db_dir)-strlen(db_dir)-1);      *bp = SLASH_CHAR;    }    else {      db_dir[0]='\0';    }    /* we have a list file, we need to parse it */    parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);    fclose(ifile);    pref_db = -1;    if (oid_seqs > 0) {      /* get the pref_db before adding the directory */      if (strncmp(msk_name,"swissprot",9)==0) {	pref_db = 7;      }      else if (strncmp(msk_name,"pdbaa",5)==0) {	pref_db = 14;      }      /* need to add directory to both dname and msk_name */      strncpy(tname,db_dir,sizeof(tname));      strncat(tname,msk_name, sizeof(tname));      strncpy(msk_name, tname, sizeof(msk_name));      strncpy(tname,db_dir,sizeof(tname));      strncat(tname,dname, sizeof(tname));      strncpy(dname,tname,sizeof(dname));      if (ldnaseq == SEQT_PROT) {	newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));	newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));	newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));      }      else {	/* reading DNA library */	newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));	newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));	newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));      }      /* now load the oid file */      if ((ifile = fopen(msk_name,RBSTR))==NULL) {	fprintf(stderr,"error - cannot load %s file\n",msk_name);	return NULL;      }      else {	src_uint4_read(ifile,&o_max);	if (o_max != max_oid) {	  fprintf(stderr," error - oid count mismatch %d != %d\n",max_oid, o_max);	}	oid_len = (max_oid/32+1);	if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {	  fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);	  return NULL;	}	if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {	  fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);	  return NULL;	}	fclose(ifile);      }    }    else {	/* we had a .msk file, but there are no oid's in it.		   allocate an m_fptr and return it empty */      if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {	fprintf(stderr," cannot allocate lmf_str\n");	return NULL;      }      m_fptr->tmp_buf_max = 0;      /* load the oid info */      m_fptr->max_oid = 0;      m_fptr->oid_seqs = 0;      m_fptr->oid_list = (unsigned int *)calloc(1,sizeof(int));      m_fptr->pref_db= -1;      if (ldnaseq==SEQT_DNA) {	m_fptr->getlib = ncbl2_getlibn_o;	m_fptr->sascii = nascii;      }      else {	m_fptr->getlib = ncbl2_getliba_o;	m_fptr->sascii = aascii;      }      strncpy(m_fptr->lb_name,sname,MAX_FN);      return m_fptr;    }  }	  /* open the index file */  if ((ifile = fopen(tname,RBSTR))==NULL) {    fprintf(stderr," cannot open %s (%s) INDEX file",tname,name);    perror("...");    return 0;  }  src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */  src_uint4_read(ifile,(unsigned *)&dbtype);   /* get 1 for protein/0 DNA */  if (dbformat != FORMATDBV3 && dbformat!=FORMATDBV4) {    fprintf(stderr,"error - %s wrong formatdb version (%d/%d)\n",	    tname,dbformat,FORMATDBV3);    return NULL;  }  if ((ldnaseq==SEQT_PROT && dbtype != AAFORMAT) ||       (ldnaseq==SEQT_DNA && dbtype!=NTFORMAT)) {    fprintf(stderr,"error - %s wrong format (%d/%d)\n",	    tname,dbtype,(ldnaseq ? NTFORMAT: AAFORMAT));    return NULL;  }  /* the files are there - allocate lmf_str */  if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {    fprintf(stderr," cannot allocate lmf_str\n");    return NULL;  }  m_fptr->tmp_buf_max = 4096;  if ((m_fptr->tmp_buf=       (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {    fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");    return NULL;  }  /* load the oid info */  m_fptr->max_oid = max_oid;  m_fptr->oid_seqs = oid_seqs;  m_fptr->oid_list = oid_list;  m_fptr->pref_db= pref_db;  /* open the header file */  if ((m_fptr->hfile = fopen(hname,RBSTR))==NULL) {    fprintf(stderr," cannot open %s header file\n",hname);    goto error_r;  }  /* ncbl2_ranlib is used for all BLAST2.0 access */  m_fptr->ranlib = ncbl2_ranlib;  m_fptr->bl_format_ver = dbformat;  if (ldnaseq==SEQT_DNA) {    if (oid_seqs > 0) {      m_fptr->getlib = ncbl2_getlibn_o;    }    else {      m_fptr->getlib = ncbl2_getlibn;    }    m_fptr->sascii = nascii;  }  else {    if (oid_seqs > 0) {      m_fptr->getlib = ncbl2_getliba_o;    }    else {      m_fptr->getlib = ncbl2_getliba;    }    m_fptr->sascii = aascii;  }  strncpy(m_fptr->lb_name,sname,MAX_FN);  /* open the sequence file */#if defined (USE_MMAP)   m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);  if (!m_fptr->mm_flg) {    fprintf(stderr," cannot open %s",sname);    perror("...");  }  else {    if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {      fprintf(stderr," cannot fstat %s",sname);      perror("...");      m_fptr->mm_flg = 0;    }    else {      m_fptr->st_size = statbuf.st_size;      if((m_fptr->mmap_base = 	  mmap(NULL, m_fptr->st_size, PROT_READ,	       MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {	fprintf(stderr," cannot mmap %s",sname);	perror("...");	m_fptr->mm_flg = 0;      }        else {	m_fptr->mmap_addr = m_fptr->mmap_base;	m_fptr->mm_flg = 1;      }    }    /* regardless, close the open()ed version */    close(m_fptr->mmap_fd);  }#else  m_fptr->mm_flg = 0;#endif  if  (!m_fptr->mm_flg) {    if ((m_fptr->libf = fopen(sname,RBSTR))==NULL) {      fprintf(stderr," cannot open %s sequence file",sname);      perror("...");      goto error_r;    }  }/* all files should be open */  src_uint4_read(ifile,(unsigned *)&title_len);  if (title_len > 0) {    if ((title_str = calloc((size_t)title_len+1,sizeof(char)))==NULL) {      fprintf(stderr," cannot allocate title string (%d)\n",title_len);      goto error_r;    }    fread(title_str,(size_t)1,(size_t)title_len,ifile);  }    src_uint4_read(ifile,(unsigned *)&date_len);  if (date_len > 0) {    if ((date_str = calloc((size_t)date_len+1,sizeof(char)))==NULL) {      fprintf(stderr," cannot allocate date string (%d)\n",date_len);      goto error_r;    }    fread(date_str,(size_t)1,(size_t)date_len,ifile);  }    m_fptr->lpos = 0;  src_uint4_read(ifile,(unsigned *)&m_fptr->max_cnt);    if (dbformat == FORMATDBV3) {    src_long4_read(ifile,&ltmp);    m_fptr->tot_len = ltmp;  }  else {    ncbi_long8_read(ifile,&l8tmp);    m_fptr->tot_len = ltmp;  }  src_long4_read(ifile,&ltmp);  m_fptr->max_len = ltmp;  /* currently we are not using this information, but perhaps later */  if (title_str!=NULL) free(title_str);  if (date_str!=NULL) free(date_str);#ifdef DEBUG    fprintf(stderr,"%s format: BL2 (%s)  max_cnt: %d, totlen: %lld, maxlen %ld\n",	    name,m_fptr->mm_flg ? "mmap" : "fopen", 	    m_fptr->max_cnt,m_fptr->tot_len,m_fptr->max_len);#endif  /* allocate and read hdr indexes */  if ((f_pos_arr=(unsigned int *)calloc((size_t)m_fptr->max_cnt+1,sizeof(int)))==NULL) {      fprintf(stderr," cannot allocate tmp header pointers\n");      goto error_r;    }  if ((m_fptr->d_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {      fprintf(stderr," cannot allocate header pointers\n");      goto error_r;    }  /* allocate and read sequence offsets */  if ((m_fptr->s_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -