⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mmgetaa.c

📁 序列对齐 Compare a protein sequence to a protein sequence database or a DNA sequence to a DNA sequenc
💻 C
📖 第 1 页 / 共 2 页
字号:
/* mmgetaa.c - functions for mmap()ed access to libraries *//* copyright (c) 1999,2000 William R. Pearson *//* version 0 September, 1999 *//*  This is one of two alternative files that can be used to  read a database.  The two files are nmgetaa.c, and mmgetaa.c  (nxgetaa.c has been retired).  nmgetlib.c and mmgetaa.c are used together. nmgetlib.c provides  the same functions as nxgetaa.c if memory mapping is not used,  mmgetaa.c provides the database reading functions if memory  mapping is used. The decision to use memory mapping is made on  a file-by-file basis.*//* $Name: fa35_03_06 $ - $Id: mmgetaa.c,v 1.45 2007/11/27 12:47:01 wrp Exp $ */#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <string.h>#include <errno.h>#include <sys/types.h>#include <sys/stat.h>#include <sys/mman.h>#include <fcntl.h>#define MAXLINE 512#define EOSEQ 0#define XTERNAL#include "uascii.h"/* #include "upam.h" */#undef XTERNAL#ifdef SUPERFAMNUMextern int nsfnum;	/* number of superfamily numbers */extern int sfnum[10];	/* superfamily number from types 0 and 5 */extern int nsfnum_n;extern int sfnum_n[10];static char tline[MAXLINE];#endif#define GCGBIN 6#ifndef MAP_FILE#define MAP_FILE 0#endif#include "defs.h"#include "mm_file.h"extern MM_OFF bl2_long8_cvt(int64_t);extern int bl2_uint4_cvt(int);long crck(char *, int);extern void src_int4_read(FILE *fd,  int *val);extern void src_long4_read(FILE *fd,  long  *valp);extern void src_long8_read(FILE *fd,  int64_t *val);/* load_mmap() loads the d_pos[] and s_pos[] arrays for rapid access */struct lmf_str *load_mmap(FILE *libi,	/* fd for already open ".xin" file */	  char *sname,	/* name of sequence database file */	  int lib_type,	/* 0-Fasta, 5-vms_pir, 6-gcg_binary */	  int ldnaseq,	/* 1 for DNA, 0 for protein */	  struct lmf_str *m_fd){  char format[4];  int i, lib_aa;  MM_OFF f_size;  long lf_size;  struct stat statbuf;  int max_cnt;  MM_OFF *d_pos_arr, *s_pos_arr;  int mm_flag, mm64_flag;  int *tmp_pos_arr;  /* first check that the necessary indices are up-to-date */  /* read the offsets in ".xin" file */  if (fread(format,1,4,libi)==0) {    fprintf(stderr," cannot read .xin format\n");    return NULL;  }      mm64_flag = (format[2]==1);	/* 4 bytes or 8 bytes for long? */#ifndef BIG_LIB64  if (mm64_flag) {return NULL;}#endif  if (format[3]!=lib_type) {    fprintf(stderr," cannot read format %d != lib_type %d\n",	    format[3],lib_type);    return NULL;  }  src_int4_read(libi,&lib_aa);  if (lib_aa == ldnaseq) { /* database residue mismatch */    fprintf(stderr," residue type mismatch %s != %s (.xin) in %s\n",	    (lib_aa ? "DNA" : "prot."),(ldnaseq ? "prot." : "DNA"),	    sname);    return NULL;  }      /* everything looks good, allocate an lmf_str */  m_fd->lib_aa = lib_aa;  /* get get file size from index */  if (mm64_flag) src_long8_read(libi,&f_size);  else {    src_long4_read(libi,&lf_size);    f_size = lf_size;  }  /* now, start to open mmap()ed file */  mm_flag=((m_fd->mmap_fd=open(sname,O_RDONLY))>=0);  if (!mm_flag) {    fprintf(stderr," cannot open %s for mmap()", sname);    perror("...");    return NULL;	/* file did not open */  }  /* fstat the library file and get size */  if(fstat(m_fd->mmap_fd, &statbuf) < 0) {    fprintf(stderr," cannot stat %s for mmap()", sname);    perror("...");    m_fd->mm_flg = 0;    goto finish;  }  /* check for identical sizes - if different, do not mmap */  if (f_size != statbuf.st_size) {    fprintf(stderr," %s file size (%lld) and expected size (%lld) don't match\n",	    sname,statbuf.st_size,f_size);    mm_flag = 0;    goto finish;      }  /* the index file and library file are open and the sizes match */  /* allocate the m_file struct and  map the file */  m_fd->st_size = statbuf.st_size;  if((m_fd->mmap_base =       mmap(NULL, m_fd->st_size, PROT_READ,	   MAP_FILE | MAP_SHARED, m_fd->mmap_fd, 0)) == (char *) -1) {    mm_flag = 0;#ifdef DEBUG    fprintf(stderr," cannot mmap %s", sname);    perror("...");#endif  }   finish:  close(m_fd->mmap_fd);  if (!mm_flag) { return NULL; }  /* now finish reading the index file */  src_int4_read(libi,&max_cnt);  if (mm64_flag) {    src_long8_read(libi,&m_fd->tot_len);  }  else {    src_long4_read(libi,&lf_size);    m_fd->tot_len = lf_size;  }  src_long4_read(libi,&lf_size);  m_fd->max_len = lf_size;#ifdef DEBUG  fprintf(stderr,	  "\n%s\tformat: %c%c%d %d; max_cnt: %d; tot_len: %lld max_len: %ld\n",	  sname,format[0],format[1],format[2],format[3],	  max_cnt,m_fd->tot_len,m_fd->max_len);#endif  /* allocate array of description pointers */  if (!mm64_flag) {    if ((tmp_pos_arr=(int *)calloc(max_cnt+1,sizeof(int)))==NULL) {      fprintf(stderr," cannot allocate %d for tmp_pos array\n",	      max_cnt+1);    }  }  if ((d_pos_arr=(MM_OFF *)calloc(max_cnt+1, sizeof(MM_OFF)))==NULL) {    fprintf(stderr," cannot allocate %d for desc. array\n",max_cnt+1);    exit(1);  }  /* read m_fd->d_pos[max_cnt+1] */  if (mm64_flag) {    if (fread(d_pos_arr,sizeof(MM_OFF),max_cnt+1,libi)!=	max_cnt+1) {      fprintf(stderr," error reading desc. offsets: %s\n",sname);      return NULL;    }  }  else {    if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!=	max_cnt+1) {      fprintf(stderr," error reading desc. offsets: %s\n",sname);      return NULL;    }#ifdef DEBUG    fprintf(stderr,"d_pos_crc: %ld\n",	    crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));#endif  }#ifndef IS_BIG_ENDIAN  if (mm64_flag)    for (i=0; i<=max_cnt; i++) {      d_pos_arr[i] = bl2_long8_cvt(d_pos_arr[i]);    }  else    for (i=0; i<=max_cnt; i++) {      d_pos_arr[i] = bl2_uint4_cvt(tmp_pos_arr[i]);    }#else  if (!mm64_flag) {    for (i=0; i<=max_cnt; i++) {      d_pos_arr[i] = tmp_pos_arr[i];    }  }#endif#ifdef DEBUG  for (i=0; i<max_cnt-1; i++) {    if (d_pos_arr[i+1] <= d_pos_arr[i] )      fprintf(stderr," ** dpos_error [%d]\t%ld\t%ld\n",	      i,d_pos_arr[i],d_pos_arr[i+1]);  }#endif  /* allocate array of sequence pointers */  if ((s_pos_arr=(MM_OFF *)calloc(max_cnt+1,sizeof(MM_OFF)))==NULL) {    fprintf(stderr," cannot allocate %d for seq. array\n",max_cnt+1);    exit(1);  }  /* read m_fd->s_pos[max_cnt+1] */  if (mm64_flag) {    if (fread(s_pos_arr,sizeof(MM_OFF),max_cnt+1,libi)!=	max_cnt+1) {      fprintf(stderr," error reading seq offsets: %s\n",sname);      return NULL;    }  }  else {    if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!=	max_cnt+1) {      fprintf(stderr," error reading seq offsets: %s\n",sname);      return NULL;    }#ifdef DEBUG    fprintf(stderr,"s_pos_crc: %ld\n",	    crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));#endif  }#ifndef IS_BIG_ENDIAN  if (mm64_flag)    for (i=0; i<=max_cnt; i++)      s_pos_arr[i] = bl2_long8_cvt(s_pos_arr[i]);  else    for (i=0; i<=max_cnt; i++)      s_pos_arr[i] = (long)bl2_uint4_cvt(tmp_pos_arr[i]);#else  if (!mm64_flag)     for (i=0; i<=max_cnt; i++)      s_pos_arr[i] = (long)tmp_pos_arr[i];#endif#ifdef DEBUG  for (i=1; i<max_cnt-1; i++) {    if (s_pos_arr[i+1]<s_pos_arr[i])      fprintf(stderr," ** spos_error [%d]\t%ld\t%ld\n",	      i,s_pos_arr[i],s_pos_arr[i]);  }#endif  if (!mm64_flag) free(tmp_pos_arr);  m_fd->max_cnt = max_cnt;  m_fd->d_pos_arr = d_pos_arr;  m_fd->s_pos_arr = s_pos_arr;  m_fd->lpos = 0;  /*   check_mmap(m_fd,-2); */  return m_fd;}   char *mgets (char *s, int n, struct lmf_str *m_fd){  char *cs, *mfp;  mfp = m_fd->mmap_addr;  cs = s;  while (--n > 0 && (*mfp != (char)EOF))    if ((*cs++ = *mfp++) == '\n') break;  *cs = '\0';  m_fd->mmap_addr = mfp;  return (*mfp == (char)EOF && cs == s) ? NULL : s;}intagetlibm(unsigned char *seq,	 int maxs,	 char *libstr,	 int n_libstr,	 fseek_t *libpos,	 int *lcont,	 struct lmf_str *m_fd,	 long *l_off){  register unsigned char *cp, *seqp;  register int *ap;  int sel_status;  char *desc;  int lpos;		/* entry number in library */  long l;  unsigned char *seqm, *seqm1;  char *bp;  static long seq_len;  static unsigned char *cp_max;#ifdef SUPERFAMNUM  char *bp1, *bpa, *tp;  int i;#endif  *l_off = 1;  lpos = m_fd->lpos;  seqp = seq;  seqm = &seq[maxs-9];  seqm1 = seqm-1;  ap = m_fd->sascii;  if (*lcont==0) {  start_seq:    if (lpos >= m_fd->max_cnt) return (-1);    seq_len = m_fd->d_pos_arr[lpos+1] - m_fd->s_pos_arr[lpos];    if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) {      fprintf(stderr," ** sequence over-run: %ld at %d\n",seq_len,lpos);      return(-1);    }    *libpos = (fseek_t)lpos;    desc = m_fd->mmap_base+m_fd->d_pos_arr[lpos]+m_fd->acc_off;    strncpy(libstr,desc,n_libstr-1);    libstr[n_libstr-1]='\0';    if ((m_fd->sel_acc_p != NULL) &&	(sel_status = (m_fd->sel_acc_p)(libstr, 0, m_fd->sel_local)) <= 0) {      if (sel_status < 0) return (-1);      lpos++;      goto start_seq;    }    if ((bp=strchr(libstr,'\r'))!=NULL) *bp='\0';    if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';    if (n_libstr > MAX_UID) {      bp = libstr;      while (*bp++) if ( *bp=='\001' || *bp=='\t') *bp=' ';    }    for (bp = desc; *bp && (*bp != '\n'); *bp++ )      if (*bp == '@' && !strncmp(bp+1,"C:",2)) sscanf(bp+3,"%ld",l_off);#ifdef SUPERFAMNUM    sfnum[0]=nsfnum=0;    strncpy(tline,desc,sizeof(tline));    tline[MAXLINE-1]='\0';    if ((bp=strchr(tline,'\n'))!=NULL) *bp='\0';    if ((bp=strchr(tline,' ')) && (bp=strchr(bp+1,SFCHAR))) {      if ((bpa = strchr(bp+1,'\001'))!=NULL) *bpa = '\0';      if ((bp1=strchr(bp+1,SFCHAR))==NULL) {	fprintf(stderr," second %c missing: %s\n",SFCHAR,tline);      }      else {	*bp1 = '\0';	i = 0;	if ((tp = strtok(bp+1," \t"))!=NULL) {	  sfnum[i++] = atoi(tp);	  while ((tp = strtok((char *)NULL," \t")) != (char *)NULL) {	    sfnum[i++] = atoi(tp);	    if (i>=9) break;	  }	}	sfnum[nsfnum=i]= 0;	if (nsfnum>1) sf_sort(sfnum,nsfnum);	else {	  if (nsfnum<1) fprintf(stderr," found | but no sfnum: %s\n",libstr);	}      }    }    else {      sfnum[0] = nsfnum = 0;      }#endif    m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];    cp_max = (unsigned char *)(m_fd->mmap_addr+seq_len);  }  for (cp=(unsigned char *)m_fd->mmap_addr; seqp<seqm1; ) {    if ((*seqp++=ap[*cp++])<NA &&	(*seqp++=ap[*cp++])<NA &&	(*seqp++=ap[*cp++])<NA &&	(*seqp++=ap[*cp++])<NA &&	(*seqp++=ap[*cp++])<NA &&	(*seqp++=ap[*cp++])<NA &&	(*seqp++=ap[*cp++])<NA &&	(*seqp++=ap[*cp++])<NA &&	(*seqp++=ap[*cp++])<NA &&	(*seqp++=ap[*cp++])<NA) continue;    --seqp;    if (cp >= cp_max) break;  }  m_fd->mmap_addr = (char *)cp;  if (seqp>=seqm1) (*lcont)++;  else {    *lcont=0;    lpos++;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -