📄 mmgetaa.c
字号:
/* mmgetaa.c - functions for mmap()ed access to libraries *//* copyright (c) 1999,2000 William R. Pearson *//* version 0 September, 1999 *//* This is one of two alternative files that can be used to read a database. The two files are nmgetaa.c, and mmgetaa.c (nxgetaa.c has been retired). nmgetlib.c and mmgetaa.c are used together. nmgetlib.c provides the same functions as nxgetaa.c if memory mapping is not used, mmgetaa.c provides the database reading functions if memory mapping is used. The decision to use memory mapping is made on a file-by-file basis.*//* $Name: fa35_03_06 $ - $Id: mmgetaa.c,v 1.45 2007/11/27 12:47:01 wrp Exp $ */#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <string.h>#include <errno.h>#include <sys/types.h>#include <sys/stat.h>#include <sys/mman.h>#include <fcntl.h>#define MAXLINE 512#define EOSEQ 0#define XTERNAL#include "uascii.h"/* #include "upam.h" */#undef XTERNAL#ifdef SUPERFAMNUMextern int nsfnum; /* number of superfamily numbers */extern int sfnum[10]; /* superfamily number from types 0 and 5 */extern int nsfnum_n;extern int sfnum_n[10];static char tline[MAXLINE];#endif#define GCGBIN 6#ifndef MAP_FILE#define MAP_FILE 0#endif#include "defs.h"#include "mm_file.h"extern MM_OFF bl2_long8_cvt(int64_t);extern int bl2_uint4_cvt(int);long crck(char *, int);extern void src_int4_read(FILE *fd, int *val);extern void src_long4_read(FILE *fd, long *valp);extern void src_long8_read(FILE *fd, int64_t *val);/* load_mmap() loads the d_pos[] and s_pos[] arrays for rapid access */struct lmf_str *load_mmap(FILE *libi, /* fd for already open ".xin" file */ char *sname, /* name of sequence database file */ int lib_type, /* 0-Fasta, 5-vms_pir, 6-gcg_binary */ int ldnaseq, /* 1 for DNA, 0 for protein */ struct lmf_str *m_fd){ char format[4]; int i, lib_aa; MM_OFF f_size; long lf_size; struct stat statbuf; int max_cnt; MM_OFF *d_pos_arr, *s_pos_arr; int mm_flag, mm64_flag; int *tmp_pos_arr; /* first check that the necessary indices are up-to-date */ /* read the offsets in ".xin" file */ if (fread(format,1,4,libi)==0) { fprintf(stderr," cannot read .xin format\n"); return NULL; } mm64_flag = (format[2]==1); /* 4 bytes or 8 bytes for long? */#ifndef BIG_LIB64 if (mm64_flag) {return NULL;}#endif if (format[3]!=lib_type) { fprintf(stderr," cannot read format %d != lib_type %d\n", format[3],lib_type); return NULL; } src_int4_read(libi,&lib_aa); if (lib_aa == ldnaseq) { /* database residue mismatch */ fprintf(stderr," residue type mismatch %s != %s (.xin) in %s\n", (lib_aa ? "DNA" : "prot."),(ldnaseq ? "prot." : "DNA"), sname); return NULL; } /* everything looks good, allocate an lmf_str */ m_fd->lib_aa = lib_aa; /* get get file size from index */ if (mm64_flag) src_long8_read(libi,&f_size); else { src_long4_read(libi,&lf_size); f_size = lf_size; } /* now, start to open mmap()ed file */ mm_flag=((m_fd->mmap_fd=open(sname,O_RDONLY))>=0); if (!mm_flag) { fprintf(stderr," cannot open %s for mmap()", sname); perror("..."); return NULL; /* file did not open */ } /* fstat the library file and get size */ if(fstat(m_fd->mmap_fd, &statbuf) < 0) { fprintf(stderr," cannot stat %s for mmap()", sname); perror("..."); m_fd->mm_flg = 0; goto finish; } /* check for identical sizes - if different, do not mmap */ if (f_size != statbuf.st_size) { fprintf(stderr," %s file size (%lld) and expected size (%lld) don't match\n", sname,statbuf.st_size,f_size); mm_flag = 0; goto finish; } /* the index file and library file are open and the sizes match */ /* allocate the m_file struct and map the file */ m_fd->st_size = statbuf.st_size; if((m_fd->mmap_base = mmap(NULL, m_fd->st_size, PROT_READ, MAP_FILE | MAP_SHARED, m_fd->mmap_fd, 0)) == (char *) -1) { mm_flag = 0;#ifdef DEBUG fprintf(stderr," cannot mmap %s", sname); perror("...");#endif } finish: close(m_fd->mmap_fd); if (!mm_flag) { return NULL; } /* now finish reading the index file */ src_int4_read(libi,&max_cnt); if (mm64_flag) { src_long8_read(libi,&m_fd->tot_len); } else { src_long4_read(libi,&lf_size); m_fd->tot_len = lf_size; } src_long4_read(libi,&lf_size); m_fd->max_len = lf_size;#ifdef DEBUG fprintf(stderr, "\n%s\tformat: %c%c%d %d; max_cnt: %d; tot_len: %lld max_len: %ld\n", sname,format[0],format[1],format[2],format[3], max_cnt,m_fd->tot_len,m_fd->max_len);#endif /* allocate array of description pointers */ if (!mm64_flag) { if ((tmp_pos_arr=(int *)calloc(max_cnt+1,sizeof(int)))==NULL) { fprintf(stderr," cannot allocate %d for tmp_pos array\n", max_cnt+1); } } if ((d_pos_arr=(MM_OFF *)calloc(max_cnt+1, sizeof(MM_OFF)))==NULL) { fprintf(stderr," cannot allocate %d for desc. array\n",max_cnt+1); exit(1); } /* read m_fd->d_pos[max_cnt+1] */ if (mm64_flag) { if (fread(d_pos_arr,sizeof(MM_OFF),max_cnt+1,libi)!= max_cnt+1) { fprintf(stderr," error reading desc. offsets: %s\n",sname); return NULL; } } else { if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!= max_cnt+1) { fprintf(stderr," error reading desc. offsets: %s\n",sname); return NULL; }#ifdef DEBUG fprintf(stderr,"d_pos_crc: %ld\n", crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));#endif }#ifndef IS_BIG_ENDIAN if (mm64_flag) for (i=0; i<=max_cnt; i++) { d_pos_arr[i] = bl2_long8_cvt(d_pos_arr[i]); } else for (i=0; i<=max_cnt; i++) { d_pos_arr[i] = bl2_uint4_cvt(tmp_pos_arr[i]); }#else if (!mm64_flag) { for (i=0; i<=max_cnt; i++) { d_pos_arr[i] = tmp_pos_arr[i]; } }#endif#ifdef DEBUG for (i=0; i<max_cnt-1; i++) { if (d_pos_arr[i+1] <= d_pos_arr[i] ) fprintf(stderr," ** dpos_error [%d]\t%ld\t%ld\n", i,d_pos_arr[i],d_pos_arr[i+1]); }#endif /* allocate array of sequence pointers */ if ((s_pos_arr=(MM_OFF *)calloc(max_cnt+1,sizeof(MM_OFF)))==NULL) { fprintf(stderr," cannot allocate %d for seq. array\n",max_cnt+1); exit(1); } /* read m_fd->s_pos[max_cnt+1] */ if (mm64_flag) { if (fread(s_pos_arr,sizeof(MM_OFF),max_cnt+1,libi)!= max_cnt+1) { fprintf(stderr," error reading seq offsets: %s\n",sname); return NULL; } } else { if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!= max_cnt+1) { fprintf(stderr," error reading seq offsets: %s\n",sname); return NULL; }#ifdef DEBUG fprintf(stderr,"s_pos_crc: %ld\n", crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));#endif }#ifndef IS_BIG_ENDIAN if (mm64_flag) for (i=0; i<=max_cnt; i++) s_pos_arr[i] = bl2_long8_cvt(s_pos_arr[i]); else for (i=0; i<=max_cnt; i++) s_pos_arr[i] = (long)bl2_uint4_cvt(tmp_pos_arr[i]);#else if (!mm64_flag) for (i=0; i<=max_cnt; i++) s_pos_arr[i] = (long)tmp_pos_arr[i];#endif#ifdef DEBUG for (i=1; i<max_cnt-1; i++) { if (s_pos_arr[i+1]<s_pos_arr[i]) fprintf(stderr," ** spos_error [%d]\t%ld\t%ld\n", i,s_pos_arr[i],s_pos_arr[i]); }#endif if (!mm64_flag) free(tmp_pos_arr); m_fd->max_cnt = max_cnt; m_fd->d_pos_arr = d_pos_arr; m_fd->s_pos_arr = s_pos_arr; m_fd->lpos = 0; /* check_mmap(m_fd,-2); */ return m_fd;} char *mgets (char *s, int n, struct lmf_str *m_fd){ char *cs, *mfp; mfp = m_fd->mmap_addr; cs = s; while (--n > 0 && (*mfp != (char)EOF)) if ((*cs++ = *mfp++) == '\n') break; *cs = '\0'; m_fd->mmap_addr = mfp; return (*mfp == (char)EOF && cs == s) ? NULL : s;}intagetlibm(unsigned char *seq, int maxs, char *libstr, int n_libstr, fseek_t *libpos, int *lcont, struct lmf_str *m_fd, long *l_off){ register unsigned char *cp, *seqp; register int *ap; int sel_status; char *desc; int lpos; /* entry number in library */ long l; unsigned char *seqm, *seqm1; char *bp; static long seq_len; static unsigned char *cp_max;#ifdef SUPERFAMNUM char *bp1, *bpa, *tp; int i;#endif *l_off = 1; lpos = m_fd->lpos; seqp = seq; seqm = &seq[maxs-9]; seqm1 = seqm-1; ap = m_fd->sascii; if (*lcont==0) { start_seq: if (lpos >= m_fd->max_cnt) return (-1); seq_len = m_fd->d_pos_arr[lpos+1] - m_fd->s_pos_arr[lpos]; if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) { fprintf(stderr," ** sequence over-run: %ld at %d\n",seq_len,lpos); return(-1); } *libpos = (fseek_t)lpos; desc = m_fd->mmap_base+m_fd->d_pos_arr[lpos]+m_fd->acc_off; strncpy(libstr,desc,n_libstr-1); libstr[n_libstr-1]='\0'; if ((m_fd->sel_acc_p != NULL) && (sel_status = (m_fd->sel_acc_p)(libstr, 0, m_fd->sel_local)) <= 0) { if (sel_status < 0) return (-1); lpos++; goto start_seq; } if ((bp=strchr(libstr,'\r'))!=NULL) *bp='\0'; if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0'; if (n_libstr > MAX_UID) { bp = libstr; while (*bp++) if ( *bp=='\001' || *bp=='\t') *bp=' '; } for (bp = desc; *bp && (*bp != '\n'); *bp++ ) if (*bp == '@' && !strncmp(bp+1,"C:",2)) sscanf(bp+3,"%ld",l_off);#ifdef SUPERFAMNUM sfnum[0]=nsfnum=0; strncpy(tline,desc,sizeof(tline)); tline[MAXLINE-1]='\0'; if ((bp=strchr(tline,'\n'))!=NULL) *bp='\0'; if ((bp=strchr(tline,' ')) && (bp=strchr(bp+1,SFCHAR))) { if ((bpa = strchr(bp+1,'\001'))!=NULL) *bpa = '\0'; if ((bp1=strchr(bp+1,SFCHAR))==NULL) { fprintf(stderr," second %c missing: %s\n",SFCHAR,tline); } else { *bp1 = '\0'; i = 0; if ((tp = strtok(bp+1," \t"))!=NULL) { sfnum[i++] = atoi(tp); while ((tp = strtok((char *)NULL," \t")) != (char *)NULL) { sfnum[i++] = atoi(tp); if (i>=9) break; } } sfnum[nsfnum=i]= 0; if (nsfnum>1) sf_sort(sfnum,nsfnum); else { if (nsfnum<1) fprintf(stderr," found | but no sfnum: %s\n",libstr); } } } else { sfnum[0] = nsfnum = 0; }#endif m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos]; cp_max = (unsigned char *)(m_fd->mmap_addr+seq_len); } for (cp=(unsigned char *)m_fd->mmap_addr; seqp<seqm1; ) { if ((*seqp++=ap[*cp++])<NA && (*seqp++=ap[*cp++])<NA && (*seqp++=ap[*cp++])<NA && (*seqp++=ap[*cp++])<NA && (*seqp++=ap[*cp++])<NA && (*seqp++=ap[*cp++])<NA && (*seqp++=ap[*cp++])<NA && (*seqp++=ap[*cp++])<NA && (*seqp++=ap[*cp++])<NA && (*seqp++=ap[*cp++])<NA) continue; --seqp; if (cp >= cp_max) break; } m_fd->mmap_addr = (char *)cp; if (seqp>=seqm1) (*lcont)++; else { *lcont=0; lpos++;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -