📄 ssi.c
字号:
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. *****************************************************************/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <sys/stat.h>#include <sys/types.h>#include <unistd.h>#include "squid.h"#include "ssi.h"static sqd_uint32 v20magic = 0xf3f3e9b1; /* SSI 1.0: "ssi1" + 0x80808080 */static sqd_uint32 v20swap = 0xb1e9f3f3; /* byteswapped */static int read_i16(FILE *fp, sqd_uint16 *ret_result);static int read_i32(FILE *fp, sqd_uint32 *ret_result);static int read_i64(FILE *fp, sqd_uint64 *ret_result);static int read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset);static int write_i16(FILE *fp, sqd_uint16 n);static int write_i32(FILE *fp, sqd_uint32 n);static int write_i64(FILE *fp, sqd_uint64 n);static int write_offset(FILE *fp, SSIOFFSET *offset);static int binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base, sqd_uint32 recsize, sqd_uint32 maxidx);static int indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len, sqd_uint32 n);static void clear_ssifile(SSIFILE *sfp);static int write_index(FILE *fp, SSIINDEX *g);static int write_index_chunk(SSIINDEX *g);static sqd_uint64 current_chunk_size(SSIINDEX *g);static int load_indexfile(SSIFILE *sfp);/* Function: SSIOpen() * Date: SRE, Sun Dec 31 12:40:03 2000 [St. Louis] * * Purpose: Opens the SSI index file {filename} and returns * a SSIFILE * stream thru {ret_sfp}. * The caller must eventually close this stream using * SSIClose(). More than one index file can be open * at once. * * Args: filename - full path to a SSI index file * * Returns: Returns 0 on success, nonzero on failure. */intSSIOpen(char *filename, SSIFILE **ret_sfp){ SSIFILE *sfp = NULL; int status; if ((sfp = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC; if ((sfp->fp = fopen(filename, "rb")) == NULL) return SSI_ERR_NOFILE; status = load_indexfile(sfp); *ret_sfp = sfp; return status;}/* load_indexfile(): given a SSIFILE structure with an open and positioned * stream (fp) -- but no other data loaded -- read the next SSIFILE * in from disk. We use this routine without its SSIOpen() wrapper * as part of the external mergesort when creating large indices. */static intload_indexfile(SSIFILE *sfp){ sqd_uint32 magic; sqd_uint16 i; /* counter over files */ int status; /* overall return status if an error is thrown */ status = SSI_ERR_BADFORMAT; /* default: almost every kind of error is a bad format error */ sfp->filename = NULL; sfp->fileformat = NULL; sfp->fileflags = NULL; sfp->bpl = NULL; sfp->rpl = NULL; sfp->nfiles = 0; if (! read_i32(sfp->fp, &magic)) {status = SSI_ERR_BADMAGIC; goto FAILURE; } if (magic != v20magic && magic != v20swap) {status = SSI_ERR_BADMAGIC; goto FAILURE; } if (! read_i32(sfp->fp, &(sfp->flags))) goto FAILURE; /* If we have 64-bit offsets, make sure we can deal with them. */#ifndef HAS_64BIT_FILE_OFFSETS if ((sfp->flags & SSI_USE64_INDEX) || (sfp->flags & SSI_USE64)) { status = SSI_ERR_NO64BIT; goto FAILURE; }#endif sfp->imode = (sfp->flags & SSI_USE64_INDEX) ? SSI_OFFSET_I64 : SSI_OFFSET_I32; sfp->smode = (sfp->flags & SSI_USE64) ? SSI_OFFSET_I64 : SSI_OFFSET_I32; if (! read_i16(sfp->fp, &(sfp->nfiles))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->nprimary))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->nsecondary))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->flen))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->plen))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->slen))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->frecsize))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->precsize))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->srecsize))) goto FAILURE; if (! read_offset(sfp->fp, sfp->imode, &(sfp->foffset))) goto FAILURE; if (! read_offset(sfp->fp, sfp->imode, &(sfp->poffset))) goto FAILURE; if (! read_offset(sfp->fp, sfp->imode, &(sfp->soffset))) goto FAILURE; /* Read the file information and keep it. * We expect the number of files to be small, so reading it * once should be advantageous overall. If SSI ever had to * deal with large numbers of files, you'd probably want to * read file information on demand. */ if (sfp->nfiles == 0) goto FAILURE; if ((sfp->filename=malloc(sizeof(char *) *sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } for (i = 0; i < sfp->nfiles; i++) sfp->filename[i] = NULL; if ((sfp->fileformat=malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } if ((sfp->fileflags =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } if ((sfp->bpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } if ((sfp->rpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } for (i = 0; i < sfp->nfiles; i++) { /* We have to explicitly position, because header and file * records may expand in the future; frecsize and foffset * give us forwards compatibility. */ if (indexfile_position(sfp, &(sfp->foffset), sfp->frecsize, i) !=0) goto FAILURE; if ((sfp->filename[i] =malloc(sizeof(char)*sfp->flen)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } if (fread(sfp->filename[i],sizeof(char),sfp->flen, sfp->fp)!=sfp->flen) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->fileformat[i]))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->fileflags[i]))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->bpl[i]))) goto FAILURE; if (! read_i32(sfp->fp, &(sfp->rpl[i]))) goto FAILURE; } /* Success. Return 0. */ return 0; FAILURE: /* Failure: free the damaged structure, return status code. */ SSIClose(sfp); return status;}/* Function: SSIGetOffsetByName() * Date: SRE, Sun Dec 31 13:55:31 2000 [St. Louis] * * Purpose: Looks up the string {key} in the open index {sfp}. * {key} can be either a primary or secondary key. If {key} * is found, {*ret_fh} contains a unique handle on * the file that contains {key} (suitable for an SSIFileInfo() * call, or for comparison to the handle of the last file * that was opened for retrieval), and {offset} is filled * in with the offset in that file. * * Args: sfp - open index file * key - string to search for * ret_fh - RETURN: handle on file that key is in * ret_offset - RETURN: offset of the start of that key's record * * Returns: 0 on success. * non-zero on error. */intSSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret_fh, SSIOFFSET *ret_offset){ int status; sqd_uint16 fnum; /* Look in the primary keys. */ status = binary_search(sfp, key, sfp->plen, &(sfp->poffset), sfp->precsize, sfp->nprimary); if (status == 0) { /* We found it as a primary key; get our data & return. */ if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA; *ret_fh = (int) fnum; if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA; return 0; /* success! (we don't need the other key data) */ } else if (status == SSI_ERR_NO_SUCH_KEY) { /* Not in the primary keys? OK, try the secondary keys. */ if (sfp->nsecondary > 0) { char *pkey; status = binary_search(sfp, key, sfp->slen, &(sfp->soffset), sfp->srecsize, sfp->nsecondary); if (status != 0) return status; if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC; if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA; status = SSIGetOffsetByName(sfp, pkey, ret_fh, ret_offset); free(pkey); } return status; } else return status; /*NOTREACHED*/}/* Function: SSIGetOffsetByNumber() * Date: SRE, Mon Jan 1 19:42:42 2001 [St. Louis] * * Purpose: Looks up primary key #{n} in the open index {sfp}. * {n} ranges from 0..nprimary-1. When key #{n} * is found, {*ret_fh} contains a unique * handle on the file that contains {key} (suitable * for an SSIFileInfo() call, or for comparison to * the handle of the last file that was opened for retrieval), * and {offset} is filled in with the offset in that file. * * Args: sfp - open index file * n - primary key number to retrieve. * ret_fh - RETURN: handle on file that key is in * ret_offset - RETURN: offset of the start of that key's record * * Returns: 0 on success. * non-zero on error. */intSSIGetOffsetByNumber(SSIFILE *sfp, int n, int *ret_fh, SSIOFFSET *ret_offset){ sqd_uint16 fnum; char *pkey; if (n >= sfp->nprimary) return SSI_ERR_NO_SUCH_KEY; if (indexfile_position(sfp, &(sfp->poffset), sfp->precsize, n) != 0) return SSI_ERR_SEEK_FAILED; if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC; if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA; if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA; if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA; *ret_fh = fnum; free(pkey); return 0;}/* Function: SSIGetSubseqOffset() * Date: SRE, Mon Jan 1 19:49:31 2001 [St. Louis] * * Purpose: Implements SSI_FAST_SUBSEQ. * * Looks up a primary or secondary {key} in the open * index {sfp}. Asks for the nearest offset to a * subsequence starting at position {requested_start} * in the sequence (numbering the sequence 1..L). * If {key} is found, on return, {ret_fh} * contains a unique handle on the file that contains * {key} (suitable for an SSIFileInfo() call, or for * comparison to the handle of the last file that was * opened for retrieval); {record_offset} contains the * disk offset to the start of the record; {data_offset} * contains the disk offset either exactly at the requested * residue, or at the start of the line containing the * requested residue; {ret_actual_start} contains the * coordinate (1..L) of the first valid residue at or * after {data_offset}. {ret_actual_start} is <= * {requested_start}. * * Args: sfp - open index file * key - primary or secondary key to find * requested_start - residue we'd like to start at (1..L) * ret_fh - RETURN: handle for file the key is in * record_offset - RETURN: offset of entire record * data_offset - RETURN: offset of subseq (see above) * ret_actual_start- RETURN: coord (1..L) of residue at data_offset * * Returns: 0 on success, non-zero on failure. */intSSIGetSubseqOffset(SSIFILE *sfp, char *key, int requested_start, int *ret_fh, SSIOFFSET *record_offset, SSIOFFSET *data_offset, int *ret_actual_start){ int status; sqd_uint32 len; int r, b, i, l; /* tmp variables for "clarity", to match docs */ /* Look up the key. Rely on the fact that SSIGetOffsetByName() * leaves the index file positioned at the rest of the data for this key. */ status = SSIGetOffsetByName(sfp, key, ret_fh, record_offset); if (status != 0) return status; /* Check that we're allowed to do subseq lookup on that file. */ if (! (sfp->fileflags[*ret_fh] & SSI_FAST_SUBSEQ)) return SSI_ERR_NO_SUBSEQS; /* Read the data we need for subseq lookup */ if (! read_offset(sfp->fp, sfp->smode, data_offset)) return SSI_ERR_NODATA; if (! read_i32(sfp->fp, &len)) return SSI_ERR_NODATA; /* Set up tmp variables for clarity of equations below, * and to make them match documentation (ssi-format.tex). */ r = sfp->rpl[*ret_fh]; /* residues per line */ b = sfp->bpl[*ret_fh]; /* bytes per line */ i = requested_start; /* start position 1..L */ l = (i-1)/r; /* data line # (0..) that the residue is on */ if (r == 0 || b == 0) return SSI_ERR_NO_SUBSEQS; if (i < 0 || i > len) return SSI_ERR_RANGE; /* When b = r+1, there's nothing but sequence on each data line (and the \0), * and we can find each residue precisely. */ if (b == r+1) { if (sfp->smode == SSI_OFFSET_I32) { data_offset->mode = SSI_OFFSET_I32; data_offset->off.i32 = data_offset->off.i32 + l*b + (i-1)%r; } else if (sfp->smode == SSI_OFFSET_I64) { data_offset->mode = SSI_OFFSET_I64; data_offset->off.i64 = data_offset->off.i64 + l*b + (i-1)%r; } *ret_actual_start = requested_start; } else { /* else, there's other stuff on seq lines, so the best * we can do easily is to position at start of relevant line. */ if (sfp->smode == SSI_OFFSET_I32) { data_offset->mode = SSI_OFFSET_I32; data_offset->off.i32 = data_offset->off.i32 + l*b; } else if (sfp->smode == SSI_OFFSET_I64) { data_offset->mode = SSI_OFFSET_I64; data_offset->off.i64 = data_offset->off.i64 + l*b; } /* yes, the eq below is = 1 + (i-1)/r*r but it's not = i. that's an integer /. */ *ret_actual_start = 1 + l*r; } return 0;}/* Function: SSISetFilePosition() * Date: SRE, Tue Jan 2 09:13:46 2001 [St. Louis] * * Purpose: Uses {offset} to sets the file position for {fp}, usually an * open sequence file, relative to the start of the file. * Hides the details of system-dependent shenanigans necessary for * file positioning in large (>2 GB) files. * * Behaves just like fseek(fp, offset, SEEK_SET) for 32 bit * offsets and <2 GB files. * * Warning: if all else fails, in desperation, it will try to * use fsetpos(). This requires making assumptions about fpos_t * that may be unwarranted... assumptions that ANSI C prohibits * me from making... though I believe the ./configure * script robustly tests whether I can play with fpos_t like this. * * Args: fp - file to position. * offset - SSI offset relative to file start. * * Returns: 0 on success, nonzero on error. */intSSISetFilePosition(FILE *fp, SSIOFFSET *offset){ if (offset->mode == SSI_OFFSET_I32) { if (fseek(fp, offset->off.i32, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; }#ifndef HAS_64BIT_FILE_OFFSETS else return SSI_ERR_NO64BIT;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -