📄 blast_lookup.c
字号:
/* * =========================================================================== * PRODUCTION $Log: blast_lookup.c,v $ * PRODUCTION Revision 1000.3 2004/06/01 18:07:25 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.25 * PRODUCTION * =========================================================================== *//* $Id: blast_lookup.c,v 1000.3 2004/06/01 18:07:25 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's offical duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== *//** @file blast_lookup.c * @todo FIXME needs file description */#include <algo/blast/core/blast_def.h>#include <algo/blast/core/blast_options.h>#include <algo/blast/core/blast_lookup.h>#include <algo/blast/core/blast_rps.h>#include <algo/blast/core/lookup_util.h>#include <algo/blast/core/blast_encoding.h>#include "blast_inline.h"static char const rcsid[] = "$Id: blast_lookup.c,v 1000.3 2004/06/01 18:07:25 gouriano Exp $";static void AddWordHits( LookupTable *lookup, Int4** matrix, Uint1* word, Int4 offset, Int4 query_bias);static void AddPSSMWordHits( LookupTable *lookup, Int4** matrix, Int4 offset, Int4 query_bias);Int4 BlastAaLookupNew(const LookupTableOptions* opt, LookupTable* * lut){ return LookupTableNew(opt, lut, TRUE);}Int4 RPSLookupTableNew(const RPSInfo *info, RPSLookupTable* * lut){ Int4 i; RPSLookupFileHeader *lookup_header; RPSProfileHeader *profile_header; RPSLookupTable* lookup = *lut = (RPSLookupTable*) calloc(1, sizeof(RPSLookupTable)); Int4* pssm_start; Int4 num_profiles; Int4 num_pssm_rows; Int4 longest_chain; ASSERT(info != NULL); /* Fill in the lookup table information. */ lookup_header = info->lookup_header; if (lookup_header->magic_number != RPS_MAGIC_NUM) return -1; lookup->rps_aux_info = (RPSAuxInfo *)(&info->aux_info); lookup->wordsize = BLAST_WORDSIZE_PROT; lookup->alphabet_size = PSI_ALPHABET_SIZE; lookup->charsize = ilog2(lookup->alphabet_size) + 1; lookup->backbone_size = 1 << (lookup->wordsize * lookup->charsize); lookup->mask = lookup->backbone_size - 1; lookup->rps_backbone = (RPSBackboneCell *)((Uint1 *)lookup_header + lookup_header->start_of_backbone); lookup->overflow = (Int4 *)((Uint1 *)lookup_header + lookup_header->start_of_backbone + (lookup->backbone_size + 1)* sizeof(RPSBackboneCell)); lookup->overflow_size = lookup_header->overflow_hits; /* fill in the pv_array */ lookup->pv = (PV_ARRAY_TYPE *) calloc((lookup->backbone_size >> PV_ARRAY_BTS) , sizeof(PV_ARRAY_TYPE)); longest_chain = 0; for (i = 0; i < lookup->backbone_size; i++) { if (lookup->rps_backbone[i].num_used > 0) { PV_SET(lookup,i); } if (lookup->rps_backbone[i].num_used > longest_chain) { longest_chain = lookup->rps_backbone[i].num_used; } } lookup->longest_chain = longest_chain; /* Fill in the PSSM information */ profile_header = info->profile_header; if (profile_header->magic_number != RPS_MAGIC_NUM) return -2; lookup->rps_seq_offsets = profile_header->start_offsets; num_profiles = profile_header->num_profiles; num_pssm_rows = lookup->rps_seq_offsets[num_profiles]; lookup->rps_pssm = (Int4 **)malloc((num_pssm_rows+1) * sizeof(Int4 *)); pssm_start = profile_header->start_offsets + num_profiles + 1; for (i = 0; i < num_pssm_rows + 1; i++) { lookup->rps_pssm[i] = pssm_start; pssm_start += lookup->alphabet_size; } return 0;}Int4 LookupTableNew(const LookupTableOptions* opt, LookupTable* * lut, Boolean is_protein){ LookupTable* lookup = *lut = (LookupTable*) calloc(1, sizeof(LookupTable)); ASSERT(lookup != NULL); if (is_protein) { Int4 i; lookup->charsize = ilog2(opt->alphabet_size) + 1; lookup->wordsize = opt->word_size; lookup->backbone_size = 0; for(i=0;i<lookup->wordsize;i++) lookup->backbone_size |= (opt->alphabet_size - 1) << (i * lookup->charsize); lookup->backbone_size += 1; lookup->mask = makemask(opt->word_size * lookup->charsize); } else { lookup->word_length = opt->word_size; lookup->scan_step = opt->scan_step; lookup->charsize = ilog2(opt->alphabet_size / COMPRESSION_RATIO); lookup->wordsize = (opt->word_size - MIN(lookup->scan_step,COMPRESSION_RATIO) + 1) / COMPRESSION_RATIO; lookup->reduced_wordsize = (lookup->wordsize >= 2) ? 2 : 1; lookup->backbone_size = iexp(2,lookup->reduced_wordsize*lookup->charsize*COMPRESSION_RATIO); lookup->mask = lookup->backbone_size - 1; } lookup->alphabet_size = opt->alphabet_size; lookup->exact_matches=0; lookup->neighbor_matches=0; lookup->threshold = opt->threshold; lookup->thin_backbone = (Int4**) calloc(lookup->backbone_size , sizeof(Int4*)); ASSERT(lookup->thin_backbone != NULL); lookup->use_pssm = opt->use_pssm; lookup->neighbors=NULL; lookup->overflow=NULL; return 0;}Int4 BlastAaLookupAddWordHit(LookupTable* lookup, /* in/out: the lookup table */ Uint1* w, Int4 query_offset){ Int4 index=0; Int4 chain_size = 0; /* total number of elements in the chain */ Int4 hits_in_chain = 0; /* number of occupied elements in the chain, not including the zeroth and first positions */ Int4 * chain = NULL; /* compute its index, */ _ComputeIndex(lookup->wordsize,lookup->charsize,lookup->mask, w, &index); ASSERT(index < lookup->backbone_size); /* if backbone cell is null, initialize a new chain */ if (lookup->thin_backbone[index] == NULL) { chain_size = 8; hits_in_chain = 0; chain = (Int4*) calloc( chain_size, sizeof(Int4) ); ASSERT(chain != NULL); chain[0] = chain_size; chain[1] = hits_in_chain; lookup->thin_backbone[index] = chain; } else /* otherwise, use the existing chain */ { chain = lookup->thin_backbone[index]; chain_size = chain[0]; hits_in_chain = chain[1]; } /* if the chain is full, allocate more room */ if ( (hits_in_chain + 2) == chain_size ) { chain_size = chain_size * 2; chain = (Int4*) realloc(chain, chain_size * sizeof(Int4) ); ASSERT(chain != NULL); lookup->thin_backbone[index] = chain; chain[0] = chain_size; } /* add the hit */ chain[ chain[1] + 2 ] = query_offset; chain[1] += 1; return 0;}Int4 _BlastAaLookupFinalize(LookupTable* lookup){ Int4 i; Int4 overflow_cells_needed=0; Int4 overflow_cursor = 0; Int4 backbone_occupancy=0; Int4 thick_backbone_occupancy=0; Int4 num_overflows=0; Int4 longest_chain=0; /* allocate the new lookup table */ lookup->thick_backbone = (LookupBackboneCell *) calloc(lookup->backbone_size , sizeof(LookupBackboneCell)); ASSERT(lookup->thick_backbone != NULL); /* allocate the pv_array */ lookup->pv = (PV_ARRAY_TYPE *) calloc((lookup->backbone_size >> PV_ARRAY_BTS) + 1 , sizeof(PV_ARRAY_TYPE)); ASSERT(lookup->pv != NULL); /* find out how many cells have >3 hits */ for(i=0;i<lookup->backbone_size;i++) if (lookup->thin_backbone[i] != NULL) { if (lookup->thin_backbone[i][1] > HITS_ON_BACKBONE) overflow_cells_needed += lookup->thin_backbone[i][1]; if (lookup->thin_backbone[i][1] > longest_chain) longest_chain = lookup->thin_backbone[i][1]; } lookup->longest_chain = longest_chain; /* allocate the overflow array */ lookup->overflow = (Int4*) calloc( overflow_cells_needed, sizeof(Int4) ); ASSERT(lookup->overflow != NULL);/* for each position in the lookup table backbone, */for(i=0;i<lookup->backbone_size;i++) { /* if there are hits there, */ if ( lookup->thin_backbone[i] != NULL ) { /* set the corresponding bit in the pv_array */ PV_SET(lookup,i); backbone_occupancy++; /* if there are three or fewer hits, */ if ( (lookup->thin_backbone[i])[1] <= HITS_ON_BACKBONE ) /* copy them into the thick_backbone cell */ { Int4 j; thick_backbone_occupancy++; lookup->thick_backbone[i].num_used = lookup->thin_backbone[i][1]; for(j=0;j<lookup->thin_backbone[i][1];j++) lookup->thick_backbone[i].payload.entries[j] = lookup->thin_backbone[i][j+2]; } else /* more than three hits; copy to overflow array */ { Int4 j; num_overflows++; lookup->thick_backbone[i].num_used = lookup->thin_backbone[i][1]; lookup->thick_backbone[i].payload.overflow_cursor = overflow_cursor; for(j=0;j<lookup->thin_backbone[i][1];j++) { lookup->overflow[overflow_cursor] = lookup->thin_backbone[i][j+2]; overflow_cursor++; } } /* done with this chain- free it */ sfree(lookup->thin_backbone[i]); lookup->thin_backbone[i]=NULL; } else /* no hits here */ { lookup->thick_backbone[i].num_used=0; } } /* end for */ lookup->overflow_size = overflow_cursor;/* done copying hit info- free the backbone */ sfree(lookup->thin_backbone); lookup->thin_backbone=NULL;#ifdef LOOKUP_VERBOSE printf("backbone size : %d\nbackbone occupancy: %d (%f%%)\nthick_backbone occupancy: %d (%f%%)\nnum_overflows: %d\noverflow size: %d\nlongest chain: %d\n",lookup->backbone_size, backbone_occupancy, 100.0 * (float) backbone_occupancy/ (float) lookup->backbone_size, thick_backbone_occupancy, 100.0 * (float) thick_backbone_occupancy / (float) lookup->backbone_size, num_overflows, overflow_cells_needed,longest_chain); printf("exact matches : %d\nneighbor matches : %d\n",lookup->exact_matches,lookup->neighbor_matches);#endif return 0;}Int4 BlastAaScanSubject(const LookupTableWrap* lookup_wrap, const BLAST_SequenceBlk *subject, Int4* offset, Uint4 * NCBI_RESTRICT query_offsets, Uint4 * NCBI_RESTRICT subject_offsets, Int4 array_size ){ Int4 index=0; Uint1* s=NULL; Uint1* s_first=NULL; Uint1* s_last=NULL; Int4 numhits = 0; /* number of hits found for a given subject offset */ Int4 totalhits = 0; /* cumulative number of hits found */ LookupTable* lookup = lookup_wrap->lut; s_first = subject->sequence + *offset; s_last = subject->sequence + subject->length - lookup->wordsize; _ComputeIndex(lookup->wordsize - 1, /* prime the index */ lookup->charsize, lookup->mask, s_first, &index); for(s=s_first; s <= s_last; s++) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -