📄 mb_lookup.c
字号:
/* * =========================================================================== * PRODUCTION $Log: mb_lookup.c,v $ * PRODUCTION Revision 1000.4 2004/06/01 18:08:17 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.33 * PRODUCTION * =========================================================================== *//* $Id: mb_lookup.c,v 1000.4 2004/06/01 18:08:17 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's offical duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Author: Ilya Dondoshansky * *//** @file mb_lookup.c * Functions responsible for the creation of a lookup table * @todo FIXME @sa mb_lookup.h */static char const rcsid[] = "$Id: mb_lookup.c,v 1000.4 2004/06/01 18:08:17 gouriano Exp $";#include <algo/blast/core/blast_options.h>#include <algo/blast/core/blast_def.h>#include <algo/blast/core/mb_lookup.h>#include "blast_inline.h"MBLookupTable* MBLookupTableDestruct(MBLookupTable* mb_lt){ if (!mb_lt) return NULL; if (mb_lt->hashtable) sfree(mb_lt->hashtable); if (mb_lt->next_pos) sfree(mb_lt->next_pos); if (mb_lt->next_pos2) sfree(mb_lt->next_pos2); sfree(mb_lt->pv_array); sfree(mb_lt); return mb_lt;}/** Convert weight, template length and template type from input options into an MBTemplateType enum*/static DiscTemplateType GetDiscTemplateType(Int2 weight, Uint1 length, DiscWordType type){ if (weight == 11) { if (length == 16) { if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES) return TEMPL_11_16; else if (type == MB_WORD_OPTIMAL) return TEMPL_11_16_OPT; } else if (length == 18) { if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES) return TEMPL_11_18; else if (type == MB_WORD_OPTIMAL) return TEMPL_11_18_OPT; } else if (length == 21) { if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES) return TEMPL_11_21; else if (type == MB_WORD_OPTIMAL) return TEMPL_11_21_OPT; } } else if (weight == 12) { if (length == 16) { if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES) return TEMPL_12_16; else if (type == MB_WORD_OPTIMAL) return TEMPL_12_16_OPT; } else if (length == 18) { if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES) return TEMPL_12_18; else if (type == MB_WORD_OPTIMAL) return TEMPL_12_18_OPT; } else if (length == 21) { if (type == MB_WORD_CODING || type == MB_TWO_TEMPLATES) return TEMPL_12_21; else if (type == MB_WORD_OPTIMAL) return TEMPL_12_21_OPT; } } return TEMPL_CONTIGUOUS; /* All unsupported cases default to 0 */}#define SMALL_QUERY_CUTOFF 15000#define LARGE_QUERY_CUTOFF 800000/* Documentation in mb_lookup.h */Int2 MB_LookupTableNew(BLAST_SequenceBlk* query, ListNode* location, MBLookupTable** mb_lt_ptr, const LookupTableOptions* lookup_options){ Int4 query_length; Uint1* seq,* pos; Int4 index; Int4 ecode; Int4 mask; Int4 ecode1, ecode2; Uint1 val, nuc_mask = 0xfc; MBLookupTable* mb_lt; Int4 masked_word_count = 0; PV_ARRAY_TYPE *pv_array=NULL; Int4 pv_shift, pv_array_bts, pv_size, pv_index; Int2 word_length, extra_length; Int4 last_offset; Int4 table_entries;#ifdef USE_HASH_TABLE Int4 hash_shift, hash_mask, crc, size, length; Uint1* hash_buf;#endif DiscTemplateType template_type; Boolean amb_cond; Boolean two_templates = (lookup_options->mb_template_type == MB_TWO_TEMPLATES); Int2 bytes_in_word; Uint1 width; Int4 from, to; ListNode* loc; Int4 longest_chain = 0; Int4 pcount, pcount1=0; if (!location) { /* Empty sequence location provided */ *mb_lt_ptr = NULL; return -1; } template_type = GetDiscTemplateType(lookup_options->word_size, lookup_options->mb_template_length, (DiscWordType)lookup_options->mb_template_type); query_length = query->length; mb_lt = (MBLookupTable*) calloc(1, sizeof(MBLookupTable)); bytes_in_word = (lookup_options->word_size + 1)/ 4; if (bytes_in_word < 3) width = 2; else width = 3; mb_lt->template_type = template_type; mb_lt->two_templates = two_templates; if (two_templates) /* For now leave only one possibility for the second template */ mb_lt->second_template_type = template_type + 1;#ifdef USE_HASH_TABLE /* Make hash table size the smallest power of 2 greater than query length */ for (length=query_length,size=1; length>0; length=length>>1,size++); mb_lt->hashsize = 1<<size; hash_shift = (32 - size)/2; hash_mask = mb_lt->hashsize - 1; pv_shift = 0;#else if (width == 2) { mb_lt->hashsize = 1 << 16; pv_shift = 0; } else { /* determine the approximate number of hashtable entries */ table_entries = 0; for (loc = location; loc; loc = loc->next) { from = ((SSeqRange*) loc->ptr)->left; to = ((SSeqRange*) loc->ptr)->right; table_entries += (to - from); } /* To fit in the external cache of latter-day micro- processors, the PV array must be compressed. pv_shift below is the power of two that the array size is divided by. The target PV array size is 128 kBytes. If the query is too small or too large, the compression should be higher. Small queries don't reuse the PV array, and large queries saturate it. In either case, cache is better used on other data. */ if (lookup_options->word_size == 11 && lookup_options->mb_template_length > 0) { mb_lt->hashsize = 1 << 22; if( table_entries <= SMALL_QUERY_CUTOFF || table_entries >= LARGE_QUERY_CUTOFF ) pv_shift = 3; else pv_shift = 2; } else { mb_lt->hashsize = 1 << 24; if( table_entries <= SMALL_QUERY_CUTOFF || table_entries >= LARGE_QUERY_CUTOFF ) pv_shift = 5; else pv_shift = 4; } }#endif if (lookup_options->mb_template_length > 0) { mb_lt->discontiguous = TRUE; mb_lt->compressed_wordsize = width + 1; word_length = mb_lt->word_length = COMPRESSION_RATIO*(width+1); mb_lt->template_length = lookup_options->mb_template_length; extra_length = mb_lt->template_length - word_length; mb_lt->mask = (Int4) (-1); mask = (1 << (8*(width+1) - 2)) - 1; } else { mb_lt->compressed_wordsize = width; word_length = COMPRESSION_RATIO*width; mb_lt->word_length = lookup_options->word_size; mb_lt->template_length = COMPRESSION_RATIO*width; mb_lt->mask = mb_lt->hashsize - 1; mask = (1 << (8*width - 2)) - 1; extra_length = 0; } /* Scan step must be set in lookup_options; even if it is not, still try to set it reasonably in mb_lt - it can't remain 0 */ if (lookup_options->scan_step) mb_lt->scan_step = lookup_options->scan_step; else mb_lt->scan_step = COMPRESSION_RATIO; mb_lt->full_byte_scan = (mb_lt->scan_step % COMPRESSION_RATIO == 0); if ((mb_lt->hashtable = (Int4*) calloc(mb_lt->hashsize, sizeof(Int4))) == NULL) { MBLookupTableDestruct(mb_lt); return -1; } if ((mb_lt->next_pos = (Int4*) calloc((query_length+1), sizeof(Int4))) == NULL) { MBLookupTableDestruct(mb_lt); return -1; } if (two_templates) { if ((mb_lt->hashtable2 = (Int4*) calloc(mb_lt->hashsize, sizeof(Int4))) == NULL) { MBLookupTableDestruct(mb_lt); return -1; } if ((mb_lt->next_pos2 = (Int4*) calloc((query_length+1), sizeof(Int4))) == NULL) { MBLookupTableDestruct(mb_lt); return -1; } } for (loc = location; loc; loc = loc->next) { /* We want index to be always pointing to the start of the word. Since sequence pointer points to the end of the word, subtract word length from the loop boundaries. */ from = ((SSeqRange*) loc->ptr)->left; to = ((SSeqRange*) loc->ptr)->right - word_length; seq = query->sequence_start + from; pos = seq + word_length; ecode = 0; /* Also add 1 to all indices, because lookup table indices count from 1. */ from -= word_length - 2; last_offset = to - extra_length + 2; amb_cond = TRUE; for (index = from; index <= last_offset; index++) { val = *++seq; if ((val & nuc_mask) != 0) { /* ambiguity or gap */ ecode = 0; pos = seq + word_length; } else { /* get next base */ ecode = ((ecode & mask) << 2) + val; if (seq >= pos) { if (extra_length) { switch (template_type) { case TEMPL_11_18: case TEMPL_12_18: amb_cond = !GET_AMBIG_CONDITION_18(seq); break; case TEMPL_11_18_OPT: case TEMPL_12_18_OPT: amb_cond = !GET_AMBIG_CONDITION_18_OPT(seq); break; case TEMPL_11_21: case TEMPL_12_21: amb_cond = !GET_AMBIG_CONDITION_21(seq); break; case TEMPL_11_21_OPT: case TEMPL_12_21_OPT: amb_cond = !GET_AMBIG_CONDITION_21_OPT(seq); break; default: break; } } if (amb_cond) { switch (template_type) { case TEMPL_11_16: ecode1 = GET_WORD_INDEX_11_16(ecode); break; case TEMPL_12_16: ecode1 = GET_WORD_INDEX_12_16(ecode); break; case TEMPL_11_16_OPT: ecode1 = GET_WORD_INDEX_11_16_OPT(ecode); break; case TEMPL_12_16_OPT: ecode1 = GET_WORD_INDEX_12_16_OPT(ecode); break; case TEMPL_11_18: ecode1 = (GET_WORD_INDEX_11_18(ecode)) | (GET_EXTRA_CODE_18(seq)); break; case TEMPL_12_18: ecode1 = (GET_WORD_INDEX_12_18(ecode)) | (GET_EXTRA_CODE_18(seq)); break; case TEMPL_11_18_OPT: ecode1 = (GET_WORD_INDEX_11_18_OPT(ecode)) | (GET_EXTRA_CODE_18_OPT(seq)); break; case TEMPL_12_18_OPT: ecode1 = (GET_WORD_INDEX_12_18_OPT(ecode)) | (GET_EXTRA_CODE_18_OPT(seq)); break; case TEMPL_11_21: ecode1 = (GET_WORD_INDEX_11_21(ecode)) | (GET_EXTRA_CODE_21(seq)); break; case TEMPL_12_21: ecode1 = (GET_WORD_INDEX_12_21(ecode)) | (GET_EXTRA_CODE_21(seq)); break; case TEMPL_11_21_OPT: ecode1 = (GET_WORD_INDEX_11_21_OPT(ecode)) | (GET_EXTRA_CODE_21_OPT(seq)); break; case TEMPL_12_21_OPT: ecode1 = (GET_WORD_INDEX_12_21_OPT(ecode)) | (GET_EXTRA_CODE_21_OPT(seq)); break; default: /* Contiguous word */ ecode1 = ecode; break; } #ifdef USE_HASH_TABLE hash_buf = (Uint1*)&ecode1; CRC32(crc, hash_buf); ecode1 = (crc>>hash_shift) & hash_mask;#endif if (two_templates) { switch (template_type) { case TEMPL_11_16: ecode2 = GET_WORD_INDEX_11_16_OPT(ecode); break; case TEMPL_12_16: ecode2 = GET_WORD_INDEX_12_16_OPT(ecode); break; case TEMPL_11_18: ecode2 = (GET_WORD_INDEX_11_18_OPT(ecode)) | (GET_EXTRA_CODE_18_OPT(seq)); break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -