seqdbvol.cpp
来自「ncbi源码」· C++ 代码 · 共 1,014 行 · 第 1/2 页
CPP
1,014 行
/* * =========================================================================== * PRODUCTION $Log: seqdbvol.cpp,v $ * PRODUCTION Revision 1000.1 2004/06/01 19:46:54 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.15 * PRODUCTION * =========================================================================== *//* $Id: seqdbvol.cpp,v 1000.1 2004/06/01 19:46:54 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Author: Kevin Bealer * */#include <ncbi_pch.hpp>#include "seqdbvol.hpp"BEGIN_NCBI_SCOPEchar CSeqDBVol::GetSeqType(void) const{ return x_GetSeqType();}char CSeqDBVol::x_GetSeqType(void) const{ return m_Idx.GetSeqType();}Int4 CSeqDBVol::GetSeqLength(Uint4 oid, bool approx) const{ Uint4 start_offset = 0; Uint4 end_offset = 0; Int4 length = -1; if (! m_Idx.GetSeqStartEnd(oid, start_offset, end_offset)) return -1; char seqtype = m_Idx.GetSeqType(); if (kSeqTypeProt == seqtype) { // Subtract one, for the inter-sequence null. length = end_offset - start_offset - 1; } else if (kSeqTypeNucl == seqtype) { Int4 whole_bytes = end_offset - start_offset - 1; if (approx) { // Same principle as below - but use lower bits of oid // instead of fetching the actual last byte. this should // correct for bias, unless sequence length modulo 4 has a // significant statistical bias, which seems unlikely to // me. length = (whole_bytes * 4) + (oid & 0x03); } else { // The last byte is partially full; the last two bits of // the last byte store the number of nucleotides in the // last byte (0 to 3). char amb_char = 0; m_Seq.ReadBytes(& amb_char, end_offset - 1, end_offset); Int4 remainder = amb_char & 3; length = (whole_bytes * 4) + remainder; } } return length;}static CFastMutex s_MapNaMutex;static vector<Uint1>s_SeqDBMapNA2ToNA4Setup(void){ vector<Uint1> translated; translated.resize(512); Uint1 convert[16] = {17, 18, 20, 24, 33, 34, 36, 40, 65, 66, 68, 72, 129, 130, 132, 136}; Int2 pair1 = 0; Int2 pair2 = 0; for(pair1 = 0; pair1 < 16; pair1++) { for(pair2 = 0; pair2 < 16; pair2++) { Int2 index = (pair1 * 16 + pair2) * 2; translated[index] = convert[pair1]; translated[index+1] = convert[pair2]; } } return translated;}static voids_SeqDBMapNA2ToNA4(const char * buf2bit, vector<char> & buf4bit, int base_length){ static vector<Uint1> expanded = s_SeqDBMapNA2ToNA4Setup(); Uint4 estimated_length = (base_length + 1)/2; buf4bit.reserve(estimated_length); int inp_chars = base_length/4; for(int i=0; i<inp_chars; i++) { Uint4 inp_char = (buf2bit[i] & 0xFF); buf4bit.push_back(expanded[ (inp_char*2) ]); buf4bit.push_back(expanded[ (inp_char*2) + 1 ]); } int bases_remain = base_length - (inp_chars*4); if (bases_remain) { Uint1 remainder_bits = 2 * bases_remain; Uint1 remainder_mask = (0xFF << (8 - remainder_bits)) & 0xFF; Uint4 last_masked = buf2bit[inp_chars] & remainder_mask; buf4bit.push_back(expanded[ (last_masked*2) ]); if (bases_remain > 2) { buf4bit.push_back(expanded[ (last_masked*2)+1 ]); } } _ASSERT(estimated_length == buf4bit.size());}static vector<Uint1>s_SeqDBMapNA2ToNA8Setup(void){ // Builds a table; each two bit slice holds 0,1,2 or 3. These are // converted to whole bytes containing 1,2,4, or 8, respectively. vector<Uint1> translated; translated.reserve(1024); for(Uint4 i = 0; i<256; i++) { Uint4 p1 = (i >> 6) & 0x3; Uint4 p2 = (i >> 4) & 0x3; Uint4 p3 = (i >> 2) & 0x3; Uint4 p4 = i & 0x3; translated.push_back(1 << p1); translated.push_back(1 << p2); translated.push_back(1 << p3); translated.push_back(1 << p4); } return translated;}static voids_SeqDBMapNA2ToNA8(const char * buf2bit, vector<char> & buf8bit, Uint4 base_length, bool sentinel_bytes){ static vector<Uint1> expanded = s_SeqDBMapNA2ToNA8Setup(); int sreserve = 0; if (sentinel_bytes) { sreserve = 2; buf8bit.push_back((unsigned char)(0)); } buf8bit.reserve(base_length + sreserve); int whole_input_chars = base_length/4; // In a nucleotide search, this loop is probably a noticeable time // consumer, at least relative to the CSeqDB universe. Each input // byte is used to look up a 4 byte output translation. That four // byte section is copied to the output vector. By pre-processing // the arithmetic in the ~Setup() function, we can just pull bytes // from a vector. for(int i = 0; i < whole_input_chars; i++) { Uint4 table_offset = (buf2bit[i] & 0xFF) * 4; buf8bit.push_back(expanded[ table_offset ]); buf8bit.push_back(expanded[ table_offset + 1 ]); buf8bit.push_back(expanded[ table_offset + 2 ]); buf8bit.push_back(expanded[ table_offset + 3 ]); } int bases_remain = base_length & 0x3; if (bases_remain) { // We use the last byte of the input to look up a four byte // translation; but we only use the bytes of it that we need. Uint4 table_offset = (buf2bit[whole_input_chars] & 0xFF) * 4; while(bases_remain--) { buf8bit.push_back(expanded[ ++table_offset ]); } } if (sentinel_bytes) { buf8bit.push_back((unsigned char)(0)); } _ASSERT(base_length == (buf8bit.size() - sreserve));}#if 0static vector<Uint1>s_SeqDBMapNcbiNA4ToBlastNA4Setup(void){ vector<Uint1> translated; translated.resize(256); // This mapping stolen., er, courtesy of blastkar.c. Uint1 trans_ncbi4na_to_blastna[] = { 15, /* Gap, 0 */ 0, /* A, 1 */ 1, /* C, 2 */ 6, /* M, 3 */ 2, /* G, 4 */ 4, /* R, 5 */ 9, /* S, 6 */ 13, /* V, 7 */ 3, /* T, 8 */ 8, /* W, 9 */ 5, /* Y, 10 */ 12, /* H, 11 */ 7, /* K, 12 */ 11, /* D, 13 */ 10, /* B, 14 */ 14 /* N, 15 */ }; for(int i = 0; i<256; i++) { int a1 = (i >> 4) & 0xF; int a2 = i & 0xF; int b1 = trans_ncbi4na_to_blastna[a1]; int b2 = trans_ncbi4na_to_blastna[a2]; translated[i] = (b1 << 4) | b2; } return translated;}static voids_SeqDBMapNcbiNA4ToBlastNA4(vector<char> & buf){ static vector<Uint1> trans = s_SeqDBMapNcbiNA4ToBlastNA4Setup(); for(Uint4 i = 0; i < buf.size(); i++) { buf[i] = trans[ unsigned(buf[i]) & 0xFF ]; }}#endifstatic voids_SeqDBMapNcbiNA8ToBlastNA8(vector<char> & buf){ Uint1 trans_ncbina8_to_blastna8[] = { 15, /* Gap, 0 */ 0, /* A, 1 */ 1, /* C, 2 */ 6, /* M, 3 */ 2, /* G, 4 */ 4, /* R, 5 */ 9, /* S, 6 */ 13, /* V, 7 */ 3, /* T, 8 */ 8, /* W, 9 */ 5, /* Y, 10 */ 12, /* H, 11 */ 7, /* K, 12 */ 11, /* D, 13 */ 10, /* B, 14 */ 14 /* N, 15 */ }; for(Uint4 i = 0; i < buf.size(); i++) { buf[i] = trans_ncbina8_to_blastna8[ buf[i] ]; }}//--------------------// NEW (long) version//--------------------inline Uint4 s_ResLenNew(const vector<Int4> & ambchars, Uint4 i){ return (ambchars[i] >> 16) & 0xFFF;}inline Uint4 s_ResPosNew(const vector<Int4> & ambchars, Uint4 i){ return ambchars[i+1];}//-----------------------// OLD (compact) version//-----------------------inline Uint4 s_ResVal(const vector<Int4> & ambchars, Uint4 i){ return (ambchars[i] >> 28);}inline Uint4 s_ResLenOld(const vector<Int4> & ambchars, Uint4 i){ return (ambchars[i] >> 24) & 0xF;}inline Uint4 s_ResPosOld(const vector<Int4> & ambchars, Uint4 i){ return ambchars[i];}static bools_SeqDBRebuildDNA_NA4(vector<char> & buf4bit, const vector<Int4> & amb_chars){ if (buf4bit.empty()) return false; if (amb_chars.empty()) return true; Uint4 amb_num = amb_chars[0]; /* Check if highest order bit set. */ bool new_format = (amb_num & 0x80000000) != 0; if (new_format) { amb_num &= 0x7FFFFFFF; } for(Uint4 i=1; i < amb_num+1; i++) { Int4 row_len = 0; Int4 position = 0; Uint1 char_r = 0; if (new_format) { char_r = s_ResVal (amb_chars, i); row_len = s_ResLenNew(amb_chars, i); position = s_ResPosNew(amb_chars, i); } else { char_r = s_ResVal (amb_chars, i); row_len = s_ResLenOld(amb_chars, i); position = s_ResPosOld(amb_chars, i); } Int4 pos = position / 2; Int4 rem = position & 1; /* 0 or 1 */ Uint1 char_l = char_r << 4; Int4 j; Int4 index = pos; // This could be made slightly faster for long runs. for(j = 0; j <= row_len; j++) { if (!rem) { buf4bit[index] = (buf4bit[index] & 0x0F) + char_l; rem = 1; } else { buf4bit[index] = (buf4bit[index] & 0xF0) + char_r; rem = 0; index++; } } if (new_format) /* for new format we have 8 bytes for each element. */ i++; } return true;}static bools_SeqDBRebuildDNA_NA8(vector<char> & buf4bit, const vector<Int4> & amb_chars){ if (buf4bit.empty()) return false; if (amb_chars.empty()) return true; Uint4 amb_num = amb_chars[0]; /* Check if highest order bit set. */ bool new_format = (amb_num & 0x80000000) != 0; if (new_format) { amb_num &= 0x7FFFFFFF; } for(Uint4 i = 1; i < amb_num+1; i++) { Int4 row_len = 0; Int4 position = 0; Uint1 trans_ch = 0; if (new_format) { trans_ch = s_ResVal (amb_chars, i); row_len = s_ResLenNew(amb_chars, i); position = s_ResPosNew(amb_chars, i); } else { trans_ch = s_ResVal (amb_chars, i); row_len = s_ResLenOld(amb_chars, i); position = s_ResPosOld(amb_chars, i); } Int4 index = position; // This could be made slightly faster for long runs. for(Int4 j = 0; j <= row_len; j++) { buf4bit[index++] = trans_ch;// if (!rem) {// buf4bit[index] = (buf4bit[index] & 0x0F) + char_l;// rem = 1;// } else {// buf4bit[index] = (buf4bit[index] & 0xF0) + char_r;// rem = 0;// index++;// } } if (new_format) /* for new format we have 8 bytes for each element. */ i++; } return true;}static voids_SeqDBWriteSeqDataProt(CSeq_inst & seqinst, const char * seq_buffer, Int4 length){ // stuff - ncbistdaa // mol = aa // This possibly/probably copies several times. // 1. One copy into stdaa_data. // 2. Second copy into NCBIstdaa. // 3. Third copy into seqdata. vector<char> aa_data; aa_data.resize(length); for(int i = 0; i < length; i++) { aa_data[i] = seq_buffer[i]; } seqinst.SetSeq_data().SetNcbistdaa().Set().swap(aa_data); seqinst.SetMol(CSeq_inst::eMol_aa);}static voids_SeqDBWriteSeqDataNucl(CSeq_inst & seqinst, const char * seq_buffer, Int4 length){ Int4 whole_bytes = length / 4; Int4 partial_byte = ((length & 0x3) != 0) ? 1 : 0; vector<char> na_data; na_data.resize(whole_bytes + partial_byte); for(int i = 0; i<whole_bytes; i++) { na_data[i] = seq_buffer[i]; } if (partial_byte) {
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?