seqdbvol.cpp

来自「ncbi源码」· C++ 代码 · 共 1,014 行 · 第 1/2 页

CPP
1,014
字号
/* * =========================================================================== * PRODUCTION $Log: seqdbvol.cpp,v $ * PRODUCTION Revision 1000.1  2004/06/01 19:46:54  gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.15 * PRODUCTION * =========================================================================== *//*  $Id: seqdbvol.cpp,v 1000.1 2004/06/01 19:46:54 gouriano Exp $ * =========================================================================== * *                            PUBLIC DOMAIN NOTICE *               National Center for Biotechnology Information * *  This software/database is a "United States Government Work" under the *  terms of the United States Copyright Act.  It was written as part of *  the author's official duties as a United States Government employee and *  thus cannot be copyrighted.  This software/database is freely available *  to the public for use. The National Library of Medicine and the U.S. *  Government have not placed any restriction on its use or reproduction. * *  Although all reasonable efforts have been taken to ensure the accuracy *  and reliability of the software and data, the NLM and the U.S. *  Government do not and cannot warrant the performance or results that *  may be obtained by using this software or data. The NLM and the U.S. *  Government disclaim all warranties, express or implied, including *  warranties of performance, merchantability or fitness for any particular *  purpose. * *  Please cite the author in any work or product based on this material. * * =========================================================================== * * Author:  Kevin Bealer * */#include <ncbi_pch.hpp>#include "seqdbvol.hpp"BEGIN_NCBI_SCOPEchar CSeqDBVol::GetSeqType(void) const{    return x_GetSeqType();}char CSeqDBVol::x_GetSeqType(void) const{    return m_Idx.GetSeqType();}Int4 CSeqDBVol::GetSeqLength(Uint4 oid, bool approx) const{    Uint4 start_offset = 0;    Uint4 end_offset   = 0;        Int4 length = -1;        if (! m_Idx.GetSeqStartEnd(oid, start_offset, end_offset))        return -1;        char seqtype = m_Idx.GetSeqType();        if (kSeqTypeProt == seqtype) {        // Subtract one, for the inter-sequence null.        length = end_offset - start_offset - 1;    } else if (kSeqTypeNucl == seqtype) {        Int4 whole_bytes = end_offset - start_offset - 1;                if (approx) {            // Same principle as below - but use lower bits of oid            // instead of fetching the actual last byte.  this should            // correct for bias, unless sequence length modulo 4 has a            // significant statistical bias, which seems unlikely to            // me.                        length = (whole_bytes * 4) + (oid & 0x03);        } else {            // The last byte is partially full; the last two bits of            // the last byte store the number of nucleotides in the            // last byte (0 to 3).                        char amb_char = 0;                        m_Seq.ReadBytes(& amb_char, end_offset - 1, end_offset);                        Int4 remainder = amb_char & 3;            length = (whole_bytes * 4) + remainder;        }    }            return length;}static CFastMutex s_MapNaMutex;static vector<Uint1>s_SeqDBMapNA2ToNA4Setup(void){    vector<Uint1> translated;    translated.resize(512);        Uint1 convert[16] = {17,  18,  20,  24,  33,  34,  36,  40,                         65,  66,  68,  72, 129, 130, 132, 136};    Int2 pair1 = 0;    Int2 pair2 = 0;        for(pair1 = 0; pair1 < 16; pair1++) {        for(pair2 = 0; pair2 < 16; pair2++) {            Int2 index = (pair1 * 16 + pair2) * 2;                        translated[index]   = convert[pair1];            translated[index+1] = convert[pair2];        }    }        return translated;}static voids_SeqDBMapNA2ToNA4(const char   * buf2bit,                   vector<char> & buf4bit,                   int            base_length){    static vector<Uint1> expanded = s_SeqDBMapNA2ToNA4Setup();        Uint4 estimated_length = (base_length + 1)/2;    buf4bit.reserve(estimated_length);        int inp_chars = base_length/4;        for(int i=0; i<inp_chars; i++) {        Uint4 inp_char = (buf2bit[i] & 0xFF);                buf4bit.push_back(expanded[ (inp_char*2)     ]);        buf4bit.push_back(expanded[ (inp_char*2) + 1 ]);    }        int bases_remain = base_length - (inp_chars*4);        if (bases_remain) {        Uint1 remainder_bits = 2 * bases_remain;        Uint1 remainder_mask = (0xFF << (8 - remainder_bits)) & 0xFF;        Uint4 last_masked = buf2bit[inp_chars] & remainder_mask;                buf4bit.push_back(expanded[ (last_masked*2) ]);                if (bases_remain > 2) {            buf4bit.push_back(expanded[ (last_masked*2)+1 ]);        }    }        _ASSERT(estimated_length == buf4bit.size());}static vector<Uint1>s_SeqDBMapNA2ToNA8Setup(void){    // Builds a table; each two bit slice holds 0,1,2 or 3.  These are    // converted to whole bytes containing 1,2,4, or 8, respectively.        vector<Uint1> translated;    translated.reserve(1024);        for(Uint4 i = 0; i<256; i++) {        Uint4 p1 = (i >> 6) & 0x3;        Uint4 p2 = (i >> 4) & 0x3;        Uint4 p3 = (i >> 2) & 0x3;        Uint4 p4 = i & 0x3;                translated.push_back(1 << p1);        translated.push_back(1 << p2);        translated.push_back(1 << p3);        translated.push_back(1 << p4);    }        return translated;}static voids_SeqDBMapNA2ToNA8(const char   * buf2bit,                   vector<char> & buf8bit,                   Uint4          base_length,                   bool           sentinel_bytes){    static vector<Uint1> expanded = s_SeqDBMapNA2ToNA8Setup();        int sreserve = 0;        if (sentinel_bytes) {        sreserve = 2;        buf8bit.push_back((unsigned char)(0));    }        buf8bit.reserve(base_length + sreserve);        int whole_input_chars = base_length/4;        // In a nucleotide search, this loop is probably a noticeable time    // consumer, at least relative to the CSeqDB universe.  Each input    // byte is used to look up a 4 byte output translation.  That four    // byte section is copied to the output vector.  By pre-processing    // the arithmetic in the ~Setup() function, we can just pull bytes    // from a vector.        for(int i = 0; i < whole_input_chars; i++) {        Uint4 table_offset = (buf2bit[i] & 0xFF) * 4;                buf8bit.push_back(expanded[ table_offset ]);        buf8bit.push_back(expanded[ table_offset + 1 ]);        buf8bit.push_back(expanded[ table_offset + 2 ]);        buf8bit.push_back(expanded[ table_offset + 3 ]);    }        int bases_remain = base_length & 0x3;        if (bases_remain) {        // We use the last byte of the input to look up a four byte        // translation; but we only use the bytes of it that we need.                Uint4 table_offset = (buf2bit[whole_input_chars] & 0xFF) * 4;                while(bases_remain--) {            buf8bit.push_back(expanded[ ++table_offset ]);        }    }        if (sentinel_bytes) {        buf8bit.push_back((unsigned char)(0));    }        _ASSERT(base_length == (buf8bit.size() - sreserve));}#if 0static vector<Uint1>s_SeqDBMapNcbiNA4ToBlastNA4Setup(void){    vector<Uint1> translated;    translated.resize(256);        // This mapping stolen., er, courtesy of blastkar.c.        Uint1 trans_ncbi4na_to_blastna[] = {        15, /* Gap, 0 */        0,  /* A,   1 */        1,  /* C,   2 */        6,  /* M,   3 */        2,  /* G,   4 */        4,  /* R,   5 */        9,  /* S,   6 */        13, /* V,   7 */        3,  /* T,   8 */        8,  /* W,   9 */        5,  /* Y,  10 */        12, /* H,  11 */        7,  /* K,  12 */        11, /* D,  13 */        10, /* B,  14 */        14  /* N,  15 */    };        for(int i = 0; i<256; i++) {        int a1 = (i >> 4) & 0xF;        int a2 = i & 0xF;                int b1 = trans_ncbi4na_to_blastna[a1];        int b2 = trans_ncbi4na_to_blastna[a2];                translated[i] = (b1 << 4) | b2;    }        return translated;}static voids_SeqDBMapNcbiNA4ToBlastNA4(vector<char> & buf){    static vector<Uint1> trans = s_SeqDBMapNcbiNA4ToBlastNA4Setup();        for(Uint4 i = 0; i < buf.size(); i++) {        buf[i] = trans[ unsigned(buf[i]) & 0xFF ];    }}#endifstatic voids_SeqDBMapNcbiNA8ToBlastNA8(vector<char> & buf){    Uint1 trans_ncbina8_to_blastna8[] = {        15, /* Gap, 0 */        0,  /* A,   1 */        1,  /* C,   2 */        6,  /* M,   3 */        2,  /* G,   4 */        4,  /* R,   5 */        9,  /* S,   6 */        13, /* V,   7 */        3,  /* T,   8 */        8,  /* W,   9 */        5,  /* Y,  10 */        12, /* H,  11 */        7,  /* K,  12 */        11, /* D,  13 */        10, /* B,  14 */        14  /* N,  15 */    };        for(Uint4 i = 0; i < buf.size(); i++) {        buf[i] = trans_ncbina8_to_blastna8[ buf[i] ];    }}//--------------------// NEW (long) version//--------------------inline Uint4 s_ResLenNew(const vector<Int4> & ambchars, Uint4 i){    return (ambchars[i] >> 16) & 0xFFF;}inline Uint4 s_ResPosNew(const vector<Int4> & ambchars, Uint4 i){    return ambchars[i+1];}//-----------------------// OLD (compact) version//-----------------------inline Uint4 s_ResVal(const vector<Int4> & ambchars, Uint4 i){    return (ambchars[i] >> 28);}inline Uint4 s_ResLenOld(const vector<Int4> & ambchars, Uint4 i){    return (ambchars[i] >> 24) & 0xF;}inline Uint4 s_ResPosOld(const vector<Int4> & ambchars, Uint4 i){    return ambchars[i];}static bools_SeqDBRebuildDNA_NA4(vector<char>       & buf4bit,                      const vector<Int4> & amb_chars){    if (buf4bit.empty())        return false;        if (amb_chars.empty())         return true;        Uint4 amb_num = amb_chars[0];        /* Check if highest order bit set. */    bool new_format = (amb_num & 0x80000000) != 0;        if (new_format) {	amb_num &= 0x7FFFFFFF;    }        for(Uint4 i=1; i < amb_num+1; i++) {        Int4  row_len  = 0;        Int4  position = 0;        Uint1 char_r   = 0;        	if (new_format) {            char_r    = s_ResVal   (amb_chars, i);            row_len   = s_ResLenNew(amb_chars, i);             position  = s_ResPosNew(amb_chars, i);	} else {            char_r    = s_ResVal   (amb_chars, i);            row_len   = s_ResLenOld(amb_chars, i);             position  = s_ResPosOld(amb_chars, i);	}                Int4  pos = position / 2;        Int4  rem = position & 1;  /* 0 or 1 */        Uint1 char_l = char_r << 4;                Int4 j;        Int4 index = pos;                // This could be made slightly faster for long runs.                for(j = 0; j <= row_len; j++) {            if (!rem) {           	buf4bit[index] = (buf4bit[index] & 0x0F) + char_l;            	rem = 1;            } else {           	buf4bit[index] = (buf4bit[index] & 0xF0) + char_r;            	rem = 0;                index++;            }    	}        	if (new_format) /* for new format we have 8 bytes for each element. */            i++;    }        return true;}static bools_SeqDBRebuildDNA_NA8(vector<char>       & buf4bit,                      const vector<Int4> & amb_chars){    if (buf4bit.empty())        return false;        if (amb_chars.empty())         return true;        Uint4 amb_num = amb_chars[0];        /* Check if highest order bit set. */    bool new_format = (amb_num & 0x80000000) != 0;        if (new_format) {	amb_num &= 0x7FFFFFFF;    }        for(Uint4 i = 1; i < amb_num+1; i++) {        Int4  row_len  = 0;        Int4  position = 0;        Uint1 trans_ch = 0;        	if (new_format) {            trans_ch  = s_ResVal   (amb_chars, i);            row_len   = s_ResLenNew(amb_chars, i);             position  = s_ResPosNew(amb_chars, i);	} else {            trans_ch  = s_ResVal   (amb_chars, i);            row_len   = s_ResLenOld(amb_chars, i);             position  = s_ResPosOld(amb_chars, i);	}                Int4 index = position;                // This could be made slightly faster for long runs.                for(Int4 j = 0; j <= row_len; j++) {            buf4bit[index++] = trans_ch;//             if (!rem) {//            	buf4bit[index] = (buf4bit[index] & 0x0F) + char_l;//             	rem = 1;//             } else {//            	buf4bit[index] = (buf4bit[index] & 0xF0) + char_r;//             	rem = 0;//                 index++;//             }    	}        	if (new_format) /* for new format we have 8 bytes for each element. */            i++;    }        return true;}static voids_SeqDBWriteSeqDataProt(CSeq_inst  & seqinst,                        const char * seq_buffer,                        Int4         length){    // stuff - ncbistdaa    // mol = aa            // This possibly/probably copies several times.    // 1. One copy into stdaa_data.    // 2. Second copy into NCBIstdaa.    // 3. Third copy into seqdata.        vector<char> aa_data;    aa_data.resize(length);        for(int i = 0; i < length; i++) {        aa_data[i] = seq_buffer[i];    }        seqinst.SetSeq_data().SetNcbistdaa().Set().swap(aa_data);    seqinst.SetMol(CSeq_inst::eMol_aa);}static voids_SeqDBWriteSeqDataNucl(CSeq_inst    & seqinst,                        const char   * seq_buffer,                        Int4           length){    Int4 whole_bytes  = length / 4;    Int4 partial_byte = ((length & 0x3) != 0) ? 1 : 0;        vector<char> na_data;    na_data.resize(whole_bytes + partial_byte);        for(int i = 0; i<whole_bytes; i++) {        na_data[i] = seq_buffer[i];    }        if (partial_byte) {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?