seqdbvol.cpp

来自「ncbi源码」· C++ 代码 · 共 1,014 行 · 第 1/2 页

CPP
1,014
字号
        na_data[whole_bytes] = seq_buffer[whole_bytes] & (0xFF - 0x03);    }        seqinst.SetSeq_data().SetNcbi2na().Set().swap(na_data);    seqinst.SetMol(CSeq_inst::eMol_na);}static voids_SeqDBWriteSeqDataNucl(CSeq_inst    & seqinst,                        const char   * seq_buffer,                        Int4           length,                        vector<Int4> & amb_chars){    vector<char> buffer_4na;    s_SeqDBMapNA2ToNA4(seq_buffer, buffer_4na, length); // length is not /4 here    s_SeqDBRebuildDNA_NA4(buffer_4na, amb_chars);        seqinst.SetSeq_data().SetNcbi4na().Set().swap(buffer_4na);    seqinst.SetMol(CSeq_inst::eMol_na);}void s_GetDescrFromDefline(CRef<CBlast_def_line_set> deflines, string & descr){    descr.erase();        string seqid_str;        typedef list< CRef<CBlast_def_line> >::const_iterator TDefIt;     typedef list< CRef<CSeq_id        > >::const_iterator TSeqIt;       const list< CRef<CBlast_def_line> > & dl = deflines->Get();        for(TDefIt iter = dl.begin(); iter != dl.end(); iter++) {        ostringstream oss;                const CBlast_def_line & defline = **iter;                if (! descr.empty()) {            //oss << "\1";            oss << " ";        }                if (defline.CanGetSeqid()) {            const list< CRef<CSeq_id > > & sl = defline.GetSeqid();                        for (TSeqIt seqit = sl.begin(); seqit != sl.end(); seqit++) {                (*seqit)->WriteAsFasta(oss);            }        }                if (defline.CanGetTitle()) {            oss << defline.GetTitle();        }                descr += oss.str();    }}CRef<CBioseq>CSeqDBVol::GetBioseq(Int4 oid,                     bool /*use_objmgr,*/,                     bool /*insert_ctrlA*/) const{    // 1. Declare variables.        CRef<CBioseq> null_result;        // 2. if we can't find the seq_num (get_link), we are done.        //xx  Not relevant - the layer above this would do that before    //xx  This specific 'CSeqDB' object was involved.        // 3. get the descriptor - i.e. sip, defline.  Probably this    // is like the function above that gets asn, but split it up    // more when returning it.        // QUESTIONS: What do we actually want?  The defline & seqid, or    // the defline set and seqid set?  From readdb.c it looks like the    // single items are the prized artifacts.        CRef<CBlast_def_line_set> defline_set(x_GetHdr(oid));    CRef<CBlast_def_line>     defline;    list< CRef< CSeq_id > >   seqids;        {        if (defline_set.Empty() ||            (! defline_set->CanGet())) {            return null_result;        }                    defline = defline_set->Get().front();                if (! defline->CanGetSeqid()) {            // Not sure if this is necessary - if it turns out we can            // handle a null, this should be taken back out.            return null_result;        }                // Do we want the list, or just the first CRef<>??  For now, get        // the whole list.                seqids = defline->GetSeqid();    }        // 4. If insert_ctrlA is FALSE, we convert each "^A" to " >";    // Otherwise, we just copy the defline across. (inline func?)        //xx Ignoring ctrl-A issue for now (as per Tom's insns.)        // 5. Get length & sequence (_get_sequence).  That should be    // mimicked before this.        const char * seq_buffer = 0;        Int4 length = x_GetSequence(oid, & seq_buffer);        if (length < 1) {        return null_result;    }        // 6. If using obj mgr, BioseqNew() is called; otherwise    // MemNew().  --> This is the BioseqPtr, bsp.        //byte_store = BSNew(0);        // 7. A byte store is created (BSNew()).        //xx No allocation is done; instead we just call *Set() etc.        // 8. If protein, we set bsp->mol = Seq_mol_aa, seq_data_type    // = Seq_code_ncbistdaa; then we write the buffer into the    // byte store.        // 9. Nucleotide sequences require more pain, suffering.    // a. Try to get ambchars    // b. If there are any, convert sequence to 4 byte rep.    // c. Otherwise write to a byte store.    // d. Set mol = Seq_mol_na;        // 10. Stick the byte store into the bsp->seq_data        CRef<CBioseq> bioseq(new CBioseq);        CSeq_inst & seqinst = bioseq->SetInst();    //CSeq_data & seqdata = seqinst->SetSeq_data();        bool is_prot = (x_GetSeqType() == kSeqTypeProt);        if (is_prot) {        s_SeqDBWriteSeqDataProt(seqinst, seq_buffer, length);    } else {        // nucl        vector<Int4> ambchars;                if (! x_GetAmbChar(oid, ambchars)) {            return null_result;        }                if (ambchars.empty()) {            // keep as 2 bit            s_SeqDBWriteSeqDataNucl(seqinst, seq_buffer, length);        } else {            // translate to 4 bit            s_SeqDBWriteSeqDataNucl(seqinst, seq_buffer, length, ambchars);        }                // mol = na        seqinst.SetMol(CSeq_inst::eMol_na);    }        if (seq_buffer) {        m_MemPool.Free((void*) seq_buffer);        seq_buffer = 0;    }        // 11. Set the length, id (Seq_id), and repr (== raw).        seqinst.SetLength(length);    seqinst.SetRepr(CSeq_inst::eRepr_raw);    bioseq->SetId().swap(seqids);        // 12. Add the new_defline to the list at bsp->descr if    // non-null, with ValNodeAddString().        // 13. If the format is not text (si), we get the defline and    // chain it onto the bsp->desc list; then we read and append    // taxonomy names to the list (readdb_get_taxonomy_names()).        if (defline_set.NotEmpty()) {        // Convert defline to ... string.                //1. Have a defline.. maybe.  Want to add this as the title.(?)        // A. How to add a description item:                string description;                s_GetDescrFromDefline(defline_set, description);                CRef<CSeqdesc> desc1(new CSeqdesc);        desc1->SetTitle().swap(description);                CSeq_descr & seqdesc = bioseq->SetDescr();        seqdesc.Set().push_back(desc1);    }        // I'm going to let the taxonomy slide for now....        //cerr << "Taxonomy, etc." << endl;    // 14. Return the bsp.        // Everything seems eerily quiet so far, so...        return bioseq;}Int4 CSeqDBVol::GetSequence(Int4 oid, const char ** buffer) const{    return x_GetSequence(oid, buffer);}char * CSeqDBVol::x_AllocType(Uint4 length, ESeqDBAllocType alloc_type) const{    // Specifying an allocation type of zero, uses the memory pool to    // do the allocation.  This is not intended to be visible to the    // end user, so it is not enumerated in seqdbcommon.hpp.        char * retval = 0;        switch(alloc_type) {    case eMalloc:        retval = (char*) malloc(length);        break;        // MemNew is not available yet.//     case eMemNew://         retval = (char*) MemNew(length);//         break;            case eNew:        retval = new char[length];        break;            default:        retval = (char*) m_MemPool.Alloc(length);    }        return retval;}Int4 CSeqDBVol::GetAmbigSeq(Int4            oid,                            char         ** buffer,                            Uint4           nucl_code,                            ESeqDBAllocType alloc_type) const{    char * buf1 = 0;    Int4 baselen = x_GetAmbigSeq(oid, & buf1, nucl_code, alloc_type);        *buffer = buf1;    return baselen;}Int4 CSeqDBVol::x_GetAmbigSeq(Int4            oid,                              char         ** buffer,                              Uint4           nucl_code,                              ESeqDBAllocType alloc_type) const{    Int4 base_length = -1;        if (kSeqTypeProt == m_Idx.GetSeqType()) {        const char * buf2 = 0;                base_length = x_GetSequence(oid, & buf2);                if (alloc_type == ESeqDBAllocType(0)) {            *buffer = (char*) buf2;        } else {            char * obj = x_AllocType(base_length, alloc_type);                        memcpy(obj, buf2, base_length);            m_MemPool.Free((void*) buf2);                        *buffer = obj;        }    } else {        vector<char> buffer_na8;                {            // The code in this block is a few excerpts from            // GetBioseq() and s_SeqDBWriteSeqDataNucl().                        // Get the length and the (probably mmapped) data.                        const char * seq_buffer = 0;                        base_length = x_GetSequence(oid, & seq_buffer);                        if (base_length < 1) {                NCBI_THROW(CSeqDBException, eFileErr,                           "Error: could not get sequence.");            }                        // Get ambiguity characters.                        vector<Int4> ambchars;                        if (! x_GetAmbChar(oid, ambchars) ) {                NCBI_THROW(CSeqDBException, eFileErr,                           "File error: could not get ambiguity data.");            }                        // Combine and translate to 4 bits-per-character encoding.                        bool sentinel = (nucl_code == kSeqDBNuclBlastNA8);                        s_SeqDBMapNA2ToNA8(seq_buffer, buffer_na8, base_length, sentinel);            s_SeqDBRebuildDNA_NA8(buffer_na8, ambchars);                        if (sentinel) {                // Translate bytewise, in place.                s_SeqDBMapNcbiNA8ToBlastNA8(buffer_na8);            }                        // Return probably-mmapped sequence                        m_MemPool.Free((void*) seq_buffer);        }                // NOTE:!! This is a memory leak; this is known, and I am        // fixing it, but the fix involves reorganization of code, and        // I don't want to bundle any more in this checkin. (kmb)                int bytelen = buffer_na8.size();        char * uncomp_buf = x_AllocType(bytelen, alloc_type);                for(int i = 0; i < bytelen; i++) {            uncomp_buf[i] = buffer_na8[i];        }                *buffer = uncomp_buf;    }        return base_length;}Int4 CSeqDBVol::x_GetSequence(Int4 oid, const char ** buffer) const{    Uint4 start_offset = 0;    Uint4 end_offset   = 0;            Int4 length = -1;            if (! m_Idx.GetSeqStartEnd(oid, start_offset, end_offset))        return -1;            char seqtype = m_Idx.GetSeqType();            if (kSeqTypeProt == seqtype) {        // Subtract one, for the inter-sequence null.                        end_offset --;                        length = end_offset - start_offset;                    *buffer = m_Seq.GetRegion(start_offset, end_offset);    } else if (kSeqTypeNucl == seqtype) {        // The last byte is partially full; the last two bits of        // the last byte store the number of nucleotides in the        // last byte (0 to 3).                    *buffer = m_Seq.GetRegion(start_offset, end_offset);                    Int4 whole_bytes = end_offset - start_offset - 1;                    char last_char = (*buffer)[whole_bytes];        Int4 remainder = last_char & 3;        length = (whole_bytes * 4) + remainder;    }            return length;}list< CRef<CSeq_id> > CSeqDBVol::GetSeqIDs(Uint4 oid) const{    list< CRef< CSeq_id > > seqids;        CRef<CBlast_def_line_set> defline_set(x_GetHdr(oid));        if ((! defline_set.Empty()) && defline_set->CanGet()) {        CRef<CBlast_def_line> defline = defline_set->Get().front();                if (defline->CanGetSeqid()) {            seqids = defline->GetSeqid();        }    }        return seqids;}Uint8 CSeqDBVol::GetTotalLength(void) const{    return m_Idx.GetTotalLength();}CRef<CBlast_def_line_set> CSeqDBVol::GetHdr(Uint4 oid) const{    return x_GetHdr(oid);}CRef<CBlast_def_line_set> CSeqDBVol::x_GetHdr(Uint4 oid) const{    CRef<CBlast_def_line_set> nullret;            Uint4 hdr_start = 0;    Uint4 hdr_end   = 0;            if (! m_Idx.GetHdrStartEnd(oid, hdr_start, hdr_end)) {        return nullret;    }        const char * asn_region = m_Hdr.GetRegion(hdr_start, hdr_end);        // Now; create an ASN.1 object from the memory chunk provided    // here.            if (! asn_region) {        return nullret;    }        // Get the data as a string, then as a stringstream, then build    // the actual asn.1 object.  How much 'extra' work is done here?    // Perhaps a dedicated ASN.1 memory-stream object would be of some    // benefit, particularly in the "called 100000 times" cases.        istringstream asndata( string(asn_region, asn_region + (hdr_end - hdr_start)) );        m_MemPool.Free((void*) asn_region);        auto_ptr<CObjectIStream> inpstr(CObjectIStream::Open(eSerial_AsnBinary, asndata));        CRef<objects::CBlast_def_line_set> phil(new objects::CBlast_def_line_set);        *inpstr >> *phil;            return phil;}bool CSeqDBVol::x_GetAmbChar(Uint4 oid, vector<Int4> ambchars) const{    Uint4 start_offset = 0;    Uint4 end_offset   = 0;        bool ok = m_Idx.GetAmbStartEnd(oid, start_offset, end_offset);        if (! ok) {        return false;    }        Int4 length = end_offset - start_offset;        if (0 == length)        return true;        Int4 total = length / 4;        Int4 * buffer = (Int4*) m_Seq.GetRegion(start_offset, start_offset + (total * 4));        // This makes no sense...    total &= 0x7FFFFFFF;        ambchars.resize(total);        for(int i = 0; i<total; i++) {	//ambchars[i] = CByteSwap::GetInt4((const unsigned char *)(& buffer[i]));	ambchars[i] = SeqDB_GetStdOrd((const unsigned char *)(& buffer[i]));    }        m_MemPool.Free((void*) buffer);        return true;}Uint4 CSeqDBVol::GetNumSeqs(void) const{    return m_Idx.GetNumSeqs();}string CSeqDBVol::GetTitle(void) const{    return m_Idx.GetTitle();}string CSeqDBVol::GetDate(void) const{    return m_Idx.GetDate();}Uint4 CSeqDBVol::GetMaxLength(void) const{    return m_Idx.GetMaxLength();}END_NCBI_SCOPE

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?