seqdbvol.cpp
来自「ncbi源码」· C++ 代码 · 共 1,014 行 · 第 1/2 页
CPP
1,014 行
na_data[whole_bytes] = seq_buffer[whole_bytes] & (0xFF - 0x03); } seqinst.SetSeq_data().SetNcbi2na().Set().swap(na_data); seqinst.SetMol(CSeq_inst::eMol_na);}static voids_SeqDBWriteSeqDataNucl(CSeq_inst & seqinst, const char * seq_buffer, Int4 length, vector<Int4> & amb_chars){ vector<char> buffer_4na; s_SeqDBMapNA2ToNA4(seq_buffer, buffer_4na, length); // length is not /4 here s_SeqDBRebuildDNA_NA4(buffer_4na, amb_chars); seqinst.SetSeq_data().SetNcbi4na().Set().swap(buffer_4na); seqinst.SetMol(CSeq_inst::eMol_na);}void s_GetDescrFromDefline(CRef<CBlast_def_line_set> deflines, string & descr){ descr.erase(); string seqid_str; typedef list< CRef<CBlast_def_line> >::const_iterator TDefIt; typedef list< CRef<CSeq_id > >::const_iterator TSeqIt; const list< CRef<CBlast_def_line> > & dl = deflines->Get(); for(TDefIt iter = dl.begin(); iter != dl.end(); iter++) { ostringstream oss; const CBlast_def_line & defline = **iter; if (! descr.empty()) { //oss << "\1"; oss << " "; } if (defline.CanGetSeqid()) { const list< CRef<CSeq_id > > & sl = defline.GetSeqid(); for (TSeqIt seqit = sl.begin(); seqit != sl.end(); seqit++) { (*seqit)->WriteAsFasta(oss); } } if (defline.CanGetTitle()) { oss << defline.GetTitle(); } descr += oss.str(); }}CRef<CBioseq>CSeqDBVol::GetBioseq(Int4 oid, bool /*use_objmgr,*/, bool /*insert_ctrlA*/) const{ // 1. Declare variables. CRef<CBioseq> null_result; // 2. if we can't find the seq_num (get_link), we are done. //xx Not relevant - the layer above this would do that before //xx This specific 'CSeqDB' object was involved. // 3. get the descriptor - i.e. sip, defline. Probably this // is like the function above that gets asn, but split it up // more when returning it. // QUESTIONS: What do we actually want? The defline & seqid, or // the defline set and seqid set? From readdb.c it looks like the // single items are the prized artifacts. CRef<CBlast_def_line_set> defline_set(x_GetHdr(oid)); CRef<CBlast_def_line> defline; list< CRef< CSeq_id > > seqids; { if (defline_set.Empty() || (! defline_set->CanGet())) { return null_result; } defline = defline_set->Get().front(); if (! defline->CanGetSeqid()) { // Not sure if this is necessary - if it turns out we can // handle a null, this should be taken back out. return null_result; } // Do we want the list, or just the first CRef<>?? For now, get // the whole list. seqids = defline->GetSeqid(); } // 4. If insert_ctrlA is FALSE, we convert each "^A" to " >"; // Otherwise, we just copy the defline across. (inline func?) //xx Ignoring ctrl-A issue for now (as per Tom's insns.) // 5. Get length & sequence (_get_sequence). That should be // mimicked before this. const char * seq_buffer = 0; Int4 length = x_GetSequence(oid, & seq_buffer); if (length < 1) { return null_result; } // 6. If using obj mgr, BioseqNew() is called; otherwise // MemNew(). --> This is the BioseqPtr, bsp. //byte_store = BSNew(0); // 7. A byte store is created (BSNew()). //xx No allocation is done; instead we just call *Set() etc. // 8. If protein, we set bsp->mol = Seq_mol_aa, seq_data_type // = Seq_code_ncbistdaa; then we write the buffer into the // byte store. // 9. Nucleotide sequences require more pain, suffering. // a. Try to get ambchars // b. If there are any, convert sequence to 4 byte rep. // c. Otherwise write to a byte store. // d. Set mol = Seq_mol_na; // 10. Stick the byte store into the bsp->seq_data CRef<CBioseq> bioseq(new CBioseq); CSeq_inst & seqinst = bioseq->SetInst(); //CSeq_data & seqdata = seqinst->SetSeq_data(); bool is_prot = (x_GetSeqType() == kSeqTypeProt); if (is_prot) { s_SeqDBWriteSeqDataProt(seqinst, seq_buffer, length); } else { // nucl vector<Int4> ambchars; if (! x_GetAmbChar(oid, ambchars)) { return null_result; } if (ambchars.empty()) { // keep as 2 bit s_SeqDBWriteSeqDataNucl(seqinst, seq_buffer, length); } else { // translate to 4 bit s_SeqDBWriteSeqDataNucl(seqinst, seq_buffer, length, ambchars); } // mol = na seqinst.SetMol(CSeq_inst::eMol_na); } if (seq_buffer) { m_MemPool.Free((void*) seq_buffer); seq_buffer = 0; } // 11. Set the length, id (Seq_id), and repr (== raw). seqinst.SetLength(length); seqinst.SetRepr(CSeq_inst::eRepr_raw); bioseq->SetId().swap(seqids); // 12. Add the new_defline to the list at bsp->descr if // non-null, with ValNodeAddString(). // 13. If the format is not text (si), we get the defline and // chain it onto the bsp->desc list; then we read and append // taxonomy names to the list (readdb_get_taxonomy_names()). if (defline_set.NotEmpty()) { // Convert defline to ... string. //1. Have a defline.. maybe. Want to add this as the title.(?) // A. How to add a description item: string description; s_GetDescrFromDefline(defline_set, description); CRef<CSeqdesc> desc1(new CSeqdesc); desc1->SetTitle().swap(description); CSeq_descr & seqdesc = bioseq->SetDescr(); seqdesc.Set().push_back(desc1); } // I'm going to let the taxonomy slide for now.... //cerr << "Taxonomy, etc." << endl; // 14. Return the bsp. // Everything seems eerily quiet so far, so... return bioseq;}Int4 CSeqDBVol::GetSequence(Int4 oid, const char ** buffer) const{ return x_GetSequence(oid, buffer);}char * CSeqDBVol::x_AllocType(Uint4 length, ESeqDBAllocType alloc_type) const{ // Specifying an allocation type of zero, uses the memory pool to // do the allocation. This is not intended to be visible to the // end user, so it is not enumerated in seqdbcommon.hpp. char * retval = 0; switch(alloc_type) { case eMalloc: retval = (char*) malloc(length); break; // MemNew is not available yet.// case eMemNew:// retval = (char*) MemNew(length);// break; case eNew: retval = new char[length]; break; default: retval = (char*) m_MemPool.Alloc(length); } return retval;}Int4 CSeqDBVol::GetAmbigSeq(Int4 oid, char ** buffer, Uint4 nucl_code, ESeqDBAllocType alloc_type) const{ char * buf1 = 0; Int4 baselen = x_GetAmbigSeq(oid, & buf1, nucl_code, alloc_type); *buffer = buf1; return baselen;}Int4 CSeqDBVol::x_GetAmbigSeq(Int4 oid, char ** buffer, Uint4 nucl_code, ESeqDBAllocType alloc_type) const{ Int4 base_length = -1; if (kSeqTypeProt == m_Idx.GetSeqType()) { const char * buf2 = 0; base_length = x_GetSequence(oid, & buf2); if (alloc_type == ESeqDBAllocType(0)) { *buffer = (char*) buf2; } else { char * obj = x_AllocType(base_length, alloc_type); memcpy(obj, buf2, base_length); m_MemPool.Free((void*) buf2); *buffer = obj; } } else { vector<char> buffer_na8; { // The code in this block is a few excerpts from // GetBioseq() and s_SeqDBWriteSeqDataNucl(). // Get the length and the (probably mmapped) data. const char * seq_buffer = 0; base_length = x_GetSequence(oid, & seq_buffer); if (base_length < 1) { NCBI_THROW(CSeqDBException, eFileErr, "Error: could not get sequence."); } // Get ambiguity characters. vector<Int4> ambchars; if (! x_GetAmbChar(oid, ambchars) ) { NCBI_THROW(CSeqDBException, eFileErr, "File error: could not get ambiguity data."); } // Combine and translate to 4 bits-per-character encoding. bool sentinel = (nucl_code == kSeqDBNuclBlastNA8); s_SeqDBMapNA2ToNA8(seq_buffer, buffer_na8, base_length, sentinel); s_SeqDBRebuildDNA_NA8(buffer_na8, ambchars); if (sentinel) { // Translate bytewise, in place. s_SeqDBMapNcbiNA8ToBlastNA8(buffer_na8); } // Return probably-mmapped sequence m_MemPool.Free((void*) seq_buffer); } // NOTE:!! This is a memory leak; this is known, and I am // fixing it, but the fix involves reorganization of code, and // I don't want to bundle any more in this checkin. (kmb) int bytelen = buffer_na8.size(); char * uncomp_buf = x_AllocType(bytelen, alloc_type); for(int i = 0; i < bytelen; i++) { uncomp_buf[i] = buffer_na8[i]; } *buffer = uncomp_buf; } return base_length;}Int4 CSeqDBVol::x_GetSequence(Int4 oid, const char ** buffer) const{ Uint4 start_offset = 0; Uint4 end_offset = 0; Int4 length = -1; if (! m_Idx.GetSeqStartEnd(oid, start_offset, end_offset)) return -1; char seqtype = m_Idx.GetSeqType(); if (kSeqTypeProt == seqtype) { // Subtract one, for the inter-sequence null. end_offset --; length = end_offset - start_offset; *buffer = m_Seq.GetRegion(start_offset, end_offset); } else if (kSeqTypeNucl == seqtype) { // The last byte is partially full; the last two bits of // the last byte store the number of nucleotides in the // last byte (0 to 3). *buffer = m_Seq.GetRegion(start_offset, end_offset); Int4 whole_bytes = end_offset - start_offset - 1; char last_char = (*buffer)[whole_bytes]; Int4 remainder = last_char & 3; length = (whole_bytes * 4) + remainder; } return length;}list< CRef<CSeq_id> > CSeqDBVol::GetSeqIDs(Uint4 oid) const{ list< CRef< CSeq_id > > seqids; CRef<CBlast_def_line_set> defline_set(x_GetHdr(oid)); if ((! defline_set.Empty()) && defline_set->CanGet()) { CRef<CBlast_def_line> defline = defline_set->Get().front(); if (defline->CanGetSeqid()) { seqids = defline->GetSeqid(); } } return seqids;}Uint8 CSeqDBVol::GetTotalLength(void) const{ return m_Idx.GetTotalLength();}CRef<CBlast_def_line_set> CSeqDBVol::GetHdr(Uint4 oid) const{ return x_GetHdr(oid);}CRef<CBlast_def_line_set> CSeqDBVol::x_GetHdr(Uint4 oid) const{ CRef<CBlast_def_line_set> nullret; Uint4 hdr_start = 0; Uint4 hdr_end = 0; if (! m_Idx.GetHdrStartEnd(oid, hdr_start, hdr_end)) { return nullret; } const char * asn_region = m_Hdr.GetRegion(hdr_start, hdr_end); // Now; create an ASN.1 object from the memory chunk provided // here. if (! asn_region) { return nullret; } // Get the data as a string, then as a stringstream, then build // the actual asn.1 object. How much 'extra' work is done here? // Perhaps a dedicated ASN.1 memory-stream object would be of some // benefit, particularly in the "called 100000 times" cases. istringstream asndata( string(asn_region, asn_region + (hdr_end - hdr_start)) ); m_MemPool.Free((void*) asn_region); auto_ptr<CObjectIStream> inpstr(CObjectIStream::Open(eSerial_AsnBinary, asndata)); CRef<objects::CBlast_def_line_set> phil(new objects::CBlast_def_line_set); *inpstr >> *phil; return phil;}bool CSeqDBVol::x_GetAmbChar(Uint4 oid, vector<Int4> ambchars) const{ Uint4 start_offset = 0; Uint4 end_offset = 0; bool ok = m_Idx.GetAmbStartEnd(oid, start_offset, end_offset); if (! ok) { return false; } Int4 length = end_offset - start_offset; if (0 == length) return true; Int4 total = length / 4; Int4 * buffer = (Int4*) m_Seq.GetRegion(start_offset, start_offset + (total * 4)); // This makes no sense... total &= 0x7FFFFFFF; ambchars.resize(total); for(int i = 0; i<total; i++) { //ambchars[i] = CByteSwap::GetInt4((const unsigned char *)(& buffer[i])); ambchars[i] = SeqDB_GetStdOrd((const unsigned char *)(& buffer[i])); } m_MemPool.Free((void*) buffer); return true;}Uint4 CSeqDBVol::GetNumSeqs(void) const{ return m_Idx.GetNumSeqs();}string CSeqDBVol::GetTitle(void) const{ return m_Idx.GetTitle();}string CSeqDBVol::GetDate(void) const{ return m_Idx.GetDate();}Uint4 CSeqDBVol::GetMaxLength(void) const{ return m_Idx.GetMaxLength();}END_NCBI_SCOPE
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?