blob_splitter_impl.cpp
来自「ncbi源码」· C++ 代码 · 共 354 行
CPP
354 行
/*
 * ===========================================================================
 * PRODUCTION $Log: blob_splitter_impl.cpp,v $
 * PRODUCTION Revision 1000.2  2004/06/01 19:24:48  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.7
 * PRODUCTION
 * ===========================================================================
 */

/*  $Id: blob_splitter_impl.cpp,v 1000.2 2004/06/01 19:24:48 gouriano Exp $
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Author:  Eugene Vasilchenko
*
* File Description:
*   Application for splitting blobs within ID1 cache
*
* ===========================================================================
*/

#include <ncbi_pch.hpp>
#include <objmgr/split/blob_splitter_impl.hpp>

#include <serial/objostr.hpp>
#include <serial/serial.hpp>

#include <objmgr/split/blob_splitter.hpp>
#include <objmgr/split/object_splitinfo.hpp>
#include <objmgr/split/annot_piece.hpp>
#include <objmgr/split/asn_sizer.hpp>
#include <objmgr/split/chunk_info.hpp>

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)


/// Return a non-const reference to an object received by const reference.
///
/// Thin wrapper around const_cast; the caller is responsible for making
/// sure the referenced object is not actually immutable.
template<class C>
inline
C& NonConst(const C& c)
{
    return const_cast<C&>(c);
}


/////////////////////////////////////////////////////////////////////////////
// CBlobSplitter interface method to avoid recompilation of two files
/////////////////////////////////////////////////////////////////////////////

/// Try to split the given entry using a temporary CBlobSplitterImpl.
///
/// If the implementation reports success, its blob is stored in
/// m_SplitBlob; otherwise m_SplitBlob is reset to hold the original entry.
///
/// @param entry
///   Seq-entry to split.
/// @return
///   The value of m_SplitBlob.IsSplit() after the attempt.
bool CBlobSplitter::Split(const CSeq_entry& entry)
{
    CBlobSplitterImpl impl(m_Params);
    if ( impl.Split(entry) ) {
        m_SplitBlob = impl.GetBlob();
    }
    else {
        m_SplitBlob.Reset(entry);
    }
    return m_SplitBlob.IsSplit();
}


/////////////////////////////////////////////////////////////////////////////
// CBlobSplitterImpl
/////////////////////////////////////////////////////////////////////////////


static CAsnSizer s_Sizer;


/// Run the whole splitting pipeline on one entry.
///
/// Steps, in order: Reset(), CopySkeleton(), CollectPieces(),
/// SplitPieces(), MakeID2SObjects().
///
/// @param entry
///   Seq-entry to split.
/// @return
///   false if no annotation pieces were collected or if only the main
///   chunk exists after SplitPieces(); otherwise m_SplitBlob.IsSplit().
bool CBlobSplitterImpl::Split(const CSeq_entry& entry)
{
    Reset();

#if 0
    size_t before_count = CountAnnotObjects(entry);
    NcbiCout << "Total: before: " << before_count << NcbiEndl;
#endif

    // copying skeleton while stripping annotations
    CopySkeleton(*m_Skeleton, entry);

    // collect annot pieces stripping landmark annotations to main chunk
    CollectPieces();

    if ( m_Pieces->empty() ) {
        return false;
    }

    // split pieces in chunks
    SplitPieces();

    if ( m_Chunks.size() <= 1 ) { // only main chunk exists
        return false;
    }

    MakeID2SObjects();

#if 0
    size_t after_count = CountAnnotObjects(*m_Skeleton) +
        CountAnnotObjects(m_ID2_Chunks);
    NcbiCout << "Total: in chunks: " << after_count << NcbiEndl;
    _ASSERT(before_count == after_count);
#endif

    return m_SplitBlob.IsSplit();
}


/// Build m_Pieces from all entries of m_Bioseqs.
///
/// Each bioseq's info is added to a fresh CAnnotPieces together with the
/// main chunk (m_Chunks[0]).  In verbose mode per-id size statistics are
/// printed to NcbiCout; ids whose piece set has at most one element are
/// summed into a single "with 1 obj" line.
void CBlobSplitterImpl::CollectPieces(void)
{
    // Collect annotation pieces and strip landmark and long annotations
    // to main chunk.
    m_Pieces.reset(new CAnnotPieces);

    SChunkInfo& main_chunk = m_Chunks[0];

    ITERATE ( TBioseqs, it, m_Bioseqs ) {
        m_Pieces->Add(it->second, main_chunk);
    }

    if ( m_Params.m_Verbose ) {
        // display pieces statistics
        CSize single_ref;
        ITERATE ( CAnnotPieces, it, *m_Pieces ) {
            if ( it->second.size() <= 1 ) {
                single_ref += it->second.m_Size;
            }
            else {
                NcbiCout << "@" << it->first.AsString() << ": " <<
                    it->second.m_Size << '\n';
            }
        }
        if ( single_ref ) {
            NcbiCout << "with 1 obj: " << single_ref << '\n';
        }
        NcbiCout << NcbiEndl;
    }

#if 0
    {{
        _ASSERT(m_Chunks.size() == 1);
        // count objects
        size_t count = CountAnnotObjects(*m_Skeleton) +
            m_Chunks.begin()->second.CountAnnotObjects() +
            m_Pieces->CountAnnotObjects();
        NcbiCout << "Total: in pieces: " << count << NcbiEndl;
    }}
#endif
}


/// Choose the chunk that should receive the next 'size' worth of data.
///
/// @param chunk
///   Chunk currently being filled, or null if there is none yet.
/// @param size
///   Size of the data about to be added.
/// @return
///   'chunk' itself when it is non-null, its current zip size does not
///   exceed m_Params.m_ChunkSize and the zip size after the addition would
///   not exceed m_Params.m_MaxChunkSize.  Otherwise the chunk stored under
///   the id equal to the current number of chunks (presumably created on
///   demand by m_Chunks' operator[] - confirm against TChunks).
SChunkInfo* CBlobSplitterImpl::NextChunk(SChunkInfo* chunk, const CSize& size)
{
    if ( chunk ) {
        CSize::TDataSize cur_size = chunk->m_Size.GetZipSize();
        CSize::TDataSize new_size = cur_size + size.GetZipSize();
        if ( cur_size <= m_Params.m_ChunkSize &&
             new_size <= m_Params.m_MaxChunkSize ) {
            return chunk;
        }
    }

    _ASSERT(!m_Chunks.empty());
    // explicit cast: size() returns an unsigned type, chunk ids are int here
    int chunk_id = static_cast<int>(m_Chunks.size());
    return &m_Chunks[chunk_id];
}


/// Distribute all collected annotation pieces between chunks.
///
/// Phase 1 repeatedly picks the id with the largest total piece size.
/// While that size reaches m_Params.m_MaxChunkSize and holds more than one
/// object, pieces covering more than a quarter of the estimated per-chunk
/// sequence length go to the main chunk (m_Chunks[0]) and the remaining
/// pieces of the id are spread over chunks selected by NextChunk().
///
/// Phase 2 walks the remaining ids in container order and moves all their
/// pieces into chunks selected by NextChunk(), one NextChunk() call per id.
///
/// On return m_Pieces is empty and has been reset.  In verbose mode the
/// resulting chunk sizes are printed to NcbiCout.
void CBlobSplitterImpl::SplitPieces(void)
{
    SChunkInfo& main_chunk = m_Chunks[0];
    SChunkInfo* chunk = 0;

    // split ids with large amount of pieces
    while ( !m_Pieces->empty() ) {
#if 0
        {{
            size_t count = CountAnnotObjects(*m_Skeleton) +
                m_Pieces->CountAnnotObjects();
            ITERATE ( TChunks, it, m_Chunks ) {
                count += it->second.CountAnnotObjects();
            }
            NcbiCout << "Total count: " << count << '\n';
        }}
#endif

        // find id with most size of pieces on it
        CSize max_size;
        // start at end() so that "nothing selected" is detectable instead of
        // leaving the iterator indeterminate
        CAnnotPieces::iterator max_iter = m_Pieces->end();
        NON_CONST_ITERATE ( CAnnotPieces, it, *m_Pieces ) {
            if ( it->second.m_Size > max_size ) {
                max_iter = it;
                max_size = it->second.m_Size;
            }
        }
        if ( max_iter == m_Pieces->end() ||
             max_size.GetZipSize() < m_Params.m_MaxChunkSize ||
             max_size.GetCount() <= 1 ) {
            break;
        }

        // split this id
        if ( m_Params.m_Verbose ) {
            LOG_POST("Splitting @"<<max_iter->first.AsString()<<
                     ": "<<max_size);
        }

        SIdAnnotPieces& objs = max_iter->second;
        size_t max_piece_length; // too long annotations
        {{
            // how many chunks to make from these annotations
            // (kept at 1 or more: a zero m_ChunkSize, or a ratio that rounds
            // down to zero, must not reach the division below)
            size_t chunk_count = 1;
            if ( m_Params.m_ChunkSize > 0 ) {
                chunk_count =
                    size_t(double(objs.m_Size.GetZipSize())/
                           m_Params.m_ChunkSize +.5);
            }
            if ( chunk_count == 0 ) {
                chunk_count = 1;
            }
            // length of sequence covered by annotations
            size_t whole_length = objs.m_IdRange.GetLength();
            // estimated length of sequence covered by one chunk
            size_t chunk_length = whole_length / chunk_count;
            // maximum length of one piece over the sequence
            max_piece_length = chunk_length / 4;
        }}

        // extract long pieces into chunk 0
        // (copied into a vector first because Remove() modifies 'objs')
        vector<SAnnotPiece> pieces;
        ITERATE ( SIdAnnotPieces, it, objs ) {
            const SAnnotPiece& piece = *it;
            if ( piece.m_IdRange.GetLength() > max_piece_length ) {
                pieces.push_back(piece);
            }
        }
        if ( m_Params.m_Verbose && !pieces.empty() ) {
            LOG_POST(" "<<pieces.size()<<" long pieces");
        }
        ITERATE ( vector<SAnnotPiece>, it, pieces ) {
            const SAnnotPiece& piece = *it;
            main_chunk.Add(piece);
            m_Pieces->Remove(piece);
        }

        // spread everything that is left on this id over regular chunks
        pieces.clear();
        ITERATE ( SIdAnnotPieces, it, objs ) {
            pieces.push_back(*it);
        }
        ITERATE ( vector<SAnnotPiece>, it, pieces ) {
            // 'pieces' is a local copy, so a reference stays valid across
            // Remove()
            const SAnnotPiece& piece = *it;
            chunk = NextChunk(chunk, piece.m_Size);
            chunk->Add(piece);
            m_Pieces->Remove(piece);
        }

        _ASSERT(max_iter->second.empty());
        m_Pieces->erase(max_iter);
    }

    // combine ids with small amount of pieces
    while ( !m_Pieces->empty() ) {
#if 0
        {{
            size_t count = CountAnnotObjects(*m_Skeleton) +
                m_Pieces->CountAnnotObjects();
            ITERATE ( TChunks, it, m_Chunks ) {
                count += it->second.CountAnnotObjects();
            }
            NcbiCout << "Total count: " << count << '\n';
        }}
#endif

        CAnnotPieces::iterator max_iter = m_Pieces->begin();
        SIdAnnotPieces& objs = max_iter->second;
        if ( !objs.empty() ) {
            chunk = NextChunk(chunk, objs.m_Size);
            while ( !objs.empty() ) {
                // copy by value: Remove() erases the element from 'objs'
                SAnnotPiece piece = *objs.begin();
                chunk->Add(piece);
                m_Pieces->Remove(piece);
                _ASSERT(objs.empty() || *objs.begin() != piece);
            }
        }
        _ASSERT(max_iter->second.empty());
        m_Pieces->erase(max_iter);
    }
    _ASSERT(m_Pieces->empty());
    m_Pieces.reset();

#if 0
    {{
        size_t count = CountAnnotObjects(*m_Skeleton);
        ITERATE ( TChunks, it, m_Chunks ) {
            count += it->second.CountAnnotObjects();
        }
        NcbiCout << "Total count: " << count << '\n';
    }}
#endif

    if ( m_Params.m_Verbose ) {
        // display collected chunks stats
        ITERATE ( TChunks, it, m_Chunks ) {
            NcbiCout << "Chunk: " << it->first << ": " << it->second.m_Size
                     << NcbiEndl;
        }
    }
}


END_SCOPE(objects)
END_NCBI_SCOPE

/*
* ---------------------------------------------------------------------------
* $Log: blob_splitter_impl.cpp,v $
* Revision 1000.2  2004/06/01 19:24:48  gouriano
* PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.7
*
* Revision 1.7  2004/05/21 21:42:13  gorelenk
* Added PCH ncbi_pch.hpp
*
* Revision 1.6  2004/03/05 17:40:34  vasilche
* Added 'verbose' option to splitter parameters.
*
* Revision 1.5  2004/01/07 17:36:23  vasilche
* Moved id2_split headers to include/objmgr/split.
* Fixed include path to genbank.
*
* Revision 1.4  2003/12/03 19:30:44  kuznets
* Misprint fixed
*
* Revision 1.3  2003/11/26 23:04:57  vasilche
* Removed extra semicolons after BEGIN_SCOPE and END_SCOPE.
*
* Revision 1.2  2003/11/26 17:56:01  vasilche
* Implemented ID2 split in ID1 cache.
* Fixed loading of splitted annotations.
*
* Revision 1.1  2003/11/12 16:18:25  vasilche
* First implementation of ID2 blob splitter withing cache.
*
* ===========================================================================
*/
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?