split_cache.cpp

来自「ncbi源码」· C++ 代码 · 共 827 行 · 第 1/2 页

CPP
827
字号
    SSplitterParams m_Params;    CID2_Reply_Data m_Data;    int m_DataType;    AutoPtr<CNcbiOstrstream> m_MStream;    AutoPtr<CObjectOStream>  m_OStream;};string CSplitCacheApp::GetFileName(const string& key,                                   const string& suffix,                                   const string& ext){    string dir = key;    string file = key + suffix;    if ( !suffix.empty() && !ext.empty() ) {        dir = dir + CDirEntry::GetPathSeparator() + ext;    }    CDir(dir).CreatePath();    return CDirEntry::MakePath(dir, file, ext);}void CSplitCacheApp::Process(void){    const CArgs& args = GetArgs();    m_DumpAsnText = args["dump"];    m_DumpAsnBinary = args["bdump"];    if ( args["compress"] ) {        m_SplitterParams.m_Compression = m_SplitterParams.eCompression_nlm_zip;    }    m_Resplit = args["resplit"];    m_Recurse = args["recurse"];    m_SplitterParams.SetChunkSize(args["chunk_size"].AsInteger()*1024);    if ( args["gi"] ) {        ProcessGi(args["gi"].AsInteger());    }    if ( args["gi_list"] ) {        CNcbiIstream& in = args["gi_list"].AsInputFile();        int gi;        while ( in >> gi ) {            ProcessGi(gi);        }    }    if ( args["id"] ) {        CSeq_id id(args["id"].AsString());        ProcessSeqId(id);    }    if ( args["id_list"] ) {        CNcbiIstream& in = args["id_list"].AsInputFile();        string id_name;        while ( NcbiGetline(in, id_name, '\n') ) {            CSeq_id id(id_name);            ProcessSeqId(id);        }    }}void CSplitCacheApp::ProcessGi(int gi){    CSeq_id id;    id.SetGi(gi);    ProcessSeqId(id);}void CSplitCacheApp::ProcessSeqId(const CSeq_id& id){    CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(id);    if ( !m_ProcessedIds.insert(idh).second ) {        // already processed        return;    }    if ( m_RecursionLevel == 0 ) {        m_Scope->ResetHistory();    }    LINE("Processing: " << id.AsFastaString());    {{        CLevelGuard level(m_RecursionLevel);                CId1Reader::TSeqrefs srs;        m_Reader->ResolveSeq_id(srs, id, 0);        if ( srs.empty() ) {            LINE("Skipping: no blobs");            return;        }        ITERATE ( CId1Reader::TSeqrefs, it, srs ) {            ProcessBlob(**it);        }        if ( m_Recurse ) {            LINE("Processing referenced sequences:");            CBioseq_Handle bh = m_Scope->GetBioseqHandle(idh);            if ( bh ) {                CSeqMap_CI it = bh.GetSeqMap()                    .begin_resolved(&*m_Scope, 0, CSeqMap::fFindRef);                while ( it ) {                    ProcessSeqId(*it.GetRefSeqid().GetSeqId());                    ++it;                }            }        }    }}    LINE("End of processing: " << id.AsFastaString());}template<class C>void Dump(CSplitCacheApp* app, const C& obj, ESerialDataFormat format,          const string& key, const string& suffix = kEmptyStr){    string ext;    switch ( format ) {    case eSerial_AsnText:   ext = "asn"; break;    case eSerial_AsnBinary: ext = "asb"; break;    case eSerial_Xml:       ext = "xml"; break;    default:                ext = "asn"; break;    }    string file_name = app->GetFileName(key, suffix, ext);    WAIT_LINE4(app) << "Dumping to " << file_name << " ...";    AutoPtr<CObjectOStream> out(CObjectOStream::Open(file_name,                                                     format));    *out << obj;}enum EDataType{    eDataType_MainBlob = 0,    eDataType_SplitInfo = 1,    eDataType_Chunk = 2};template<class C>void DumpData(CSplitCacheApp* app, const C& obj, EDataType data_type,              const string& key, const string& suffix = kEmptyStr){    string file_name = app->GetFileName(key, suffix, "bin");    WAIT_LINE4(app) << "Storing to " << file_name << " ...";    CSplitDataMaker data(app->GetParams(), data_type);    data << obj;    AutoPtr<CObjectOStream> out        (CObjectOStream::Open(file_name, eSerial_AsnBinary));    *out << data.GetData();}template<class C>void StoreToCache(CSplitCacheApp* app, const C& obj, EDataType data_type,                  const CSeqref& seqref, const string& suffix = kEmptyStr){    string key = app->GetReader().GetBlobKey(seqref);    WAIT_LINE4(app) << "Storing to cache " << key << " ...";    CNcbiOstrstream stream;    {{        CSplitDataMaker data(app->GetParams(), data_type);        data << obj;        AutoPtr<CObjectOStream> out            (CObjectOStream::Open(eSerial_AsnBinary, stream));        *out << data.GetData();    }}    size_t size = stream.pcount();    line << setiosflags(ios::fixed) << setprecision(2) <<        " " << setw(7) << (size/1024.0) << " KB";    const char* data = stream.str();    stream.freeze(false);    app->GetCache().Store(key, seqref.GetVersion(), suffix, data, size);}void CSplitCacheApp::ProcessBlob(const CSeqref& seqref){    {        pair<int, int> key = seqref.GetKeyByTSE();        pair<TProcessedBlobs::iterator, bool> ins = m_ProcessedBlobs.insert(key);        if ( !ins.second ) {            // already processed            return;        }    }    LINE("Processing blob "<< seqref.printTSE());    CLevelGuard level(m_RecursionLevel);    int version = m_Reader->GetVersion(seqref, 0);    if ( version > 1 ) {        CTime time(time_t(version*60));        LINE("Blob version: " << version << " - " << time.AsString());    }    else {        LINE("Blob version: " << version);    }    string blob_key = m_Reader->GetBlobKey(seqref);    if ( m_Cache->GetSize(blob_key, version, "Skeleton") ) {        if ( m_Resplit ) {            WAIT_LINE << "Removing old split data...";            m_Cache->Remove(blob_key);        }        else {            LINE("Already splitted: skipping");            return;        }    }    if ( m_Reader->IsSNPSeqref(seqref) ) {        LINE("Skipping SNP blob: not implemented");        return;    }    CBioseq_Handle bh;    {{        WAIT_LINE << "Loading...";        bh = m_Scope->GetBioseqHandle            (CSeq_id_Handle::GetGiHandle(seqref.GetGi()));    }}    if ( !bh ) {        LINE("Skipping: no bioseq???");        return;    }    CConstRef<CSeq_entry> seq_entry;    if ( !m_Reader->IsSNPSeqref(seqref) ) {        // get non-SNP blob        seq_entry = bh.GetTopLevelEntry().GetCompleteSeq_entry();    }    else {        LINE("Skipping SNP blob: not implemented");        return;        /*          SAnnotSelector sel;          sel.SetMaxSize(1);          sel.SetFeatSubtype(CSeqFeatData::eSubtype_variation);          CFeat_CI it(bh, 0, 0, sel);          if ( !it ) {          LINE("Skipping SNP blob: empty");          return;          }          const CSeq_annot& seq_annot = it.GetSeq_annot();          CConstRef<CSeq_annot_Info> seq_annot_info =          m_Scope->GetImpl().x_GetSeq_annot_Info(seq_annot);          blob.Reset(&seq_annot_info->GetTSE_Info());        */    }    if ( m_DumpAsnText ) {        Dump(this, *seq_entry, eSerial_AsnText, blob_key);    }    if ( m_DumpAsnBinary ) {        Dump(this, *seq_entry, eSerial_AsnBinary, blob_key);    }    size_t blob_size =        m_Cache->GetSize(m_Reader->GetBlobKey(seqref), version,                         m_Reader->GetSeqEntrySubkey());    if ( blob_size == 0 ) {        LINE("Skipping: blob is not in cache");        return;    }    if ( blob_size <= m_SplitterParams.m_MaxChunkSize ) {        LINE("Skipping: blob is small enough: " << blob_size);        return;    }    LINE("Blob size: " << blob_size);    CBlobSplitter splitter(m_SplitterParams);    if ( !splitter.Split(*seq_entry) ) {        LINE("Skipping: no chunks after splitting");        return;    }    const CSplitBlob& blob = splitter.GetBlob();    if ( m_DumpAsnText ) {        Dump(this, blob.GetMainBlob(), eSerial_AsnText, blob_key, "-main");        Dump(this, blob.GetSplitInfo(), eSerial_AsnText, blob_key, "-split");        ITERATE ( CSplitBlob::TChunks, it, blob.GetChunks() ) {            string suffix = "-chunk-" + NStr::IntToString(it->first);            Dump(this, *it->second, eSerial_AsnText, blob_key, suffix);        }    }    if ( m_DumpAsnBinary ) {        Dump(this, blob.GetMainBlob(), eSerial_AsnBinary, blob_key, "-main");        Dump(this, blob.GetSplitInfo(), eSerial_AsnBinary, blob_key, "-split");        ITERATE ( CSplitBlob::TChunks, it, blob.GetChunks() ) {            string suffix = "-chunk-" + NStr::IntToString(it->first);            Dump(this, *it->second, eSerial_AsnBinary, blob_key, suffix);        }    }    {{ // storing split data        DumpData(this, blob.GetMainBlob(), eDataType_MainBlob, blob_key, "-main");        DumpData(this, blob.GetSplitInfo(), eDataType_SplitInfo, blob_key, "-split");        ITERATE ( CSplitBlob::TChunks, it, blob.GetChunks() ) {            string suffix = "-chunk-" + NStr::IntToString(it->first);            DumpData(this, *it->second, eDataType_Chunk, blob_key, suffix);        }    }}    {{ // storing split data into cache        StoreToCache(this, blob.GetMainBlob(), eDataType_MainBlob, seqref,                     m_Reader->GetSkeletonSubkey());        StoreToCache(this, blob.GetSplitInfo(), eDataType_SplitInfo, seqref,                     m_Reader->GetSplitInfoSubkey());        ITERATE ( CSplitBlob::TChunks, it, blob.GetChunks() ) {            StoreToCache(this, *it->second, eDataType_Chunk, seqref,                         m_Reader->GetChunkSubkey(it->first));        }    }}}CConstRef<CSeqref> CSplitCacheApp::GetSeqref(CBioseq_Handle bh){    CSeq_entry_Handle tse = bh.GetTopLevelEntry();    CConstRef<CObject> id = tse.GetBlobId();    return ConstRef(dynamic_cast<const CSeqref*>(id.GetPointer()));}END_SCOPE(objects)END_NCBI_SCOPE///////////////////////////////////////////////////////////////////////////////  MAINint main(int argc, const char* argv[]){    return ncbi::objects::CSplitCacheApp().AppMain(argc, argv);}/** ---------------------------------------------------------------------------* $Log: split_cache.cpp,v $* Revision 1000.2  2004/06/01 19:42:02  gouriano* PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.19** Revision 1.19  2004/05/21 21:42:52  gorelenk* Added PCH ncbi_pch.hpp** Revision 1.18  2004/04/28 17:06:26  vasilche* Load split blobs from new ICache.** Revision 1.17  2004/04/28 16:29:15  vasilche* Store split results into new ICache.** Revision 1.16  2004/03/16 16:03:11  vasilche* Removed Windows EOL.** Revision 1.15  2004/03/16 15:47:29  vasilche* Added CBioseq_set_Handle and set of EditHandles** Revision 1.14  2004/02/09 19:18:55  grichenk* Renamed CDesc_CI to CSeq_descr_CI. Redesigned CSeq_descr_CI* and CSeqdesc_CI to avoid using data directly.** Revision 1.13  2004/01/22 20:10:37  vasilche* 1. Splitted ID2 specs to two parts.* ID2 now specifies only protocol.* Specification of ID2 split data is moved to seqsplit ASN module.* For now they are still reside in one resulting library as before - libid2.* As the result split specific headers are now in objects/seqsplit.* 2. Moved ID2 and ID1 specific code out of object manager.* Protocol is processed by corresponding readers.* ID2 split parsing is processed by ncbi_xreader library - used by all readers.* 3. Updated OBJMGR_LIBS correspondingly.** Revision 1.12  2004/01/13 16:55:57  vasilche* CReader, CSeqref and some more classes moved from xobjmgr to separate lib.* Headers moved from include/objmgr to include/objtools/data_loaders/genbank.** Revision 1.11  2004/01/07 17:37:37  vasilche* Fixed include path to genbank loader.* Moved split_cache application.** Revision 1.10  2003/12/30 16:06:15  vasilche* Compression methods moved to separate header: id2_compress.hpp.** Revision 1.9  2003/12/03 19:30:45  kuznets* Misprint fixed** Revision 1.8  2003/12/02 23:46:20  vasilche* Fixed INTERNAL COMPILER ERROR on MSVC - splitted expression.** Revision 1.7  2003/12/02 23:24:33  vasilche* Added "-recurse" option to split all sequences referenced by SeqMap.** Revision 1.6  2003/12/02 19:59:15  vasilche* Added GetFileName() declaration.** Revision 1.5  2003/12/02 19:12:24  vasilche* Fixed compilation on MSVC.** Revision 1.4  2003/11/28 20:27:44  vasilche* Correctly print log lines in LINE macro.** Revision 1.3  2003/11/26 23:05:00  vasilche* Removed extra semicolons after BEGIN_SCOPE and END_SCOPE.** Revision 1.2  2003/11/26 17:56:03  vasilche* Implemented ID2 split in ID1 cache.* Fixed loading of splitted annotations.** Revision 1.1  2003/11/12 16:18:32  vasilche* First implementation of ID2 blob splitter withing cache.** ===========================================================================*/

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?