sage_dload.cpp

来自「ncbi源码」· C++ 代码 · 共 329 行

CPP
329
字号
/* * =========================================================================== * PRODUCTION $Log: sage_dload.cpp,v $ * PRODUCTION Revision 1000.4  2004/06/01 19:42:46  gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.5 * PRODUCTION * =========================================================================== *//*  $Id: sage_dload.cpp,v 1000.4 2004/06/01 19:42:46 gouriano Exp $ * =========================================================================== * *                            PUBLIC DOMAIN NOTICE *               National Center for Biotechnology Information * *  This software/database is a "United States Government Work" under the *  terms of the United States Copyright Act.  It was written as part of *  the author's official duties as a United States Government employee and *  thus cannot be copyrighted.  This software/database is freely available *  to the public for use. The National Library of Medicine and the U.S. *  Government have not placed any restriction on its use or reproduction. * *  Although all reasonable efforts have been taken to ensure the accuracy *  and reliability of the software and data, the NLM and the U.S. *  Government do not and cannot warrant the performance or results that *  may be obtained by using this software or data. The NLM and the U.S. *  Government disclaim all warranties, express or implied, including *  warranties of performance, merchantability or fitness for any particular *  purpose. * *  Please cite the author in any work or product based on this material. * * =========================================================================== * * Authors:  Mike DiCuccio * * File Description: * */#include <ncbi_pch.hpp>#include <objtools/data_loaders/table/sage_dload.hpp>#include <sqlite/sqlite.hpp>#include <objects/general/User_object.hpp>#include <objects/general/User_field.hpp>#include <objtools/manip/sage_manip.hpp>#include <objects/seq/Seq_annot.hpp>#include <objects/seqfeat/Seq_feat.hpp>#include <objects/seqloc/Seq_loc.hpp>#include <objects/seqloc/Seq_interval.hpp>#include <objects/seqset/Seq_entry.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objmgr/scope.hpp>#include <objmgr/impl/data_source.hpp>#include <objmgr/impl/synonyms.hpp>#include <objmgr/impl/handle_range_map.hpp>BEGIN_NCBI_SCOPEUSING_SCOPE(objects);bool CSageDataLoader::SIdHandleByContent::operator()(const CSeq_id_Handle& h1,                                                     const CSeq_id_Handle& h2) const{    CConstRef<CSeq_id> id1 = h1.GetSeqId();    CConstRef<CSeq_id> id2 = h2.GetSeqId();    return (*id1 < *id2);}CSageDataLoader::CSageDataLoader(const string& input_file,                                 const string& temp_file,                                 bool delete_file)    : CDataLoader(input_file){    //    // create our SQLite DB    //    m_Table.Reset(new CSQLiteTable(input_file, temp_file, delete_file));    //    // now, store some precalculated info about our table    //    {{        // extract the column names        list<string> cols;        m_Table->GetColumnTitles(cols);        m_Cols.reserve(cols.size());        std::copy(cols.begin(), cols.end(), back_inserter(m_Cols));    }}    // determine our column mapping    int i = 0;    m_ColAssign.resize(m_Cols.size(), eUnknown);    fill(m_ColIdx, m_ColIdx + eMaxKnownCol, -1);    ITERATE(vector<string>, iter, m_Cols) {        string str(*iter);        NStr::ToLower(str);        if (str == "contig"  ||            str == "contig_accession"  ||            str == "accession") {            m_ColAssign[i] = eAccession;            m_ColIdx[eAccession] = i;        } else if (str == "tag_position_on_contig"  ||                   str == "pos"  ||                   str == "contig_pos") {            m_ColAssign[i] = eFrom;            m_ColIdx[eFrom] = i;        } else if (str == "tag"  ||                   str == "catgtag") {            m_ColAssign[i] = eTag;            m_ColIdx[eTag] = i;        } else if (str == "method") {            m_ColAssign[i] = eMethod;            m_ColIdx[eMethod] = i;        } else if (str == "tag_orientation"  ||                   str == "strand"  ||                   str == "orientation") {            m_ColAssign[i] = eStrand;            m_ColIdx[eStrand] = i;        }        ++i;    }    string acc_col = "col" + NStr::IntToString(m_ColIdx[eAccession]);    CSQLite& sqlite = m_Table->SetDB();    // create an index on accession    try {        sqlite.Execute("create index IDX_accession "                       "on TableData (" + acc_col + ")");    }    catch (...) {        // index already exists - ignored    }    // extract a list of the accessions we have    CRef<CSQLiteQuery> q        (sqlite.Compile("select distinct " + acc_col +                        " from TableData order by " + acc_col));    int count;    const char** data = NULL;    const char** cols = NULL;    while (q->NextRow(count, data, cols)) {        CRef<CSeq_id> id(new CSeq_id(data[0]));        if (id->Which() == CSeq_id::e_not_set) {            LOG_POST(Error << "failed to index id = " << data[0]);            continue;        }        CSeq_id_Handle handle = CSeq_id_Handle::GetHandle(*id);        m_Ids.insert(TIdMap::value_type(handle, data[0]));        _TRACE("  id = " << data[0]);    }    LOG_POST(Info << "CSageDataLoader: " << m_Ids.size() << " distinct ids");}// Request from a datasource using handles and ranges instead of seq-loc// The TSEs loaded in this call will be added to the tse_set.void CSageDataLoader::GetRecords(const CSeq_id_Handle& idh,                                 EChoice choice){    string acc_col = "col" + NStr::IntToString(m_ColIdx[eAccession]);    //    // find out if we've already loaded annotations for this seq-id    //    TEntries::iterator iter = m_Entries.find(idh);    if (iter != m_Entries.end()) {        return;    }    //    // now, find out if this ID is in our list of ids    //    pair<TIdMap::iterator, TIdMap::iterator> id_iter = m_Ids.equal_range(idh);    if (id_iter.first == id_iter.second) {        return;    }    //    // okay, this ID is correct, so load all features with this id    //    string sql("select * from TableData where " + acc_col +               " in (");    string tmp;    for ( ;  id_iter.first != id_iter.second;  ++id_iter.first) {        TIdMap::iterator iter = id_iter.first;        if ( !tmp.empty() ) {            tmp += ", ";        }        tmp += "'" + iter->second + "'";    }    sql += tmp + ")";    CSQLiteTable::TIterator row_iter = m_Table->Begin(sql);    CRef<CSeq_annot> annot(new CSeq_annot());    vector<string> data;    for ( ;  *row_iter;  ++(*row_iter)) {        list<string> temp;        (*row_iter).GetRow(temp);        data.resize(temp.size());        std::copy(temp.begin(), temp.end(), data.begin());        // create a new feature        CRef<CSeq_feat> feat(new CSeq_feat());        CSeq_loc& loc = feat->SetLocation();        loc.SetInt().SetId().Assign(*idh.GetSeqId());        CUser_object& user = feat->SetData().SetUser()            .SetCategory(CUser_object::eCategory_Experiment);        CUser_object& experiment =            user.SetExperiment(CUser_object::eExperiment_Sage);        CSageData manip(experiment);        // fill in our columns        bool neg_strand = false;        TSeqPos from = 0;        TSeqPos len  = 0;        for (int i = 0;  i < data.size();  ++i) {            switch (m_ColAssign[i]) {            case eAccession:                // already handled as ID...                break;            case eTag:                manip.SetTag(data[i]);                len = data[i].length();                break;            case eStrand:                if (NStr::Compare(data[i], "-") == 0) {                    neg_strand = true;                }                break;            case eFrom:                from = NStr::StringToInt(data[i]);                break;            case eMethod:                manip.SetMethod(data[i]);                break;            case eUnknown:            default:                manip.SetField(m_Cols[i], data[i]);                break;            }        }        size_t to = from + len;        if (neg_strand) {            loc.SetInt().SetStrand(eNa_strand_minus);            size_t offs = to - from;            from -= offs;            to   -= offs;        } else {            loc.SetInt().SetStrand(eNa_strand_plus);        }        loc.SetInt().SetFrom(from);        loc.SetInt().SetTo  (to);        annot->SetData().SetFtable().push_back(feat);    }    CRef<CSeq_entry> entry;    if (annot) {        // we then add the object to the data loader        // we need to create a dummy TSE for it first        entry.Reset(new CSeq_entry());        entry->SetSet().SetSeq_set();        entry->SetSet().SetAnnot().push_back(annot);        GetDataSource()->AddTSE(*entry);        _TRACE("CSageDataLoader(): loaded "            << annot->GetData().GetFtable().size()            << " features for " << idh.AsString());    }    // we always save an entry here.  If the entry is empty,    // we have no information about this sequence, but we at    // least don't need to repeat an expensive search    m_Entries[idh] = entry;}END_NCBI_SCOPE/* * =========================================================================== * $Log: sage_dload.cpp,v $ * Revision 1000.4  2004/06/01 19:42:46  gouriano * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.5 * * Revision 1.5  2004/05/21 21:42:53  gorelenk * Added PCH ncbi_pch.hpp * * Revision 1.4  2003/11/28 13:41:10  dicuccio * Fixed to match new API in CDataLoader * * Revision 1.3  2003/11/13 21:01:03  dicuccio * Altered to support finding multiple accession strings matching a single handle * * Revision 1.2  2003/10/30 21:43:08  jcherry * Fixed typo * * Revision 1.1  2003/10/02 17:40:17  dicuccio * Initial revision * * =========================================================================== */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?