user_feature_dload.cpp
来自「ncbi源码」· C++ 代码 · 共 363 行
CPP
363 行
/*
 * ===========================================================================
 * PRODUCTION $Log: user_feature_dload.cpp,v $
 * PRODUCTION Revision 1000.2  2004/06/01 19:42:52  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.3
 * PRODUCTION
 * ===========================================================================
 */

/*  $Id: user_feature_dload.cpp,v 1000.2 2004/06/01 19:42:52 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Authors:  Josh Cherry
 *
 * File Description:  Data loader for user features in tabular form
 *
 */

#include <ncbi_pch.hpp>
#include <objtools/data_loaders/table/user_feature_dload.hpp>
#include <sqlite/sqlite.hpp>

#include <objects/general/User_object.hpp>
#include <objects/general/User_field.hpp>
#include <objects/general/Object_id.hpp>
#include <objects/seqfeat/Seq_feat.hpp>
#include <objects/seqloc/Seq_loc.hpp>
#include <objects/seqloc/Seq_id.hpp>
#include <objects/seqloc/Seq_interval.hpp>
#include <objects/seqset/Seq_entry.hpp>
#include <objects/seqset/Bioseq_set.hpp>

#include <objmgr/scope.hpp>
#include <objmgr/impl/data_source.hpp>
#include <objmgr/impl/synonyms.hpp>
#include <objmgr/impl/handle_range_map.hpp>

#include <serial/serial.hpp>
#include <serial/objostr.hpp>


BEGIN_NCBI_SCOPE
USING_SCOPE(objects);


/// Render a value as a single-quoted SQL string literal.
///
/// Embedded single quotes are doubled so that a value containing a quote
/// cannot terminate the literal early and change the meaning of the query.
///
/// @param value  raw text to embed in a SQL statement
/// @return       the text wrapped in single quotes, with inner quotes doubled
static string s_SqlQuote(const string& value)
{
    string quoted;
    quoted.reserve(value.size() + 2);
    quoted += '\'';
    for (string::const_iterator c = value.begin();  c != value.end();  ++c) {
        if (*c == '\'') {
            quoted += '\'';
        }
        quoted += *c;
    }
    quoted += '\'';
    return quoted;
}


/// Strict-weak ordering of seq-id handles based on the CSeq_id objects
/// they refer to (delegates to CSeq_id's operator<).
bool CUsrFeatDataLoader::SIdHandleByContent::operator()
    (const CSeq_id_Handle& h1, const CSeq_id_Handle& h2) const
{
    CConstRef<CSeq_id> id1 = h1.GetSeqId();
    CConstRef<CSeq_id> id2 = h2.GetSeqId();
    return (*id1 < *id2);
}


/// Build a loader on top of a tabular input file.
///
/// The table is imported into a CSQLiteTable, column titles are mapped to
/// the known roles (accession / from / to / type / strand; everything else
/// is eUnknown), and, unless a single seq-id was supplied, the distinct
/// values of the accession column are indexed into m_Ids.
///
/// @param input_file   table file; also passed to CDataLoader as its name
/// @param temp_file    passed through to CSQLiteTable
/// @param delete_file  passed through to CSQLiteTable
/// @param offset       value subtracted from the "from"/"to" columns
/// @param type         user-object type applied to generated features
/// @param given_id     optional seq-id to use for every row of the table
/// @throws runtime_error if the file has no id column and no id was given
CUsrFeatDataLoader::CUsrFeatDataLoader(const string& input_file,
                                       const string& temp_file,
                                       bool delete_file,
                                       EOffset offset,
                                       const string& type,
                                       const CSeq_id* given_id)
    : CDataLoader(input_file),
      m_Offset(offset),
      m_Type(type)
{
    //
    // create our SQLite DB
    //
    m_Table.Reset(new CSQLiteTable(input_file, temp_file, delete_file));

    //
    // now, store some precalculated info about our table
    //
    {{
        // extract the column names
        list<string> cols;
        m_Table->GetColumnTitles(cols);
        m_Cols.reserve(cols.size());
        std::copy(cols.begin(), cols.end(), back_inserter(m_Cols));
    }}

    // determine our column mapping; titles are matched case-insensitively
    int i = 0;
    m_ColAssign.resize(m_Cols.size(), eUnknown);
    fill(m_ColIdx, m_ColIdx + eMaxKnownCol, -1);

    ITERATE(vector<string>, iter, m_Cols) {
        string str(*iter);
        NStr::ToLower(str);
        if (str == "contig"  ||
            str == "contig_accession"  ||
            str == "accession"  ||
            str == "id") {
            m_ColAssign[i] = eAccession;
            m_ColIdx[eAccession] = i;
        } else if (str == "from") {
            m_ColAssign[i] = eFrom;
            m_ColIdx[eFrom] = i;
        } else if (str == "to") {
            m_ColAssign[i] = eTo;
            m_ColIdx[eTo] = i;
        } else if (str == "type") {
            m_ColAssign[i] = eType;
            m_ColIdx[eType] = i;
        } else if (str == "strand"  ||  str == "orientation") {
            m_ColAssign[i] = eStrand;
            m_ColIdx[eStrand] = i;
        }
        ++i;
    }

    if (given_id) {
        m_SeqId.Reset(given_id);
    }

    if (!m_SeqId  &&  m_ColIdx[eAccession] == -1) {
        LOG_POST(Info << "CUsrFeatDataLoader: no id column in file, "
                 "and no id given as parameter");
        throw runtime_error("no id column in file, "
                            "and no id given as parameter");
    }

    CSQLite& sqlite = m_Table->SetDB();

    if (!m_SeqId) {
        string acc_col = "col" + NStr::IntToString(m_ColIdx[eAccession]);

        // create an index on accession
        try {
            sqlite.Execute("create index IDX_accession "
                           "on TableData (" + acc_col + ")");
        }
        catch (...) {
            // Deliberately best-effort: the expected failure is that the
            // index already exists.  The index is only an optimization,
            // so any failure here is ignored.
        }

        // extract a list of the accessions we have
        CRef<CSQLiteQuery> q
            (sqlite.Compile("select distinct " + acc_col +
                            " from TableData order by " + acc_col));

        int count;
        const char** data = NULL;
        const char** cols = NULL;
        while (q->NextRow(count, data, cols)) {
            // A NULL cell cannot be turned into a string or a seq-id;
            // skip it rather than dereferencing a null pointer.
            if ( !data  ||  !data[0] ) {
                LOG_POST(Error << "CUsrFeatDataLoader: skipping NULL id");
                continue;
            }

            CRef<CSeq_id> id(new CSeq_id(data[0]));
            if (id->Which() == CSeq_id::e_not_set) {
                LOG_POST(Error << "failed to index id = " << data[0]);
                continue;
            }

            // remember the original text of the id: it is what we must
            // match against when querying the table later
            CSeq_id_Handle handle = CSeq_id_Handle::GetHandle(*id);
            m_Ids.insert(TIdMap::value_type(handle, data[0]));
            _TRACE(" id = " << data[0]);
        }

        LOG_POST(Info << "CUsrFeatDataLoader: "
                 << m_Ids.size() << " distinct ids");
    } else {
        LOG_POST(Info << "CUsrFeatDataLoader: using single "
                 "specified id");
    }
}


/// Load the annotations for a seq-id, if not already loaded.
///
/// The features returned by GetAnnot() are wrapped in a dummy TSE (a
/// Bioseq-set holding only the annotation), which is added to the data
/// source and remembered in m_Entries.
///
/// @param idh     seq-id whose features are requested
/// @param choice  not used by this implementation
void CUsrFeatDataLoader::GetRecords(const CSeq_id_Handle& idh,
                                    EChoice choice)
{
    //
    // find out if we've already loaded annotations for this seq-id
    //
    TEntries::iterator iter = m_Entries.find(idh);
    if (iter != m_Entries.end()) {
        return;
    }

    CRef<CSeq_annot> annot = GetAnnot(idh);
    if ( !annot ) {
        // this loader knows nothing about the id; nothing is cached
        return;
    }

    // we then add the object to the data loader
    // we need to create a dummy TSE for it first
    CRef<CSeq_entry> entry;
    entry.Reset(new CSeq_entry());
    entry->SetSet().SetSeq_set();
    entry->SetSet().SetAnnot().push_back(annot);
    GetDataSource()->AddTSE(*entry);

    _TRACE("CUsrFeatDataLoader(): loaded "
           << annot->GetData().GetFtable().size()
           << " features for " << idh.AsString());

    // Record the entry (even if its feature table is empty) so that a
    // later request for the same id returns early instead of querying
    // the table again.
    m_Entries[idh] = entry;
}


/// Build a feature-table annotation from the table rows for a seq-id.
///
/// Each row becomes one user-object feature located on an interval of the
/// requested id.  Known columns fill in the location and type; every other
/// column becomes a user field named after the column title, unless that
/// title starts with '.'.
///
/// @param idh  seq-id whose rows are requested
/// @return     the annotation, or a null CRef if this loader has no data
///             for the id
/// @throws runtime_error on an unrecognized strand value, or if a row
///         provides no "from" or "to" value
CRef<CSeq_annot> CUsrFeatDataLoader::GetAnnot(const CSeq_id_Handle& idh)
{
    CRef<CSeq_annot> annot;
    CSQLiteTable::TIterator row_iter;

    if (!m_SeqId) {
        //
        // find out if this ID is in our list of ids
        //
        pair<TIdMap::iterator, TIdMap::iterator> id_iter =
            m_Ids.equal_range(idh);
        if (id_iter.first == id_iter.second) {
            return annot;  // null CRef
        }

        // select just the rows that match the id; several textual
        // spellings in the table may map to the same handle
        string acc_col = "col" + NStr::IntToString(m_ColIdx[eAccession]);
        string sql("select * from TableData where " + acc_col + " in (");
        string tmp;
        for ( ;  id_iter.first != id_iter.second;  ++id_iter.first) {
            if ( !tmp.empty() ) {
                tmp += ", ";
            }
            // the value comes from the input file: quote it safely
            tmp += s_SqlQuote(id_iter.first->second);
        }
        sql += tmp + ")";

        row_iter = m_Table->Begin(sql);
    } else {
        // check that this is the right id
        if (!idh.GetSeqId()->Match(*m_SeqId)) {
            return annot;  // null CRef
        }
        // select 'em all
        row_iter = m_Table->Begin("select * from TableData");
    }

    annot.Reset(new CSeq_annot());

    vector<string> data;
    for ( ;  *row_iter;  ++(*row_iter)) {
        list<string> temp;
        (*row_iter).GetRow(temp);
        data.assign(temp.begin(), temp.end());

        // create a new feature
        CRef<CSeq_feat> feat(new CSeq_feat());
        CSeq_loc& loc = feat->SetLocation();
        loc.SetInt().SetId().Assign(*idh.GetSeqId());

        CUser_object& user = feat->SetData().SetUser();

        // fill in our columns
        TSeqPos from = 0;
        TSeqPos to   = 0;
        bool have_from = false;
        bool have_to   = false;
        string strand_str;

        for (unsigned int i = 0;  i < data.size();  ++i) {
            switch (m_ColAssign[i]) {
            case eAccession:
                // already handled as ID...
                break;

            case eStrand:
                // lowercase a copy so the original text stays available
                // for the error message
                strand_str = data[i];
                NStr::ToLower(strand_str);
                if (strand_str == "+"  ||
                    strand_str == "plus"  ||
                    strand_str == "positive"  ||
                    strand_str == "forward") {
                    loc.SetInt().SetStrand(eNa_strand_plus);
                } else if (strand_str == "-"  ||
                           strand_str == "minus"  ||
                           strand_str == "negative"  ||
                           strand_str == "reverse") {
                    loc.SetInt().SetStrand(eNa_strand_minus);
                } else if (strand_str == "b"  ||
                           strand_str == "+/-"  ||
                           strand_str == "both") {
                    loc.SetInt().SetStrand(eNa_strand_both);
                } else {
                    throw runtime_error
                        (string("Invalid strand designation: ") + data[i]);
                }
                break;

            case eFrom:
                from = NStr::StringToInt(data[i]);
                have_from = true;
                break;

            case eTo:
                to = NStr::StringToInt(data[i]);
                have_to = true;
                break;

            case eType:
                user.SetType().SetStr(data[i]);
                break;

            case eUnknown:
            default:
                // add a user field, unless column title
                // starts with '.'
                if (!NStr::StartsWith(m_Cols[i], ".")) {
                    user.AddField(m_Cols[i], data[i]);
                }
                break;
            }
        }

        // Without both endpoints there is no interval to build; the
        // coordinates would otherwise be indeterminate.
        if (!have_from  ||  !have_to) {
            throw runtime_error("Table row has no 'from' and/or 'to' "
                                "value; cannot build feature location");
        }

        loc.SetInt().SetFrom(from - m_Offset);
        loc.SetInt().SetTo  (to   - m_Offset);

        // an explicitly supplied type overrides the type column; with no
        // type column at all, the (possibly empty) supplied type is used
        if (!m_Type.empty()  ||  m_ColIdx[eType] == -1) {
            user.SetType().SetStr(m_Type);
        }

        annot->SetData().SetFtable().push_back(feat);
    }

    return annot;
}


END_NCBI_SCOPE

/*
 * ===========================================================================
 * $Log: user_feature_dload.cpp,v $
 * Revision 1000.2  2004/06/01 19:42:52  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.3
 *
 * Revision 1.3  2004/05/21 21:42:53  gorelenk
 * Added PCH ncbi_pch.hpp
 *
 * Revision 1.2  2003/11/28 13:41:10  dicuccio
 * Fixed to match new API in CDataLoader
 *
 * Revision 1.1  2003/11/14 19:09:18  jcherry
 * Initial version
 *
 * ===========================================================================
 */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?