rebase.cpp
来自「ncbi源码」· C++ 代码 · 共 210 行
CPP
210 行
/* * =========================================================================== * PRODUCTION $Log: rebase.cpp,v $ * PRODUCTION Revision 1000.1 2004/06/01 20:55:41 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.3 * PRODUCTION * =========================================================================== *//* $Id: rebase.cpp,v 1000.1 2004/06/01 20:55:41 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Authors: Josh Cherry * * File Description: Code to deal with REBASE restriction enzyme database * */#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <algo/sequence/restriction.hpp>#include "rebase.hpp"BEGIN_NCBI_SCOPECRSpec CRebase::MakeRSpec(const string& site){ // site contains a string such as // GACGTC, GACGT^C, GCAGC(8/12), or (8/13)GAGNNNNNCTC(13/8) CRSpec spec; int plus_cut, minus_cut; string s = site; if (s[0] == '(') { string::size_type idx = s.find_first_of(")"); if (idx == std::string::npos) { throw runtime_error(string("Error parsing site ") + site); } x_ParseCutPair(s.substr(0, idx + 1), plus_cut, minus_cut); s.erase(0, idx + 1); spec.SetPlusCuts().push_back(-plus_cut); spec.SetMinusCuts().push_back(-minus_cut); } if (s[s.length() - 1] == ')') { string::size_type idx = s.find_last_of("("); if (idx == std::string::npos) { throw runtime_error(string("Error parsing site ") + site); } x_ParseCutPair(s.substr(idx), plus_cut, minus_cut); s.erase(idx); spec.SetPlusCuts().push_back(plus_cut + s.length()); spec.SetMinusCuts().push_back(minus_cut + s.length()); } for (unsigned int i = 0; i < s.length(); i++) { if (s[i] == '^') { // This should be simple. If we have // G^GATCC, the cut on the plus strand is // at 1, and on the minus it's at the // symmetric position, 5. // The complication is that TspRI // is given as CASTGNN^, not NNCASTGNN^. // Someone someday might write NNCASTGNN^, // and something like ^NNGATC might arise, // so code defensively. s.erase(i, 1); int plus_cut = i; // this is the slightly complicated bit // trim any leading and trailing N's; // in case leading N's removed, adjust plus_cut accordingly string::size_type idx = s.find_first_not_of("N"); if (idx == string::npos) { // bizarre situation; all N's (possibly empty) s.erase(); plus_cut = 0; } else { s.erase(0, idx); plus_cut -= idx; } idx = s.find_last_not_of("N"); s.erase(idx + 1); // plus strand cut spec.SetPlusCuts().push_back(plus_cut); // symmetric cut on minus strand spec.SetMinusCuts().push_back(s.length() - plus_cut); break; // there better be just one '^' } } spec.SetSeq(s); return spec;}CREnzyme CRebase::MakeREnzyme(const string& name, const string& sites){ CREnzyme re; re.SetName(name); vector<string> site_vec; NStr::Tokenize(sites, ",", site_vec); ITERATE (vector<string>, iter, site_vec) { CRSpec spec = CRebase::MakeRSpec(*iter); re.SetSpecs().push_back(spec); } return re;}void CRebase::x_ParseCutPair(const string& s, int& plus_cut, int& minus_cut){ // s should look like "(8/13)"; plus_cut gets set to 8 and minus_cut to 13 list<string> l; NStr::Split(s.substr(1, s.length() - 2), "/", l); if (l.size() != 2) { throw runtime_error(string("Couldn't parse cut locations ") + s); } plus_cut = NStr::StringToInt(l.front()); minus_cut = NStr::StringToInt(l.back());}void CRebase::ReadNARFormat(istream& input, vector<CREnzyme>& enzymes, enum EEnzymesToLoad which){ string line; CREnzyme enzyme; string name; while (getline(input, line)) { vector<string> fields; NStr::Tokenize(line, "\t", fields); // the lines we're interested in have more than one field if (fields.size() < 2) { continue; } // name of enzyme is in field one or two (field zero is empty) name = fields[1]; if (name == " ") { if (which == ePrototype) { continue; // skip this non-protype } name = fields[2]; } if (which == eCommercial && fields[5].empty()) { continue; // skip this commercially unavailable enzyme } string sites = fields[3]; // this contains a comma-separted list of sites, // usually just one (but TaqII has two) enzyme = MakeREnzyme(name, sites); enzymes.push_back(enzyme); }}END_NCBI_SCOPE/* * =========================================================================== * $Log: rebase.cpp,v $ * Revision 1000.1 2004/06/01 20:55:41 gouriano * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.3 * * Revision 1.3 2004/05/21 22:27:47 gorelenk * Added PCH ncbi_pch.hpp * * Revision 1.2 2003/08/21 19:24:16 jcherry * Moved restriction site finding to algo/sequence * * Revision 1.1 2003/08/12 18:52:58 jcherry * Initial version * * =========================================================================== */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?