📄 restriction.cpp
字号:
/* * =========================================================================== * PRODUCTION $Log: restriction.cpp,v $ * PRODUCTION Revision 1000.1 2004/06/01 18:10:51 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.11 * PRODUCTION * =========================================================================== *//* $Id: restriction.cpp,v 1000.1 2004/06/01 18:10:51 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Authors: Josh Cherry * * File Description: Classes for representing and finding restriction sites * */#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <objmgr/seq_vector.hpp>#include <algorithm>#include <algo/sequence/restriction.hpp>BEGIN_NCBI_SCOPEUSING_SCOPE(objects);bool CRSpec::operator<(const CRSpec& rhs) const{ if (GetSeq() != rhs.GetSeq()) { return GetSeq() < rhs.GetSeq(); } // otherwise sequences identical if (GetPlusCuts() != rhs.GetPlusCuts()) { return GetPlusCuts() < rhs.GetPlusCuts(); } if (GetMinusCuts() != rhs.GetMinusCuts()) { return GetMinusCuts() < rhs.GetMinusCuts(); } // otherwise arguments are equal, and < is false return false;}void CREnzyme::Reset(void){ m_Name.erase(); m_Specs.clear();}// helper functor for sorting enzymes by specificitystruct SCompareSpecs{ bool operator()(const CREnzyme& lhs, const CREnzyme& rhs) { return lhs.GetSpecs() < rhs.GetSpecs(); }};void CREnzyme::CombineIsoschizomers(vector<CREnzyme>& enzymes){ stable_sort(enzymes.begin(), enzymes.end(), SCompareSpecs()); vector<CREnzyme> result; ITERATE (vector<CREnzyme>, enzyme, enzymes) { if (enzyme != enzymes.begin() && enzyme->GetSpecs() == result.back().GetSpecs()) { result.back().SetName() += "/"; result.back().SetName() += enzyme->GetName(); } else { result.push_back(*enzyme); } } swap(enzymes, result);}void CRSpec::Reset(void){ m_Seq.erase(); m_PlusCuts.clear(); m_MinusCuts.clear();}ostream& operator<<(ostream& os, const CRSite& site){ os << "Recog. site: " << site.GetStart() << '-' << site.GetEnd() << endl; os << "Plus strand cuts: "; string s; ITERATE (vector<int>, cut, site.GetPlusCuts()) { if (!s.empty()) { s += " ,"; } s += NStr::IntToString(*cut); } os << s << endl; os << "Minus strand cuts: "; s.erase(); ITERATE (vector<int>, cut, site.GetMinusCuts()) { if (!s.empty()) { s += " ,"; } s += NStr::IntToString(*cut); } os << s << endl; return os; }ostream& operator<<(ostream& os, const CREnzResult& er){ os << "Enzyme: " << er.GetEnzymeName() << endl; os << er.GetDefiniteSites().size() << " definite sites:" << endl; ITERATE (vector<CRSite>, site, er.GetDefiniteSites()) { os << *site; } os << er.GetPossibleSites().size() << " possible sites:" << endl; ITERATE (vector<CRSite>, site, er.GetPossibleSites()) { os << *site; } return os;}struct SCompareLocation{ bool operator() (const CRSite& lhs, const CRSite& rhs) const { return lhs.GetStart() < rhs.GetStart(); }};// Class for internal use by restriction site finding.// Holds a pattern in ncbi8na, integers that specify// which enzyme (of a sequential collection) // and which specifity of that// enzyme it represents, a field indicating// whether it represents the complement of the specificity,// and the length of the initial subpattern that was// put into the fsm.class CPatternRec{public: enum EIsComplement { eIsNotComplement, eIsComplement }; CPatternRec(string pattern, int enzyme_index, int spec_index, EIsComplement is_complement, unsigned int fsm_pat_size) : m_Pattern(pattern), m_EnzymeIndex(enzyme_index), m_SpecIndex(spec_index), m_IsComplement(is_complement), m_FsmPatSize(fsm_pat_size) {} // member access const string& GetPattern(void) const {return m_Pattern;} int GetEnzymeIndex(void) const {return m_EnzymeIndex;} int GetSpecIndex(void) const {return m_SpecIndex;} EIsComplement GetIsComlement(void) const {return m_IsComplement;} bool IsComplement(void) const {return m_IsComplement == eIsComplement;} unsigned int GetFsmPatSize(void) const {return m_FsmPatSize;}private: // pattern in ncbi8na string m_Pattern; // which enzyme and specificity we represent int m_EnzymeIndex; int m_SpecIndex; // whether we represent the complement of the specificity EIsComplement m_IsComplement; unsigned int m_FsmPatSize;};void CFindRSites::x_ExpandRecursion(string& s, unsigned int pos, CTextFsm<int>& fsm, int match_value){ if (pos == s.size()) { // this is the place fsm.AddWord(s, match_value); return; } char orig_ch = s[pos]; for (char x = 1; x <= 8; x <<= 1) { if (orig_ch & x) { s[pos] = x; x_ExpandRecursion(s, pos + 1, fsm, match_value); } } // restore original character s[pos] = orig_ch;}void CFindRSites::x_AddPattern(const string& pat, CTextFsm<int>& fsm, int match_value){ string s = pat; x_ExpandRecursion(s, 0, fsm, match_value);}bool CFindRSites::x_IsAmbig(char nuc){ static const bool ambig_table[16] = { 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }; return ambig_table[nuc];}/// Find all definite and possible sites in a sequence/// for a vector of enzymes, using a finite state machine./// Templatized so that seq can be various containers/// (e.g., string, vector<char>, CSeqVector),/// but it must yield ncbi8na.template<class Seq>void x_FindRSite(const Seq& seq, const vector<CREnzyme>& enzymes, vector<CRef<CREnzResult> >& results){ results.clear(); results.reserve(enzymes.size()); // vector of patterns for internal use vector<CPatternRec> patterns; patterns.reserve(enzymes.size()); // an underestimate // the finite state machine for the search CTextFsm<int> fsm; // iterate over enzymes ITERATE (vector<CREnzyme>, enzyme, enzymes) { results.push_back(CRef<CREnzResult> (new CREnzResult(enzyme->GetName()))); string pat; const vector<CRSpec>& specs = enzyme->GetSpecs(); // iterate over specificities for this enzyme ITERATE (vector<CRSpec>, spec, specs) { // one or two patterns for this specificity // (one for pallindrome, two otherwise) // to avoid combinatorial explosion, // if there are more than two Ns only use // part of pattern before first N CSeqMatch::IupacToNcbi8na(spec->GetSeq(), pat); SIZE_TYPE fsm_pat_size = pat.find_first_of(0x0f); {{ SIZE_TYPE pos = pat.find_first_of(0x0f, fsm_pat_size + 1); if (pos == NPOS || pat.find_first_of(0x0f, pos + 1) == NPOS) { fsm_pat_size = pat.size(); } }} patterns.push_back(CPatternRec(pat, enzyme - enzymes.begin(), spec - specs.begin(), CPatternRec::eIsNotComplement,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -