📄 restriction.cpp
字号:
fsm_pat_size)); // add pattern to fsm // (add only fsm_pat_size of it) CFindRSites::x_AddPattern(pat.substr(0, fsm_pat_size), fsm, patterns.size() - 1); // if the pattern is not pallindromic, // do a search for its complement too string comp = pat; CSeqMatch::CompNcbi8na(comp); if (comp != pat) { {{ fsm_pat_size = comp.find_first_of(0x0f); SIZE_TYPE pos = comp.find_first_of(0x0f, fsm_pat_size + 1); if (pos == NPOS || comp.find_first_of(0x0f, pos + 1) == NPOS) { fsm_pat_size = comp.size(); } }} patterns.push_back(CPatternRec(comp, enzyme - enzymes.begin(), spec - specs.begin(), CPatternRec::eIsComplement, fsm_pat_size)); // add pattern to fsm // (add only fsm_pat_size of it) CFindRSites::x_AddPattern(comp.substr(0, fsm_pat_size), fsm, patterns.size() - 1); } } } // The fsm is set up. // Now do the search. fsm.Prime(); vector<int> ambig_nucs; // for dealing with ambiguities later int state = fsm.GetInitialState(); for (unsigned int i = 0; i < seq.size(); i++) { if (CFindRSites::x_IsAmbig(seq[i])) { ambig_nucs.push_back(i); } state = fsm.GetNextState(state, seq[i]); if (fsm.IsMatchFound(state)) { const vector<int>& matches = fsm.GetMatches(state); ITERATE (vector<int>, match, matches) { const CPatternRec& pattern = patterns[*match]; TSeqPos begin_pos = i + 1 - pattern.GetFsmPatSize(); TSeqPos end_pos = begin_pos + pattern.GetPattern().size() - 1; // check for a full match to sequence if (pattern.GetFsmPatSize() != pattern.GetPattern().size()) { // Pattern put into fsm was less than the full pattern. // Must check that the sequence really matches. if (end_pos >= seq.size()) { // full pattern goes off the end of sequence; // not a match continue; } // could really test match for just // right end of pattern here if (CSeqMatch::MatchNcbi8na(seq, pattern.GetPattern(), begin_pos) == CSeqMatch::eYes) { // There must be a site here. However, because // we'll later check all stretches containing // ambiguities, we must ignore them here to keep // from recording them twice. bool ambig = false; for (unsigned int n = begin_pos; n <= end_pos; n++) { if (CFindRSites::x_IsAmbig(seq[n])) { ambig = true; break; } } if (ambig) { continue; // ignore matches with ambig. seq. } } else { continue; // not a real match } } CRSite site(begin_pos, end_pos); const CRSpec& spec = enzymes[pattern.GetEnzymeIndex()] .GetSpecs()[pattern.GetSpecIndex()]; const vector<int>& plus_cuts = spec.GetPlusCuts(); ITERATE (vector<int>, cut, plus_cuts) { if (pattern.IsComplement()) { site.SetPlusCuts() .push_back(begin_pos + pattern.GetPattern().size() - *cut); } else { site.SetPlusCuts().push_back(begin_pos + *cut); } } const vector<int>& minus_cuts = spec.GetMinusCuts(); ITERATE (vector<int>, cut, minus_cuts) { if (pattern.IsComplement()) { site.SetMinusCuts() .push_back(begin_pos + pattern.GetPattern().size() - *cut); } else { site.SetMinusCuts().push_back(begin_pos + *cut); } } results[pattern.GetEnzymeIndex()]->SetDefiniteSites() .push_back(site); } } } // end iteration over the sequence // We've found all the matches involving unambiguous characters // in sequence. Now go back and examine any ambiguous places. if (!ambig_nucs.empty()) { ITERATE (vector<CPatternRec>, pattern, patterns) { const string& pat = pattern->GetPattern(); // for reordering later int ds_pos = results[pattern->GetEnzymeIndex()] ->GetDefiniteSites().size(); int ps_pos = results[pattern->GetEnzymeIndex()] ->GetPossibleSites().size(); // the next possible (starting) position to check int next_pos = 0; // iterate over ambiguous positions ITERATE (vector<int>, pos, ambig_nucs) { int begin_check = *pos - pat.size() + 1; // don't try to check a negative position begin_check = max(begin_check, 0); // to avoid recording a site multiple times // when there are two nearby ambiguities begin_check = max(begin_check, next_pos); int end_check = min(*pos, (int) (seq.size() - pat.size())); int i; for (i = begin_check; i <= end_check; i++) { CSeqMatch::EMatch match = CSeqMatch::MatchNcbi8na(seq, pat, i); if (match == CSeqMatch::eNo) { continue; // no match of any sort } // otherwise, a definite or possible site CRSite site(i, i + pat.size() -1); // figure out cleavage locations const vector<int>& plus_cuts = enzymes[pattern->GetEnzymeIndex()] .GetSpecs()[pattern->GetSpecIndex()].GetPlusCuts(); ITERATE (vector<int>, cut, plus_cuts) { if (pattern->IsComplement()) { site.SetPlusCuts() .push_back(i + pattern->GetPattern().size() - *cut); } else { site.SetPlusCuts().push_back(i + *cut); } } const vector<int>& minus_cuts = enzymes[pattern->GetEnzymeIndex()] .GetSpecs()[pattern->GetSpecIndex()] .GetMinusCuts(); ITERATE (vector<int>, cut, minus_cuts) { if (pattern->IsComplement()) { site.SetMinusCuts() .push_back(i + pattern->GetPattern().size() - *cut); } else { site.SetMinusCuts().push_back(i + *cut); } } if (match == CSeqMatch::eYes) { results[pattern->GetEnzymeIndex()] ->SetDefiniteSites().push_back(site); } else { results[pattern->GetEnzymeIndex()] ->SetPossibleSites().push_back(site); } } next_pos = i; } // end ITERATE over ambiguous positions // definite_sites and possible_sites may be out of order vector<CRSite>& def_sites = results[pattern->GetEnzymeIndex()] ->SetDefiniteSites(); inplace_merge(def_sites.begin(), def_sites.begin() + ds_pos, def_sites.end(), SCompareLocation()); vector<CRSite>& pos_sites = results[pattern->GetEnzymeIndex()] ->SetPossibleSites(); inplace_merge(pos_sites.begin(), pos_sites.begin() + ps_pos, pos_sites.end(), SCompareLocation()); } // end ITERATE over patterns } // end if (!ambig_nucs.empty())}// CFindRSites::Find for various sequence containersvoid CFindRSites::Find(const string& seq, const vector<CREnzyme>& enzymes, vector<CRef<CREnzResult> >& results){ x_FindRSite(seq, enzymes, results);}void CFindRSites::Find(const vector<char>& seq, const vector<CREnzyme>& enzymes, vector<CRef<CREnzResult> >& results){ x_FindRSite(seq, enzymes, results);}void CFindRSites::Find(const CSeqVector& seq, const vector<CREnzyme>& enzymes, vector<CRef<CREnzResult> >& results){ string seq_ncbi8na; CSeqVector vec(seq); vec.SetNcbiCoding(); vec.GetSeqData(0, vec.size(), seq_ncbi8na); x_FindRSite(seq_ncbi8na, enzymes, results);}END_NCBI_SCOPE/* * =========================================================================== * $Log: restriction.cpp,v $ * Revision 1000.1 2004/06/01 18:10:51 gouriano * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.11 * * Revision 1.11 2004/05/21 21:41:04 gorelenk * Added PCH ncbi_pch.hpp * * Revision 1.10 2003/08/28 15:49:43 jcherry * Fixed problem with x_FindRSite; pattern was being searched before * it was read. * * Revision 1.9 2003/08/22 14:25:58 ucko * Fix for MSVC, which seems to have problems with member templates. * * Revision 1.8 2003/08/22 02:17:18 ucko * Fix WorkShop compilation. * * Revision 1.7 2003/08/21 20:05:59 dicuccio * Added USING_SCOPE(objects) for MSVC * * Revision 1.6 2003/08/21 19:22:47 jcherry * Moved restriction site finding to algo/sequence * * Revision 1.5 2003/08/21 18:38:31 jcherry * Overloaded CFindRSites::Find to take several sequence containers. * Added option to lump together enzymes with identical specificities. * * Revision 1.4 2003/08/21 12:03:07 dicuccio * Make use of new typedef in plugin_utils.hpp for argument values. * * Revision 1.3 2003/08/20 22:57:44 jcherry * Reimplemented restriction site finding using finite state machine * * Revision 1.2 2003/08/17 19:25:30 jcherry * Changed member variable names to follow convention * * Revision 1.1 2003/08/12 18:52:58 jcherry * Initial version * * =========================================================================== */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -