📄 splign_hitparser.cpp
字号:
/* * =========================================================================== * PRODUCTION $Log: splign_hitparser.cpp,v $ * PRODUCTION Revision 1000.0 2004/06/01 18:12:53 gouriano * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.7 * PRODUCTION * =========================================================================== *//* $Id: splign_hitparser.cpp,v 1000.0 2004/06/01 18:12:53 gouriano Exp $* ===========================================================================** public DOMAIN NOTICE * National Center for Biotechnology Information* * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular* purpose. * * Please cite the author in any work or product based on this material. ** ===========================================================================** Author: Yuri Kapustin** File Description:* Temporary code - will be part of the hit filtering library**/#include <ncbi_pch.hpp>#include "splign_hitparser.hpp"#include <algo/align/align_exception.hpp>#include <corelib/ncbi_limits.hpp>#include <corelib/ncbistre.hpp>#include <algorithm>#include <numeric>BEGIN_NCBI_SCOPE// auxiliary structure utilized in calculation of collisions and other routinesstruct SNode { int x; // coordinate bool type; // true = open, false = close vector<CHit>::const_iterator pHit; // owner SNode(int n, bool b, vector<CHit>::const_iterator ph): x(n), type(b), pHit(ph) {}; bool operator>(const SNode& rhs) const { return x > rhs.x; } bool operator==(const SNode& rhs) const { return x == rhs.x; } bool operator<(const SNode& rhs) const { return x < rhs.x; }};//// general single-linkage clustering (SLC) algorithm //struct SNodePair{ SNodePair(int i, int j): n1(i), n2(j) {} int n1, n2;};template<class T, class PClustCrit, class CClustID_Set, class CClustID_Get>void DoSingleLinkageClustering ( vector<T>& vt, // vector of objects to be clustered const PClustCrit& pr2, // clustering criterion (two-argument predicate) const CClustID_Set& CID_Set, // function that assigns ClustID to objects const CClustID_Get& CID_Get // function that retrieves ClustID from objects){ vector<SNodePair> vnodes; // auxiliary vector size_t nSize = vt.size(), j = 0, i; for( j = 0; j < nSize; ++j ) { CID_Set( vt[j], j ); for( i = 0; i < j; ++i ) { if( pr2(vt[i], vt[j] ) ) vnodes.push_back( SNodePair( i, j ) ); } } vector<SNodePair>::const_iterator kk, kend; for( kk = vnodes.begin(), kend = vnodes.end(); kk != kend; ++kk ) { int n1 = CID_Get( vt[kk->n1] ), n2 = CID_Get( vt[kk->n2] ); if( n1 != n2 ) { NON_CONST_ITERATE (typename vector<T>, jj, vt) { if( CID_Get( *jj ) == n1 ) CID_Set( *jj, n2 ); } } }}CHitParser::CHitParser(){ x_Set2Defaults();}CHitParser::CHitParser(const vector<CHit>& vh, int& nGroupID){ Init(vh, nGroupID);}void CHitParser::Init(const vector<CHit>& vh, int& nGroupID){ m_Out = vh; x_RemoveEqual(); x_Set2Defaults(); nGroupID++; m_GroupID = &nGroupID;}void CHitParser::x_Set2Defaults(){ m_SameOrder = true; m_Strand = eStrand_Auto; m_MaxHitDistQ = kMax_Int; m_MaxHitDistS = kMax_Int; m_Method = eMethod_MaxScore; m_SplitQuery = eSplitMode_MaxScore; m_SplitSubject = eSplitMode_MaxScore; m_Prot2Nucl = false; m_CombinationProximity_pre = m_CombinationProximity_post = -1.; m_group_identification_method = eNone; m_CovStep = 90.; m_MinQueryCoverage = 0.; m_MinSubjCoverage = 0.; m_OutputAllGroups = false; m_Query = m_Subj = ""; m_GroupID = 0; x_CalcGlobalEnvelop();}void CHitParser::x_CalcGlobalEnvelop(){ const size_t dim = m_Out.size(); if(dim == 0) { m_ange[0] = m_ange[1] = m_ange[2] = m_ange[3] = 0; return; } m_ange[0] = m_ange[2] = kMax_Int; m_ange[1] = m_ange[3] = kMin_Int; vector<CHit>::iterator ii, iend; for(ii = m_Out.begin(), iend = m_Out.end(); ii != iend; ii++) { if(m_ange[0] > ii->m_ai[0]) m_ange[0] = ii->m_ai[0]; if(m_ange[2] > ii->m_ai[2]) m_ange[2] = ii->m_ai[2]; if(m_ange[1] < ii->m_ai[1]) m_ange[1] = ii->m_ai[1]; if(m_ange[3] < ii->m_ai[3]) m_ange[3] = ii->m_ai[3]; }}CHitParser::~CHitParser(){}bool CHitParser::x_DetectInclusion(const CHit& h1, const CHit& h2) const{ if( h2.m_ai[0] <= h1.m_ai[0] && h1.m_ai[1] <= h2.m_ai[1]) { return true; } if( h2.m_ai[2] <= h1.m_ai[2] && h1.m_ai[3] <= h2.m_ai[3]) { return true; } if( h1.m_ai[0] <= h2.m_ai[0] && h2.m_ai[1] <= h1.m_ai[1]) { return true; } if( h1.m_ai[2] <= h2.m_ai[2] && h2.m_ai[3] <= h1.m_ai[3]) { return true; } return false;}// implements half splitmodebool CHitParser::x_RunAltSplitMode(char cSide, CHit& hit1, CHit& hit2){ int nb = cSide*2; // detect intersection if(hit1.m_ai[nb] <= hit2.m_ai[nb] && hit2.m_ai[nb] <= hit1.m_ai[nb+1]) { int n1 = hit1.m_ai[nb + 1] + 1; int n2 = hit2.m_ai[nb] - 1; if(m_Prot2Nucl) { // frame must be preserved int nShift = n1 - hit2.m_ai[nb]; while(nShift % 3) { nShift++; n1++; } nShift = hit1.m_ai[nb + 1] - n2; while(nShift % 3) { nShift++; n2--; } } hit1.Move(nb + 1, n2, m_Prot2Nucl); hit2.Move(nb, n1, m_Prot2Nucl); hit1.UpdateAttributes(); hit2.UpdateAttributes(); } else if(hit2.m_ai[nb] <= hit1.m_ai[nb] && hit1.m_ai[nb] <= hit2.m_ai[nb+1]) { int n1 = hit1.m_ai[nb] - 1; int n2 = hit2.m_ai[nb + 1] + 1; if(m_Prot2Nucl) { // frame must be preserved int nShift = hit2.m_ai[nb + 1] - n1; while(nShift % 3) { nShift++; n1--; } nShift = n2 - hit1.m_ai[nb]; while(nShift % 3) { nShift++; n2++; } } hit1.Move(nb, n2, m_Prot2Nucl); hit2.Move(nb + 1, n1, m_Prot2Nucl); hit1.UpdateAttributes(); hit2.UpdateAttributes(); } return true;}int GetIntersectionSize (int a0, int b0, int c0, int d0) { int a = min(a0, b0), b = max(a0, b0), c = min(c0, d0), d = max(c0, d0); int an1[] = {a, b}; int an2[] = {c, d}; int* apn[] = {an1, an2}; if(c <= a) { apn[0] = an2; apn[1] = an1; } int n = (apn[0][1] - apn[1][0]) + 1; return n > 0? n: 0;}// accumulate coverage on query or subjectclass AcmCvg: public binary_function<int, const CHit*, int>{public: AcmCvg(char where): m_where(where), m_nFinish(-1) { m_i1 = m_where == 'q'? 0: 2; m_i2 = m_where == 'q'? 1: 3; } int operator() (int iVal, const CHit* ph) { const CHit& h = *ph; if(h.m_ai[m_i1] > m_nFinish) return iVal + (m_nFinish = h.m_ai[m_i2]) - h.m_ai[m_i1] + 1; else { int nFinish0 = m_nFinish; return (h.m_ai[m_i2] > nFinish0)? (iVal + (m_nFinish = h.m_ai[m_i2]) - nFinish0): iVal; } }private: char m_where; int m_nFinish; unsigned char m_i1, m_i2;};int CHitParser::CalcCoverage(vector<CHit>::const_iterator ib, vector<CHit>::const_iterator ie, char where){ vector<const CHit*> vh (ie - ib, (const CHit*)0); for(int i = 0; i < ie - ib; ++i) vh[i] = &(*ib) + i; if(where == 'q') { stable_sort(vh.begin(), vh.end(), CHit::PPrecedeQ_ptr); } else { stable_sort(vh.begin(), vh.end(), CHit::PPrecedeS_ptr); } int s = accumulate(vh.begin(), vh.end(), 0, AcmCvg(where)); return s;}//////// Maximum Score method:// 1. Sort the vector in desc order starting from the current hit// 2. Consider max score hit (the current)// 3. Process the other hits, so the current hit remains intact// and overlapping free// 4. Go to step 1.int CHitParser::x_RunMaxScore(){ // special pre-processing for prot2nucl mode: // remove those hits that intersect on the genomic // side having different reading frame if(m_Prot2Nucl && false) // -- currently disabled { list<vector<CHit>::iterator > l0; vector<CHit>::iterator ivh; for(ivh = m_Out.begin(); ivh != m_Out.end(); ivh++) l0.push_back(ivh); list<vector<CHit>::iterator >::iterator ii; bool bDel = false; for(ii = l0.begin(); ii != l0.end(); ) { list<vector<CHit>::iterator >::iterator jj = ii; for(++jj; jj != l0.end(); jj++) { int& a = (*ii)->m_ai[2]; int& b = (*ii)->m_ai[3]; int& c = (*jj)->m_ai[2]; int& d = (*jj)->m_ai[3]; bool bSameFrame = a%3 == c%3; if( !bSameFrame && GetIntersectionSize(a, b, c, d) > 0) { l0.erase(jj); bDel = true; break; } } if(bDel) { list<vector<CHit>::iterator >::iterator jj0 = ii; jj0++; l0.erase(ii); ii = jj0; bDel = false; } else ii++; } vector<CHit> vh1; for(ii = l0.begin(); ii != l0.end(); ii++) vh1.push_back(**ii); m_Out = vh1; } size_t nQuerySize = 0; if(m_MinQueryCoverage > 0.) { NCBI_THROW( CAlgoAlignException, eInternal, "Query coverage filtering not supported" ); } size_t nSubjSize = 0; if(m_MinSubjCoverage > 0.) { NCBI_THROW( CAlgoAlignException, eInternal, "Subj coverage filtering not supported" ); } // main processing int i0 = 0; bool bRestart = true; while(bRestart) { bRestart = false; for(i0 = 0; i0 < int(m_Out.size())-1; ++i0) { vector<CHit>::iterator ii = m_Out.begin() + i0; // Sort by score (& priority) starting from the current hit sort(ii, m_Out.end(), hit_greater()); // Consider max score hit (the current) CHit& hit = *ii; // Process the other hits, so the current hit // remains intact and overlapping free int j0 = 0; for(j0 = int(m_Out.size())-1; j0 > i0; --j0) { vector<CHit>::iterator jj = m_Out.begin() + j0; if(m_SplitQuery == eSplitMode_Clear || m_SplitSubject == eSplitMode_Clear) { if(x_DetectInclusion(hit, *jj)) { m_Out.erase(jj); continue; } } if(m_SplitQuery == eSplitMode_Clear) { if(x_RunAltSplitMode(0, hit, *jj) == false) { m_Out.erase(jj); continue; } } if(m_SplitSubject == eSplitMode_Clear) { if(x_RunAltSplitMode(1, hit, *jj) == false) { m_Out.erase(jj); continue; } } // We have two possibilities: // (1) there is at least one end of *jj inside the current hit; // (2) when *jj "embraces" the current hit; // (1) - proceed it in cyclic manner bool b [4]; bool bFirstLoop = true; bool bContinue = false; do { if(!bFirstLoop) { size_t i = 0; for(i = 0; i < 4; ++i) { if(b[i]) { switch(i) { case 0: { jj->Move(0, hit.m_ai[1] + 1, m_Prot2Nucl); for(size_t d = 2; d < 4; ++d) { b[d] = hit.m_ai[2] <= jj->m_ai[d] && jj->m_ai[d] <= hit.m_ai[3]; } } break; case 1: { jj->Move(1, hit.m_ai[0] - 1, m_Prot2Nucl); for(size_t d = 2; d < 4; ++d) { b[d] = hit.m_ai[2] <= jj->m_ai[d] && jj->m_ai[d] <= hit.m_ai[3]; } } break; case 2: { jj->Move(2, hit.m_ai[3] + 1, m_Prot2Nucl); for(size_t d = 0; d < 2; ++d) { b[d] = hit.m_ai[0] <= jj->m_ai[d] && jj->m_ai[d] <= hit.m_ai[1]; } } break; case 3: {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -