首页 › 资源下载 › 生物技术 › ncbi源码 › 源码查看
splign_hitparser.cpp

来自「ncbi源码」· C++ 代码 · 共 1,371 行 · 第 1/3 页
CPP
1,371 行
/* * =========================================================================== * PRODUCTION $Log: splign_hitparser.cpp,v $ * PRODUCTION Revision 1000.0  2004/06/01 18:12:53  gouriano * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.7 * PRODUCTION * =========================================================================== *//* $Id: splign_hitparser.cpp,v 1000.0 2004/06/01 18:12:53 gouriano Exp $* ===========================================================================**                            public DOMAIN NOTICE                          *               National Center for Biotechnology Information*                                                                          *  This software/database is a "United States Government Work" under the   *  terms of the United States Copyright Act.  It was written as part of    *  the author's official duties as a United States Government employee and *  thus cannot be copyrighted.  This software/database is freely available *  to the public for use. The National Library of Medicine and the U.S.    *  Government have not placed any restriction on its use or reproduction.  *                                                                          *  Although all reasonable efforts have been taken to ensure the accuracy  *  and reliability of the software and data, the NLM and the U.S.          *  Government do not and cannot warrant the performance or results that    *  may be obtained by using this software or data. The NLM and the U.S.    *  Government disclaim all warranties, express or implied, including       *  warranties of performance, merchantability or fitness for any particular*  purpose.                                                                *                                                                          *  Please cite the author in any work or product based on this material.   ** ===========================================================================** Author:  Yuri Kapustin** File Description:*   Temporary code - will be part of the hit filtering library**/#include <ncbi_pch.hpp>#include "splign_hitparser.hpp"#include <algo/align/align_exception.hpp>#include <corelib/ncbi_limits.hpp>#include <corelib/ncbistre.hpp>#include <algorithm>#include <numeric>BEGIN_NCBI_SCOPE// auxiliary structure utilized in calculation of collisions and other routinesstruct SNode	{    int x; // coordinate    bool type; // true = open, false = close    vector<CHit>::const_iterator pHit; // owner    SNode(int n, bool b, vector<CHit>::const_iterator ph):        x(n), type(b), pHit(ph) {};    bool operator>(const SNode& rhs) const { return x > rhs.x; }    bool operator==(const SNode& rhs) const { return x == rhs.x; }    bool operator<(const SNode& rhs) const { return x < rhs.x; }};//// general single-linkage clustering (SLC) algorithm //struct SNodePair{    SNodePair(int i, int j): n1(i), n2(j) {}    int n1, n2;};template<class T, class PClustCrit, class CClustID_Set, class CClustID_Get>void DoSingleLinkageClustering (   vector<T>& vt,          // vector of objects to be clustered   const PClustCrit& pr2,        // clustering criterion (two-argument predicate)   const CClustID_Set& CID_Set,  // function that assigns ClustID to objects   const CClustID_Get& CID_Get   // function that retrieves ClustID from objects){    vector<SNodePair> vnodes;    // auxiliary vector    size_t nSize = vt.size(), j = 0, i;    for( j = 0; j < nSize; ++j ) {        CID_Set( vt[j], j );        for( i = 0; i < j; ++i ) {            if( pr2(vt[i], vt[j] ) )                vnodes.push_back( SNodePair( i, j ) );         }    }     vector<SNodePair>::const_iterator kk, kend;    for( kk = vnodes.begin(), kend = vnodes.end(); kk != kend; ++kk ) {        int n1 = CID_Get( vt[kk->n1] ), n2 = CID_Get( vt[kk->n2] );        if( n1 != n2 ) {            NON_CONST_ITERATE (typename vector<T>, jj, vt) {                if( CID_Get( *jj ) == n1 )                    CID_Set( *jj, n2 );            }        }    }}CHitParser::CHitParser(){    x_Set2Defaults();}CHitParser::CHitParser(const vector<CHit>& vh, int& nGroupID){    Init(vh, nGroupID);}void CHitParser::Init(const vector<CHit>& vh, int& nGroupID){    m_Out = vh;    x_RemoveEqual();    x_Set2Defaults();    nGroupID++;    m_GroupID = &nGroupID;}void CHitParser::x_Set2Defaults(){    m_SameOrder = true;    m_Strand = eStrand_Auto;    m_MaxHitDistQ = kMax_Int;    m_MaxHitDistS = kMax_Int;    m_Method = eMethod_MaxScore;    m_SplitQuery   = eSplitMode_MaxScore;    m_SplitSubject = eSplitMode_MaxScore;    m_Prot2Nucl = false;    m_CombinationProximity_pre = m_CombinationProximity_post = -1.;    m_group_identification_method = eNone;    m_CovStep = 90.;    m_MinQueryCoverage = 0.;    m_MinSubjCoverage = 0.;    m_OutputAllGroups = false;    m_Query = m_Subj = "";    m_GroupID = 0;        x_CalcGlobalEnvelop();}void CHitParser::x_CalcGlobalEnvelop(){    const size_t dim = m_Out.size();    if(dim == 0) {        m_ange[0] = m_ange[1] = m_ange[2] = m_ange[3] = 0;        return;    }    m_ange[0] = m_ange[2] = kMax_Int;    m_ange[1] = m_ange[3] = kMin_Int;    vector<CHit>::iterator ii, iend;    for(ii = m_Out.begin(), iend = m_Out.end(); ii != iend; ii++)    {        if(m_ange[0] > ii->m_ai[0]) m_ange[0] = ii->m_ai[0];        if(m_ange[2] > ii->m_ai[2]) m_ange[2] = ii->m_ai[2];        if(m_ange[1] < ii->m_ai[1]) m_ange[1] = ii->m_ai[1];        if(m_ange[3] < ii->m_ai[3]) m_ange[3] = ii->m_ai[3];    }}CHitParser::~CHitParser(){}bool  CHitParser::x_DetectInclusion(const CHit& h1, const CHit& h2) const{  if( h2.m_ai[0] <= h1.m_ai[0] && h1.m_ai[1] <= h2.m_ai[1]) {    return true;  }  if( h2.m_ai[2] <= h1.m_ai[2] && h1.m_ai[3] <= h2.m_ai[3]) {    return true;  }  if( h1.m_ai[0] <= h2.m_ai[0] && h2.m_ai[1] <= h1.m_ai[1]) {    return true;  }  if( h1.m_ai[2] <= h2.m_ai[2] && h2.m_ai[3] <= h1.m_ai[3]) {    return true;  }  return false;}// implements half splitmodebool CHitParser::x_RunAltSplitMode(char cSide, CHit& hit1, CHit& hit2){    int nb = cSide*2;    // detect intersection    if(hit1.m_ai[nb] <= hit2.m_ai[nb] && hit2.m_ai[nb] <= hit1.m_ai[nb+1])    {        int n1 = hit1.m_ai[nb + 1] + 1;        int n2 = hit2.m_ai[nb] - 1;        if(m_Prot2Nucl)        {  // frame must be preserved            int nShift = n1 - hit2.m_ai[nb];            while(nShift % 3) { nShift++; n1++; }            nShift = hit1.m_ai[nb + 1] - n2;            while(nShift % 3) { nShift++; n2--; }        }        hit1.Move(nb + 1, n2, m_Prot2Nucl);        hit2.Move(nb, n1, m_Prot2Nucl);        hit1.UpdateAttributes();        hit2.UpdateAttributes();    }    else    if(hit2.m_ai[nb] <= hit1.m_ai[nb] && hit1.m_ai[nb] <= hit2.m_ai[nb+1])    {        int n1 = hit1.m_ai[nb] - 1;        int n2 = hit2.m_ai[nb + 1] + 1;        if(m_Prot2Nucl)        {  // frame must be preserved            int nShift = hit2.m_ai[nb + 1] - n1;            while(nShift % 3) { nShift++; n1--; }            nShift = n2 - hit1.m_ai[nb];            while(nShift % 3) { nShift++; n2++; }        }        hit1.Move(nb, n2, m_Prot2Nucl);        hit2.Move(nb + 1, n1, m_Prot2Nucl);        hit1.UpdateAttributes();        hit2.UpdateAttributes();    }    return true;}int GetIntersectionSize (int a0, int b0, int c0, int d0) {    int a = min(a0, b0), b = max(a0, b0), c = min(c0, d0), d = max(c0, d0);    int an1[] = {a, b};    int an2[] = {c, d};    int* apn[] = {an1, an2};    if(c <= a) {        apn[0] = an2; apn[1] = an1;    }    int n = (apn[0][1] - apn[1][0]) + 1;    return n > 0? n: 0;}// accumulate coverage on query or subjectclass AcmCvg: public binary_function<int, const CHit*, int>{public:    AcmCvg(char where): m_where(where), m_nFinish(-1) {        m_i1 = m_where == 'q'? 0: 2;        m_i2 = m_where == 'q'? 1: 3;    }    int operator() (int iVal, const CHit* ph)    {        const CHit& h = *ph;        if(h.m_ai[m_i1] > m_nFinish)            return iVal + (m_nFinish = h.m_ai[m_i2]) - h.m_ai[m_i1] + 1;        else        {            int nFinish0 = m_nFinish;            return (h.m_ai[m_i2] > nFinish0)?                (iVal + (m_nFinish = h.m_ai[m_i2]) - nFinish0): iVal;        }    }private:    char m_where;    int  m_nFinish;    unsigned char m_i1, m_i2;};int CHitParser::CalcCoverage(vector<CHit>::const_iterator ib,                             vector<CHit>::const_iterator ie,                             char where){    vector<const CHit*> vh (ie - ib, (const CHit*)0);    for(int i = 0; i < ie - ib; ++i)  vh[i] = &(*ib) + i;    if(where == 'q') {        stable_sort(vh.begin(), vh.end(), CHit::PPrecedeQ_ptr);    }    else {        stable_sort(vh.begin(), vh.end(), CHit::PPrecedeS_ptr);    }    int s = accumulate(vh.begin(), vh.end(), 0, AcmCvg(where));    return s;}//////// Maximum Score method:// 1. Sort the vector in desc order starting from the current hit// 2. Consider max score hit (the current)// 3. Process the other hits, so the current hit remains intact//    and overlapping free// 4. Go to step 1.int CHitParser::x_RunMaxScore(){    // special pre-processing for prot2nucl mode:    // remove those hits that intersect on the genomic    // side having different reading frame    if(m_Prot2Nucl && false)  // -- currently disabled    {        list<vector<CHit>::iterator > l0;        vector<CHit>::iterator ivh;        for(ivh = m_Out.begin(); ivh != m_Out.end(); ivh++)            l0.push_back(ivh);        list<vector<CHit>::iterator >::iterator ii;        bool bDel = false;        for(ii = l0.begin(); ii != l0.end(); )        {            list<vector<CHit>::iterator >::iterator jj = ii;            for(++jj; jj != l0.end(); jj++)            {                int& a = (*ii)->m_ai[2];                int& b = (*ii)->m_ai[3];                                int& c = (*jj)->m_ai[2];                int& d = (*jj)->m_ai[3];                bool bSameFrame = a%3 == c%3;                if( !bSameFrame && GetIntersectionSize(a, b, c, d) > 0)                {                    l0.erase(jj);                    bDel = true;                    break;                }            }            if(bDel)            {                list<vector<CHit>::iterator >::iterator jj0 = ii; jj0++;                l0.erase(ii);                ii = jj0;                bDel = false;            }            else                ii++;        }        vector<CHit> vh1;        for(ii = l0.begin(); ii != l0.end(); ii++)            vh1.push_back(**ii);        m_Out = vh1;    }    size_t nQuerySize = 0;    if(m_MinQueryCoverage > 0.) {        NCBI_THROW( CAlgoAlignException,                    eInternal,                    "Query coverage filtering not supported" );    }    size_t nSubjSize = 0;    if(m_MinSubjCoverage > 0.) {        NCBI_THROW( CAlgoAlignException,                    eInternal,                    "Subj coverage filtering not supported" );    }    // main processing    int i0 = 0;    bool bRestart = true;    while(bRestart) {        bRestart = false;        for(i0 = 0; i0 < int(m_Out.size())-1; ++i0) {            vector<CHit>::iterator ii = m_Out.begin() + i0;            // Sort by score (& priority) starting from the current hit            sort(ii, m_Out.end(), hit_greater());            // Consider max score hit (the current)            CHit& hit = *ii;            // Process the other hits, so the current hit            // remains intact and overlapping free            int j0 = 0;            for(j0 = int(m_Out.size())-1; j0 > i0; --j0) {                vector<CHit>::iterator jj = m_Out.begin() + j0;		if(m_SplitQuery == eSplitMode_Clear		   || m_SplitSubject == eSplitMode_Clear) {		  if(x_DetectInclusion(hit, *jj)) {		    m_Out.erase(jj);		    continue;		  }		}                if(m_SplitQuery == eSplitMode_Clear) {                    if(x_RunAltSplitMode(0, hit, *jj) == false) {                        m_Out.erase(jj);                        continue;                    }                }                if(m_SplitSubject == eSplitMode_Clear) {                    if(x_RunAltSplitMode(1, hit, *jj) == false) {                        m_Out.erase(jj);                        continue;                    }                }                // We have two possibilities:                // (1) there is at least one end of *jj inside the current hit;                // (2) when *jj "embraces" the current hit;                // (1) - proceed it in cyclic manner                bool b [4];                bool bFirstLoop = true;                bool bContinue = false;                do {                    if(!bFirstLoop) {                        size_t i = 0;                        for(i = 0; i < 4; ++i) {                            if(b[i]) {                                switch(i) {                                case 0: {                                    jj->Move(0,                                             hit.m_ai[1] + 1,                                             m_Prot2Nucl);                                    for(size_t d = 2; d < 4; ++d) {                                        b[d] = hit.m_ai[2] <= jj->m_ai[d] &&                                               jj->m_ai[d] <= hit.m_ai[3];                                    }                                }                                break;                                case 1: {                                    jj->Move(1,                                             hit.m_ai[0] - 1,                                             m_Prot2Nucl);                                    for(size_t d = 2; d < 4; ++d) {                                        b[d] = hit.m_ai[2] <= jj->m_ai[d] &&                                               jj->m_ai[d] <= hit.m_ai[3];                                    }                                }                                break;                                case 2: {                                    jj->Move(2,                                             hit.m_ai[3] + 1,                                             m_Prot2Nucl);                                    for(size_t d = 0; d < 2; ++d) {                                        b[d] = hit.m_ai[0] <= jj->m_ai[d] &&                                               jj->m_ai[d] <= hit.m_ai[1];                                    }                                }                                break;                                case 3: {
splign_hitparser.cpp - 源码说明

本页面展示了「ncbi源码」中的 splign_hitparser.cpp 源码文件，采用 C++ 编程语言编写，共 1,371 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ncbi相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?