首页 › 资源下载 › 生物技术 › ncbi源码 › 源码查看
nw_aligner.cpp

来自「ncbi源码」· C++ 代码 · 共 1,134 行 · 第 1/3 页
CPP
1,134 行
/* * =========================================================================== * PRODUCTION $Log: nw_aligner.cpp,v $ * PRODUCTION Revision 1000.3  2004/06/01 18:04:52  gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.48 * PRODUCTION * =========================================================================== *//* $Id: nw_aligner.cpp,v 1000.3 2004/06/01 18:04:52 gouriano Exp $ * =========================================================================== * *                            PUBLIC DOMAIN NOTICE                           *               National Center for Biotechnology Information *                                                                           *  This software/database is a "United States Government Work" under the    *  terms of the United States Copyright Act.  It was written as part of     *  the author's official duties as a United States Government employee and  *  thus cannot be copyrighted.  This software/database is freely available  *  to the public for use. The National Library of Medicine and the U.S.     *  Government have not placed any restriction on its use or reproduction.   *                                                                           *  Although all reasonable efforts have been taken to ensure the accuracy   *  and reliability of the software and data, the NLM and the U.S.            *  Government do not and cannot warrant the performance or results that     *  may be obtained by using this software or data. The NLM and the U.S.     *  Government disclaim all warranties, express or implied, including        *  warranties of performance, merchantability or fitness for any particular *  purpose.                                                                 *                                                                           *  Please cite the author in any work or product based on this material.    * * =========================================================================== * * Authors:  Yuri Kapustin, Boris Kiryutin * * File Description:  CNWAligner implementation *                    * =========================================================================== * */#include <ncbi_pch.hpp>#include <algo/align/nw_aligner.hpp>#include <algo/align/align_exception.hpp>BEGIN_NCBI_SCOPE// IUPACna alphabet (default)const char g_nwaligner_nucleotides [] = "AGTCBDHKMNRSVWY";CNWAligner::CNWAligner()    : m_Wm(GetDefaultWm()),      m_Wms(GetDefaultWms()),      m_Wg(GetDefaultWg()),      m_Ws(GetDefaultWs()),      m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),      m_abc(g_nwaligner_nucleotides),      m_prg_callback(0),      m_Seq1(0), m_SeqLen1(0),      m_Seq2(0), m_SeqLen2(0),      m_score(kInfMinus){    SetScoreMatrix(0);}CNWAligner::CNWAligner( const char* seq1, size_t len1,                        const char* seq2, size_t len2,                        const SNCBIPackedScoreMatrix* scoremat )    : m_Wm(GetDefaultWm()),      m_Wms(GetDefaultWms()),      m_Wg(GetDefaultWg()),      m_Ws(GetDefaultWs()),      m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),      m_abc(g_nwaligner_nucleotides),      m_prg_callback(0),      m_Seq1(seq1), m_SeqLen1(len1),      m_Seq2(seq2), m_SeqLen2(len2),      m_score(kInfMinus){    SetScoreMatrix(scoremat);    SetSequences(seq1, len1, seq2, len2);}void CNWAligner::SetSequences(const char* seq1, size_t len1,			      const char* seq2, size_t len2,			      bool verify){    if(!seq1 || !seq2) {        NCBI_THROW(CAlgoAlignException, eBadParameter,                   "NULL sequence pointer(s) passed");    }    if(verify) {        size_t iErrPos1 = x_CheckSequence(seq1, len1);	if(iErrPos1 < len1) {	    ostrstream oss;	    oss << "The first sequence is inconsistent with the current "		<< "scoring matrix type. Symbol " << seq1[iErrPos1] << " at "		<< iErrPos1;	    string message = CNcbiOstrstreamToString(oss);	    NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message);	}	size_t iErrPos2 = x_CheckSequence(seq2, len2);	if(iErrPos2 < len2) {	    ostrstream oss;	    oss << "The second sequence is inconsistent with the current "		<< "scoring matrix type. Symbol " << seq2[iErrPos2] << " at "		<< iErrPos2;	    string message = CNcbiOstrstreamToString(oss);	    NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message);	}    }    m_Seq1 = seq1;    m_SeqLen1 = len1;    m_Seq2 = seq2;    m_SeqLen2 = len2;    m_Transcript.clear();}void CNWAligner::SetEndSpaceFree(bool Left1, bool Right1,                                 bool Left2, bool Right2){    m_esf_L1 = Left1;    m_esf_R1 = Right1;    m_esf_L2 = Left2;    m_esf_R2 = Right2;}// evaluate score for each possible alignment;// fill out backtrace matrix// bit coding (four bits per value): D E Ec Fc// D:  1 if diagonal; 0 - otherwise// E:  1 if space in 1st sequence; 0 if space in 2nd sequence// Ec: 1 if gap in 1st sequence was extended; 0 if it is was opened// Fc: 1 if gap in 2nd sequence was extended; 0 if it is was opened//const unsigned char kMaskFc  = 0x0001;const unsigned char kMaskEc  = 0x0002;const unsigned char kMaskE   = 0x0004;const unsigned char kMaskD   = 0x0008;CNWAligner::TScore CNWAligner::x_Align(const char* seg1, size_t len1,                                       const char* seg2, size_t len2,                                       vector<ETranscriptSymbol>* transcript){    const size_t N1 = len1 + 1;    const size_t N2 = len2 + 1;    vector<TScore> stl_rowV (N2), stl_rowF(N2);    TScore* rowV    = &stl_rowV[0];    TScore* rowF    = &stl_rowF[0];    TScore* pV = rowV - 1;    const char* seq1 = seg1 - 1;    const char* seq2 = seg2 - 1;    const TNCBIScore (*sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;    m_terminate = false;    if(m_prg_callback) {        m_prg_info.m_iter_total = N1*N2;        m_prg_info.m_iter_done = 0;        if(m_terminate = m_prg_callback(&m_prg_info)) {	  return 0;	}    }    bool bFreeGapLeft1  = m_esf_L1 && seg1 == m_Seq1;    bool bFreeGapRight1 = m_esf_R1 && m_Seq1 + m_SeqLen1 - len1 == seg1;    bool bFreeGapLeft2  = m_esf_L2 && seg2 == m_Seq2;    bool bFreeGapRight2 = m_esf_R2 && m_Seq2 + m_SeqLen2 - len2 == seg2;    TScore wgleft1   = bFreeGapLeft1? 0: m_Wg;    TScore wsleft1   = bFreeGapLeft1? 0: m_Ws;    TScore wg1 = m_Wg, ws1 = m_Ws;    // index calculation: [i,j] = i*n2 + j    vector<unsigned char> stl_bm (N1*N2);    unsigned char* backtrace_matrix = &stl_bm[0];    // first row    size_t k;    rowV[0] = wgleft1;    for (k = 1; k < N2; k++) {        rowV[k] = pV[k] + wsleft1;        rowF[k] = kInfMinus;        backtrace_matrix[k] = kMaskE | kMaskEc;    }    rowV[0] = 0;	    if(m_prg_callback) {        m_prg_info.m_iter_done = k;        m_terminate = m_prg_callback(&m_prg_info);    }    // recurrences    TScore wgleft2   = bFreeGapLeft2? 0: m_Wg;    TScore wsleft2   = bFreeGapLeft2? 0: m_Ws;    TScore V  = rowV[N2 - 1];    TScore V0 = wgleft2;    TScore E, G, n0;    unsigned char tracer;    size_t i, j;    for(i = 1;  i < N1 && !m_terminate;  ++i) {                V = V0 += wsleft2;        E = kInfMinus;        backtrace_matrix[k++] = kMaskFc;        unsigned char ci = seq1[i];        if(i == N1 - 1 && bFreeGapRight1) {                wg1 = ws1 = 0;        }        TScore wg2 = m_Wg, ws2 = m_Ws;        for (j = 1; j < N2; ++j, ++k) {            G = pV[j] + sm[ci][(unsigned char)seq2[j]];            pV[j] = V;            n0 = V + wg1;            if(E >= n0) {                E += ws1;      // continue the gap                tracer = kMaskEc;            }            else {                E = n0 + ws1;  // open a new gap                tracer = 0;            }            if(j == N2 - 1 && bFreeGapRight2) {                wg2 = ws2 = 0;            }            n0 = rowV[j] + wg2;            if(rowF[j] >= n0) {                rowF[j] += ws2;                tracer |= kMaskFc;            }            else {                rowF[j] = n0 + ws2;            }            if (E >= rowF[j]) {                if(E >= G) {                    V = E;                    tracer |= kMaskE;                }                else {                    V = G;                    tracer |= kMaskD;                }            } else {                if(rowF[j] >= G) {                    V = rowF[j];                }                else {                    V = G;                    tracer |= kMaskD;                }            }            backtrace_matrix[k] = tracer;        }        pV[j] = V;        if(m_prg_callback) {            m_prg_info.m_iter_done = k;            if(m_terminate = m_prg_callback(&m_prg_info)) {                break;            }        }           }    if(!m_terminate) {        x_DoBackTrace(backtrace_matrix, N1, N2, transcript);    }    return V;}CNWAligner::TScore CNWAligner::Run(){    if(!m_Seq1 || !m_Seq2) {        NCBI_THROW(CAlgoAlignException, eNoData,                   "Sequence data not available for one or both sequences");    }    if(!x_CheckMemoryLimit()) {        NCBI_THROW(CAlgoAlignException, eMemoryLimit, "Out of space");    }    m_score = x_Run();    return m_score;}CNWAligner::TScore CNWAligner::x_Run(){    try {            if(m_guides.size() == 0) {            m_score = x_Align(m_Seq1,m_SeqLen1, m_Seq2,m_SeqLen2,                              &m_Transcript);        }        else {                        // run the algorithm for every segment between hits            m_Transcript.clear();            size_t guides_dim = m_guides.size() / 4;            size_t q1 = m_SeqLen1, q0, s1 = m_SeqLen2, s0;            vector<ETranscriptSymbol> trans;                        // save original esf settings            bool esf_L1 = m_esf_L1, esf_R1 = m_esf_R1,                 esf_L2 = m_esf_L2, esf_R2 = m_esf_R2;            SetEndSpaceFree(false, esf_R1, false, esf_R2); // last region            for(size_t istart = 4*guides_dim, i = istart; i != 0; i -= 4) {                q0 = m_guides[i - 3] + 1;                s0 = m_guides[i - 1] + 1;                size_t dim_query = q1 - q0, dim_subj = s1 - s0;                x_Align(m_Seq1 + q0, dim_query, m_Seq2 + s0, dim_subj, &trans);                copy(trans.begin(), trans.end(), back_inserter(m_Transcript));                size_t dim_hit = m_guides[i - 3] - m_guides[i - 4] + 1;                for(size_t k = 0; k < dim_hit; ++k) {                    m_Transcript.push_back(eTS_Match);                }                q1 = m_guides[i - 4];                s1 = m_guides[i - 2];                trans.clear();                if(i == istart) {  // middle regions                    SetEndSpaceFree(false, false, false, false);                }            }            SetEndSpaceFree(esf_L1, false, esf_L2, false); // first region            x_Align(m_Seq1, q1, m_Seq2, s1, &trans);            SetEndSpaceFree(esf_L1, esf_R1, esf_L2, esf_R2); // restore            copy(trans.begin(), trans.end(), back_inserter(m_Transcript));            m_score = x_ScoreByTranscript();        }    }        catch( bad_alloc& ) {                NCBI_THROW(CAlgoAlignException, eMemoryLimit, "Memory limit exceeded");    }        return m_score;}// perform backtrace step;void CNWAligner::x_DoBackTrace(const unsigned char* backtrace,                               size_t N1, size_t N2,                               vector<ETranscriptSymbol>* transcript){    transcript->clear();    transcript->reserve(N1 + N2);
nw_aligner.cpp - 源码说明

本页面展示了「ncbi源码」中的 nw_aligner.cpp 源码文件，采用 C++ 编程语言编写，共 1,134 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ncbi相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?