📄 nw_aligner.cpp
字号:
/* * =========================================================================== * PRODUCTION $Log: nw_aligner.cpp,v $ * PRODUCTION Revision 1000.3 2004/06/01 18:04:52 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.48 * PRODUCTION * =========================================================================== *//* $Id: nw_aligner.cpp,v 1000.3 2004/06/01 18:04:52 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Authors: Yuri Kapustin, Boris Kiryutin * * File Description: CNWAligner implementation * * =========================================================================== * */#include <ncbi_pch.hpp>#include <algo/align/nw_aligner.hpp>#include <algo/align/align_exception.hpp>BEGIN_NCBI_SCOPE// IUPACna alphabet (default)const char g_nwaligner_nucleotides [] = "AGTCBDHKMNRSVWY";CNWAligner::CNWAligner() : m_Wm(GetDefaultWm()), m_Wms(GetDefaultWms()), m_Wg(GetDefaultWg()), m_Ws(GetDefaultWs()), m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false), m_abc(g_nwaligner_nucleotides), m_prg_callback(0), m_Seq1(0), m_SeqLen1(0), m_Seq2(0), m_SeqLen2(0), m_score(kInfMinus){ SetScoreMatrix(0);}CNWAligner::CNWAligner( const char* seq1, size_t len1, const char* seq2, size_t len2, const SNCBIPackedScoreMatrix* scoremat ) : m_Wm(GetDefaultWm()), m_Wms(GetDefaultWms()), m_Wg(GetDefaultWg()), m_Ws(GetDefaultWs()), m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false), m_abc(g_nwaligner_nucleotides), m_prg_callback(0), m_Seq1(seq1), m_SeqLen1(len1), m_Seq2(seq2), m_SeqLen2(len2), m_score(kInfMinus){ SetScoreMatrix(scoremat); SetSequences(seq1, len1, seq2, len2);}void CNWAligner::SetSequences(const char* seq1, size_t len1, const char* seq2, size_t len2, bool verify){ if(!seq1 || !seq2) { NCBI_THROW(CAlgoAlignException, eBadParameter, "NULL sequence pointer(s) passed"); } if(verify) { size_t iErrPos1 = x_CheckSequence(seq1, len1); if(iErrPos1 < len1) { ostrstream oss; oss << "The first sequence is inconsistent with the current " << "scoring matrix type. Symbol " << seq1[iErrPos1] << " at " << iErrPos1; string message = CNcbiOstrstreamToString(oss); NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message); } size_t iErrPos2 = x_CheckSequence(seq2, len2); if(iErrPos2 < len2) { ostrstream oss; oss << "The second sequence is inconsistent with the current " << "scoring matrix type. Symbol " << seq2[iErrPos2] << " at " << iErrPos2; string message = CNcbiOstrstreamToString(oss); NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message); } } m_Seq1 = seq1; m_SeqLen1 = len1; m_Seq2 = seq2; m_SeqLen2 = len2; m_Transcript.clear();}void CNWAligner::SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2){ m_esf_L1 = Left1; m_esf_R1 = Right1; m_esf_L2 = Left2; m_esf_R2 = Right2;}// evaluate score for each possible alignment;// fill out backtrace matrix// bit coding (four bits per value): D E Ec Fc// D: 1 if diagonal; 0 - otherwise// E: 1 if space in 1st sequence; 0 if space in 2nd sequence// Ec: 1 if gap in 1st sequence was extended; 0 if it is was opened// Fc: 1 if gap in 2nd sequence was extended; 0 if it is was opened//const unsigned char kMaskFc = 0x0001;const unsigned char kMaskEc = 0x0002;const unsigned char kMaskE = 0x0004;const unsigned char kMaskD = 0x0008;CNWAligner::TScore CNWAligner::x_Align(const char* seg1, size_t len1, const char* seg2, size_t len2, vector<ETranscriptSymbol>* transcript){ const size_t N1 = len1 + 1; const size_t N2 = len2 + 1; vector<TScore> stl_rowV (N2), stl_rowF(N2); TScore* rowV = &stl_rowV[0]; TScore* rowF = &stl_rowF[0]; TScore* pV = rowV - 1; const char* seq1 = seg1 - 1; const char* seq2 = seg2 - 1; const TNCBIScore (*sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s; m_terminate = false; if(m_prg_callback) { m_prg_info.m_iter_total = N1*N2; m_prg_info.m_iter_done = 0; if(m_terminate = m_prg_callback(&m_prg_info)) { return 0; } } bool bFreeGapLeft1 = m_esf_L1 && seg1 == m_Seq1; bool bFreeGapRight1 = m_esf_R1 && m_Seq1 + m_SeqLen1 - len1 == seg1; bool bFreeGapLeft2 = m_esf_L2 && seg2 == m_Seq2; bool bFreeGapRight2 = m_esf_R2 && m_Seq2 + m_SeqLen2 - len2 == seg2; TScore wgleft1 = bFreeGapLeft1? 0: m_Wg; TScore wsleft1 = bFreeGapLeft1? 0: m_Ws; TScore wg1 = m_Wg, ws1 = m_Ws; // index calculation: [i,j] = i*n2 + j vector<unsigned char> stl_bm (N1*N2); unsigned char* backtrace_matrix = &stl_bm[0]; // first row size_t k; rowV[0] = wgleft1; for (k = 1; k < N2; k++) { rowV[k] = pV[k] + wsleft1; rowF[k] = kInfMinus; backtrace_matrix[k] = kMaskE | kMaskEc; } rowV[0] = 0; if(m_prg_callback) { m_prg_info.m_iter_done = k; m_terminate = m_prg_callback(&m_prg_info); } // recurrences TScore wgleft2 = bFreeGapLeft2? 0: m_Wg; TScore wsleft2 = bFreeGapLeft2? 0: m_Ws; TScore V = rowV[N2 - 1]; TScore V0 = wgleft2; TScore E, G, n0; unsigned char tracer; size_t i, j; for(i = 1; i < N1 && !m_terminate; ++i) { V = V0 += wsleft2; E = kInfMinus; backtrace_matrix[k++] = kMaskFc; unsigned char ci = seq1[i]; if(i == N1 - 1 && bFreeGapRight1) { wg1 = ws1 = 0; } TScore wg2 = m_Wg, ws2 = m_Ws; for (j = 1; j < N2; ++j, ++k) { G = pV[j] + sm[ci][(unsigned char)seq2[j]]; pV[j] = V; n0 = V + wg1; if(E >= n0) { E += ws1; // continue the gap tracer = kMaskEc; } else { E = n0 + ws1; // open a new gap tracer = 0; } if(j == N2 - 1 && bFreeGapRight2) { wg2 = ws2 = 0; } n0 = rowV[j] + wg2; if(rowF[j] >= n0) { rowF[j] += ws2; tracer |= kMaskFc; } else { rowF[j] = n0 + ws2; } if (E >= rowF[j]) { if(E >= G) { V = E; tracer |= kMaskE; } else { V = G; tracer |= kMaskD; } } else { if(rowF[j] >= G) { V = rowF[j]; } else { V = G; tracer |= kMaskD; } } backtrace_matrix[k] = tracer; } pV[j] = V; if(m_prg_callback) { m_prg_info.m_iter_done = k; if(m_terminate = m_prg_callback(&m_prg_info)) { break; } } } if(!m_terminate) { x_DoBackTrace(backtrace_matrix, N1, N2, transcript); } return V;}CNWAligner::TScore CNWAligner::Run(){ if(!m_Seq1 || !m_Seq2) { NCBI_THROW(CAlgoAlignException, eNoData, "Sequence data not available for one or both sequences"); } if(!x_CheckMemoryLimit()) { NCBI_THROW(CAlgoAlignException, eMemoryLimit, "Out of space"); } m_score = x_Run(); return m_score;}CNWAligner::TScore CNWAligner::x_Run(){ try { if(m_guides.size() == 0) { m_score = x_Align(m_Seq1,m_SeqLen1, m_Seq2,m_SeqLen2, &m_Transcript); } else { // run the algorithm for every segment between hits m_Transcript.clear(); size_t guides_dim = m_guides.size() / 4; size_t q1 = m_SeqLen1, q0, s1 = m_SeqLen2, s0; vector<ETranscriptSymbol> trans; // save original esf settings bool esf_L1 = m_esf_L1, esf_R1 = m_esf_R1, esf_L2 = m_esf_L2, esf_R2 = m_esf_R2; SetEndSpaceFree(false, esf_R1, false, esf_R2); // last region for(size_t istart = 4*guides_dim, i = istart; i != 0; i -= 4) { q0 = m_guides[i - 3] + 1; s0 = m_guides[i - 1] + 1; size_t dim_query = q1 - q0, dim_subj = s1 - s0; x_Align(m_Seq1 + q0, dim_query, m_Seq2 + s0, dim_subj, &trans); copy(trans.begin(), trans.end(), back_inserter(m_Transcript)); size_t dim_hit = m_guides[i - 3] - m_guides[i - 4] + 1; for(size_t k = 0; k < dim_hit; ++k) { m_Transcript.push_back(eTS_Match); } q1 = m_guides[i - 4]; s1 = m_guides[i - 2]; trans.clear(); if(i == istart) { // middle regions SetEndSpaceFree(false, false, false, false); } } SetEndSpaceFree(esf_L1, false, esf_L2, false); // first region x_Align(m_Seq1, q1, m_Seq2, s1, &trans); SetEndSpaceFree(esf_L1, esf_R1, esf_L2, esf_R2); // restore copy(trans.begin(), trans.end(), back_inserter(m_Transcript)); m_score = x_ScoreByTranscript(); } } catch( bad_alloc& ) { NCBI_THROW(CAlgoAlignException, eMemoryLimit, "Memory limit exceeded"); } return m_score;}// perform backtrace step;void CNWAligner::x_DoBackTrace(const unsigned char* backtrace, size_t N1, size_t N2, vector<ETranscriptSymbol>* transcript){ transcript->clear(); transcript->reserve(N1 + N2);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -