📄 nw_aligner.cpp
字号:
size_t k = N1*N2 - 1; while (k != 0) { unsigned char Key = backtrace[k]; if (Key & kMaskD) { transcript->push_back(eTS_Match); k -= N2 + 1; } else if (Key & kMaskE) { transcript->push_back(eTS_Insert); --k; while(k > 0 && (Key & kMaskEc)) { transcript->push_back(eTS_Insert); Key = backtrace[k--]; } } else { transcript->push_back(eTS_Delete); k -= N2; while(k > 0 && (Key & kMaskFc)) { transcript->push_back(eTS_Delete); Key = backtrace[k]; k -= N2; } } }}void CNWAligner::SetPattern(const vector<size_t>& guides){ size_t dim = guides.size(); const char* err = 0; if(dim % 4 == 0) { for(size_t i = 0; i < dim; i += 4) { if( guides[i] > guides[i+1] || guides[i+2] > guides[i+3] ) { err = "Pattern hits must be specified in plus strand"; break; } if(i > 4) { if(guides[i] <= guides[i-3] || guides[i+2] <= guides[i-2]){ err = "Pattern hits coordinates must be sorted"; break; } } size_t dim1 = guides[i + 1] - guides[i]; size_t dim2 = guides[i + 3] - guides[i + 2]; if( dim1 != dim2) { err = "Pattern hits must have equal length on both sequences"; break; } if(guides[i+1] >= m_SeqLen1 || guides[i+3] >= m_SeqLen2) { err = "One or several pattern hits are out of range"; break; } } } else { err = "Pattern must have a dimension multiple of four"; } if(err) { NCBI_THROW(CAlgoAlignException, eBadParameter, err); } else { m_guides = guides; }}void CNWAligner::GetEndSpaceFree(bool* L1, bool* R1, bool* L2, bool* R2) const{ if(L1) *L1 = m_esf_L1; if(R1) *R1 = m_esf_R1; if(L2) *L2 = m_esf_L2; if(R2) *R2 = m_esf_R2;}// Return transcript as a readable stringstring CNWAligner::GetTranscriptString(void) const{ const size_t dim = m_Transcript.size(); string s; s.resize(dim); size_t i1 = 0, i2 = 0, i = 0; for (int k = dim - 1; k >= 0; --k) { ETranscriptSymbol c0 = m_Transcript[k]; char c = 0; switch ( c0 ) { case eTS_Match: case eTS_Replace: { c = (m_Seq1[i1++] == m_Seq2[i2++])? 'M': 'R'; } break; case eTS_Insert: { c = 'I'; ++i2; } break; case eTS_Delete: { c = 'D'; ++i1; } break; case eTS_Intron: { c = '+'; ++i2; } break; default: { NCBI_THROW(CAlgoAlignException, eInternal, "Unexpected transcript symbol"); } } s[i++] = c; } if(i < s.size()) { s.resize(i + 1); } return s;}void CNWAligner::SetProgressCallback ( FProgressCallback prg_callback, void* data ){ m_prg_callback = prg_callback; m_prg_info.m_data = data;} void CNWAligner::SetScoreMatrix(const SNCBIPackedScoreMatrix* psm){ if(psm) { m_abc = psm->symbols; NCBISM_Unpack(psm, &m_ScoreMatrix); } else { // assume IUPACna m_abc = g_nwaligner_nucleotides; memset(&m_ScoreMatrix.s, m_Wms, sizeof m_ScoreMatrix.s); // set diag to Wm except ambiguity symbols unsigned char c1, c2; const size_t kNuclSize = strlen(g_nwaligner_nucleotides); for(size_t i = 0; i < kNuclSize; ++i) { c1 = g_nwaligner_nucleotides[i]; for(size_t j = 0; j < i; ++j) { c2 = g_nwaligner_nucleotides[j]; m_ScoreMatrix.s[c1][c2] = m_ScoreMatrix.s[c2][c1] = m_Wms; } m_ScoreMatrix.s[c1][c1] = i < 4? m_Wm: m_Wms; } }}// Check that all characters in sequence are valid for // the current sequence type.// Return an index to the first invalid character// or len if all characters are valid.size_t CNWAligner::x_CheckSequence(const char* seq, size_t len) const{ char Flags [256]; memset(Flags, 0, sizeof Flags); const size_t abc_size = strlen(m_abc); size_t k; for(k = 0; k < abc_size; ++k) { Flags[unsigned(m_abc[k])] = 1; } for(k = 0; k < len; ++k) { if(Flags[unsigned(seq[k])] == 0) break; } return k;}CNWAligner::TScore CNWAligner::GetScore() const { if(m_Transcript.size()) { return m_score; } else { NCBI_THROW(CAlgoAlignException, eNoData, "Sequences not aligned yet"); }}bool CNWAligner::x_CheckMemoryLimit(){ const size_t elem_size = GetElemSize(); const size_t gdim = m_guides.size(); const size_t max_mem = kMax_UInt / 2; if(gdim) { size_t dim1 = m_guides[0], dim2 = m_guides[2]; double mem = double(dim1)*dim2*elem_size; if(mem >= max_mem) { return false; } for(size_t i = 4; i < gdim; i += 4) { dim1 = m_guides[i] - m_guides[i-3] + 1; dim2 = m_guides[i + 2] - m_guides[i-1] + 1; mem = double(dim1)*dim2*elem_size; if(mem >= max_mem) { return false; } } dim1 = m_SeqLen1 - m_guides[gdim-3]; dim2 = m_SeqLen2 - m_guides[gdim-1]; mem = double(dim1)*dim2*elem_size; if(mem >= max_mem) { return false; } return true; } else { double mem = double(m_SeqLen1 + 1)*(m_SeqLen2 + 1)*elem_size; return mem < max_mem; }}CNWAligner::TScore CNWAligner::x_ScoreByTranscript() const{ const size_t dim = m_Transcript.size(); if(dim == 0) return 0; vector<ETranscriptSymbol> transcript (dim); for(size_t i = 0; i < dim; ++i) { transcript[i] = m_Transcript[dim - i - 1]; } TScore score = 0; const char* p1 = m_Seq1; const char* p2 = m_Seq2; int state1; // 0 = normal, 1 = gap, 2 = intron int state2; // 0 = normal, 1 = gap switch(transcript[0]) { case eTS_Match: state1 = state2 = 0; break; case eTS_Insert: state1 = 1; state2 = 0; score += m_Wg; break; case eTS_Delete: state1 = 0; state2 = 1; score += m_Wg; break; default: { NCBI_THROW(CAlgoAlignException, eInternal, "Invalid transcript symbol"); } } const TNCBIScore (*sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s; for(size_t i = 0; i < dim; ++i) { unsigned char c1 = m_Seq1? *p1: 'N'; unsigned char c2 = m_Seq2? *p2: 'N'; switch(transcript[i]) { case eTS_Match: { state1 = state2 = 0; score += sm[c1][c2]; ++p1; ++p2; } break; case eTS_Insert: { if(state1 != 1) score += m_Wg; state1 = 1; state2 = 0; score += m_Ws; ++p2; } break; case eTS_Delete: { if(state2 != 1) score += m_Wg; state1 = 0; state2 = 1; score += m_Ws; ++p1; } break; default: { NCBI_THROW(CAlgoAlignException, eInternal, "Invalid transcript symbol"); } } } if(m_esf_L1) { size_t g = 0; for(size_t i = 0; i < dim; ++i) { if(transcript[i] == eTS_Insert) ++g; else break; } if(g > 0) { score -= (m_Wg + g*m_Ws); } } if(m_esf_L2) { size_t g = 0; for(size_t i = 0; i < dim; ++i) { if(transcript[i] == eTS_Delete) ++g; else break; } if(g > 0) { score -= (m_Wg + g*m_Ws); } } if(m_esf_R1) { size_t g = 0; for(int i = dim - 1; i >= 0; --i) { if(transcript[i] == eTS_Insert) ++g; else break; } if(g > 0) { score -= (m_Wg + g*m_Ws); } } if(m_esf_R2) { size_t g = 0; for(int i = dim - 1; i >= 0; --i) { if(transcript[i] == eTS_Delete) ++g; else break; } if(g > 0) { score -= (m_Wg + g*m_Ws); } } return score;}size_t CNWAligner::GetLeftSeg(size_t* q0, size_t* q1, size_t* s0, size_t* s1, size_t min_size) const{ size_t trdim = m_Transcript.size(); size_t cur = 0, maxseg = 0; const char* p1 = m_Seq1; const char* p2 = m_Seq2; size_t i0 = 0, j0 = 0, imax = i0, jmax = j0; for(int k = trdim - 1; k >= 0; --k) { switch(m_Transcript[k]) { case eTS_Insert: { ++p2; if(cur > maxseg) { maxseg = cur; imax = i0; jmax = j0; if(maxseg >= min_size) goto ret_point; } cur = 0; } break; case eTS_Delete: { ++p1; if(cur > maxseg) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -