📄 splign.cpp
字号:
/* * =========================================================================== * PRODUCTION $Log: splign.cpp,v $ * PRODUCTION Revision 1000.0 2004/06/01 18:12:28 gouriano * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.12 * PRODUCTION * =========================================================================== *//* $Id: splign.cpp,v 1000.0 2004/06/01 18:12:28 gouriano Exp $* ===========================================================================** public DOMAIN NOTICE * National Center for Biotechnology Information* * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular* purpose. * * Please cite the author in any work or product based on this material. ** ===========================================================================** Author: Yuri Kapustin** File Description:* CSplign class implementation**/#include <ncbi_pch.hpp>#include "splign_compartment_finder.hpp"#include "splign_util.hpp"#include <algo/align/splign/splign.hpp>#include <algo/align/nw_formatter.hpp>#include <algo/align/align_exception.hpp>#include <deque>#include <math.h>#include <algorithm>BEGIN_NCBI_SCOPECSplign::CSplign( void ){ m_min_query_coverage = 0.25; m_compartment_penalty = 0.65; m_minidty = 0.75; m_endgaps = true; m_strand = true; m_max_genomic_ext = 100000; m_nopolya = false; m_model_id = 0;}void CSplign::SetAligner( CRef<CSplicedAligner>& aligner ) { m_aligner = aligner;}const CRef<CSplicedAligner>& CSplign::GetAligner( void ) const { return m_aligner;}void CSplign::SetSeqAccessor( CRef<CSplignSeqAccessor>& sa ) { m_sa = sa;}const CRef<CSplignSeqAccessor>& CSplign::GetSeqAccessor( void ) const { return m_sa;}void CSplign::SetEndGapDetection( bool on ) { m_endgaps = on;}bool CSplign::GetEndGapDetection( void ) const { return m_endgaps;}void CSplign::SetPolyaDetection( bool on ) { m_nopolya = !on;}bool CSplign::GetPolyaDetection( void ) const { return !m_nopolya;}void CSplign::SetStrand( bool strand ) { m_strand = strand;}bool CSplign::GetStrand( void ) const { return m_strand;}void CSplign::SetMinExonIdentity( double idty ){ if(!(0 <= idty && idty <= 1)) { NCBI_THROW( CAlgoAlignException, eBadParameter, "Identity threshold must be between 0 and 1" ); } else { m_minidty = idty; }}double CSplign::GetMinExonIdentity( void ) const { return m_minidty;}void CSplign::SetMaxGenomicExtension( size_t ext ) { m_max_genomic_ext = ext;}size_t CSplign::GetMaxGenomicExtension( void ) const { return m_max_genomic_ext;}void CSplign::SetMinQueryCoverage( double cov ){ if(cov < 0 || cov > 1) { NCBI_THROW( CAlgoAlignException, eBadParameter, "Min query coverage out of range"); } m_min_query_coverage = cov;}double CSplign::GetMinQueryCoverage( void ) const { return m_min_query_coverage;}void CSplign::SetCompartmentPenalty(double penalty){ if(penalty < 0 || penalty > 1) { NCBI_THROW( CAlgoAlignException, eBadParameter, "Min query coverage out of range"); } m_compartment_penalty = penalty;}double CSplign::GetCompartmentPenalty( void ) const { return m_compartment_penalty;}void CSplign::x_SetPattern(THits* hits){ sort(hits->begin(), hits->end(), CHit::PPrecedeQ); vector<size_t> pattern0; for(size_t i = 0, n = hits->size(); i < n; ++i) { const CHit& h = (*hits)[i]; if(1 + h.m_ai[1] - h.m_ai[0] >= 10) { pattern0.push_back( h.m_ai[0] ); pattern0.push_back( h.m_ai[1] ); pattern0.push_back( h.m_ai[2] ); pattern0.push_back( h.m_ai[3] ); } } const char* Seq1 = &m_mrna.front(); const size_t SeqLen1 = m_polya_start < kMax_UInt? m_polya_start: m_mrna.size(); const char* Seq2 = &m_genomic.front(); const size_t SeqLen2 = m_genomic.size(); // verify some conditions on the input hit pattern size_t dim = pattern0.size(); const char* err = 0; if(dim % 4 == 0) { for(size_t i = 0; i < dim; i += 4) { if(pattern0[i] > pattern0[i+1] || pattern0[i+2] > pattern0[i+3]) { err = "Pattern hits must be specified in plus strand"; break; } if(i > 4) { if(pattern0[i]<=pattern0[i-3] || pattern0[i+2]<=pattern0[i-2]){ err = "Pattern hits coordinates must be sorted"; break; } } if(pattern0[i+1] >= SeqLen1 || pattern0[i+3] >= SeqLen2) { err = "One or several pattern hits are out of range"; break; } } } else { err = "Pattern must have a dimension multiple of four"; } if(err) { NCBI_THROW( CAlgoAlignException, eBadParameter, err ); } else { m_alnmap.clear(); m_pattern.clear(); // copy from pattern0 to pattern so that each hit is not too large const size_t max_len = kMax_UInt;// turn this off: sometimes we really // need just the longest perf match // and there is no direct relationship // btw hits and exons vector<size_t> pattern; for(size_t i = 0; i < dim; i += 4) { size_t lenq = 1 + pattern0[i+1] - pattern0[i]; if(lenq <= max_len) { copy(pattern0.begin() + i, pattern0.begin() + i + 4, back_inserter(pattern)); } else { const size_t d = (lenq-1) / max_len + 1; const size_t inc = lenq / d + 1; for(size_t a = pattern0[i], b = a , c = pattern0[i+2], d = c; a < pattern0[i+1]; (a = b + 1), (c = d + 1) ) { b = a + inc - 1; d = c + inc - 1; if(b > pattern0[i+1] || d > pattern0[i+3]) { b = pattern0[i+1]; d = pattern0[i+3]; } pattern.push_back(a); pattern.push_back(b); pattern.push_back(c); pattern.push_back(d); } } } dim = pattern.size(); SAlnMapElem map_elem; map_elem.m_box[0] = map_elem.m_box[2] = 0; map_elem.m_pattern_start = map_elem.m_pattern_end = -1; // realign pattern hits and build the alignment map for(size_t i = 0; i < dim; i += 4) { CNWAligner nwa ( Seq1 + pattern[i], pattern[i+1] - pattern[i] + 1, Seq2 + pattern[i+2], pattern[i+3] - pattern[i+2] + 1 ); nwa.Run(); size_t L1, R1, L2, R2; const size_t max_seg_size = nwa.GetLongestSeg(&L1, &R1, &L2, &R2); if(max_seg_size) { const size_t hitlen_q = pattern[i + 1] - pattern[i] + 1; const size_t hlq3 = hitlen_q/3; const size_t sh = hlq3; size_t delta = sh > L1? sh - L1: 0; size_t q0 = pattern[i] + L1 + delta; size_t s0 = pattern[i+2] + L2 + delta; const size_t h2s_right = hitlen_q - R1 - 1; delta = sh > h2s_right? sh - h2s_right: 0; size_t q1 = pattern[i] + R1 - delta; size_t s1 = pattern[i+2] + R2 - delta; if(q0 > q1 || s0 > s1) { // longest seg was probably too short q0 = pattern[i] + L1; s0 = pattern[i+2] + L2; q1 = pattern[i] + R1; s1 = pattern[i+2] + R2; } m_pattern.push_back(q0); m_pattern.push_back(q1); m_pattern.push_back(s0); m_pattern.push_back(s1); const size_t pattern_dim = m_pattern.size(); if(map_elem.m_pattern_start == -1) { map_elem.m_pattern_start = pattern_dim - 4;; } map_elem.m_pattern_end = pattern_dim - 1; } map_elem.m_box[1] = pattern[i+1]; map_elem.m_box[3] = pattern[i+3]; } map_elem.m_box[1] = SeqLen1 - 1; map_elem.m_box[3] = SeqLen2 - 1; m_alnmap.push_back(map_elem); }}void CSplign::Run( THits* phits ){ if(!phits) { NCBI_THROW( CAlgoAlignException, eInternal, "Unexpected NULL pointers" ); } THits& hits = *phits; if(m_sa.IsNull()) { NCBI_THROW( CAlgoAlignException, eNotInitialized, "Sequence accessor object not specified" ); } if(m_aligner.IsNull()) { NCBI_THROW( CAlgoAlignException, eNotInitialized, "Spliced aligner object not specified" ); } if(hits.size() == 0) { NCBI_THROW( CAlgoAlignException, eNoData, "Empty hit vector passed to CSplign" ); } const string query ( hits[0].m_Query ); const string subj ( hits[0].m_Subj ); m_result.clear(); // pre-load the spliced sequence and calculate min coverage m_mrna.clear(); m_sa->Load(query, &m_mrna, 0, kMax_UInt); const size_t mrna_size = m_mrna.size(); const size_t min_coverage = size_t(m_min_query_coverage * mrna_size); const size_t comp_penalty_bps = size_t(m_compartment_penalty * mrna_size); // iterate through compartments CCompartmentAccessor comps (hits.begin(), hits.end(), comp_penalty_bps, min_coverage); THits comp_hits; size_t smin = 0, smax = kMax_UInt; bool same_strand = false; for(size_t i = 0, dim = comps.GetCount(); i < dim; ++i) { comps.Get(i, comp_hits); // limit the space beyond the compartment // to avoid shared exons if(i+1 == dim) { smax = kMax_UInt; same_strand = false; } else { bool strand_this = comps.GetStrand(i); bool strand_next = comps.GetStrand(i+1); same_strand = strand_this == strand_next; if(same_strand) { const size_t* box0 = comps.GetBox(i); const size_t* box1 = box0 + 4; const size_t dist = box1[2] - box0[3]; size_t qdim0, qdim1; if(strand_this) { qdim0 = mrna_size - box0[1] - 1; qdim1 = box1[0]; } else { qdim0 = box0[0]; qdim1 = mrna_size - box1[1] - 1; } const size_t qdim = qdim0 + qdim1; if(qdim == 0) { smax = box1[2] - 1; } else {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -