📄 splign.cpp
字号:
const double a0 = double(qdim0) / qdim; smax = size_t(floor (box0[3] + a0*dist)); if(smax == box1[2]) { --smax; } } } else { // !same_strand smax = kMax_UInt; } } try { SAlignedCompartment ac = x_RunOnCompartment(&comp_hits, smin, smax); ac.m_id = ++m_model_id; ac.m_segments = m_segments; ac.m_error = false; ac.m_msg = "Ok"; m_result.push_back(ac); } catch(CException& e) { m_result.push_back(SAlignedCompartment(0, true, e.GetMsg().c_str())); } smin = same_strand? (smax + 1): 0; }}// naive polya detectionsize_t CSplign::x_TestPolyA(void){ const size_t dim = m_mrna.size(); int i = dim - 1; for(; i >= 0; --i) { if(m_mrna[i] != 'A') break; } const size_t len = dim - i - 1;; return len > 3 ? i + 1 : kMax_UInt;}CSplign::SAlignedCompartment CSplign::x_RunOnCompartment( THits* hits, size_t range_left, size_t range_right ){ SAlignedCompartment rv; m_segments.clear(); if(range_left > range_right) { NCBI_THROW( CAlgoAlignException, eInternal, "Invalid range data"); } XFilter(hits); if(hits->size() == 0) { NCBI_THROW( CAlgoAlignException, eNoData, "No hits left after filtering"); } const string query ( hits->front().m_Query ); const string subj ( hits->front().m_Subj ); const size_t mrna_size = m_mrna.size(); if( !m_strand ) { // make reverse complimentary reverse (m_mrna.begin(), m_mrna.end()); transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment()); // adjust the hits for(size_t i = 0, n = hits->size(); i < n; ++i) { CHit& h = (*hits)[i]; bool plus = h.IsStraight(); size_t a0 = mrna_size - h.m_ai[0] + 1; size_t a1 = mrna_size - h.m_ai[1] + 1; h.m_an[0] = h.m_ai[0] = a1; h.m_an[1] = h.m_ai[1] = a0; // change strand if(plus) { h.m_an[2] = h.m_ai[3]; h.m_an[3] = h.m_ai[2]; } else { h.m_an[2] = h.m_ai[2]; h.m_an[3] = h.m_ai[3]; } } } m_polya_start = m_nopolya? kMax_UInt: x_TestPolyA(); if(m_polya_start < kMax_UInt) { CleaveOffByTail(hits, m_polya_start + 1); // cleave off hits beyond cds if(hits->size() == 0) { NCBI_THROW( CAlgoAlignException, eNoData, "No hits found beyond Poly(A), if any"); } } // find regions of interest on mRna (query) // and contig (subj) size_t qmin, qmax, smin, smax; GetHitsMinMax(*hits, &qmin, &qmax, &smin, &smax); --qmin; --qmax; --smin; --smax; qmin = 0; qmax = m_polya_start < kMax_UInt? m_polya_start - 1: mrna_size - 1; smin = max(0, int(smin - m_max_genomic_ext)); smax += m_max_genomic_ext; if(smin < range_left) { smin = range_left; } if(smax > range_right) { smax = range_right; } bool ctg_strand = (*hits)[0].IsStraight(); m_genomic.clear(); m_sa->Load(subj, &m_genomic, smin, smax); const size_t ctg_end = smin + m_genomic.size(); if(ctg_end - 1 < smax) { // perhabs adjust smax smax = ctg_end - 1; } if(!ctg_strand) { // make reverse complementary // for the contig's area of interest reverse (m_genomic.begin(), m_genomic.end()); transform(m_genomic.begin(), m_genomic.end(), m_genomic.begin(), SCompliment()); // flip the hits for(size_t i = 0, n = hits->size(); i < n; ++i) { CHit& h = (*hits)[i]; size_t a2 = smax - (h.m_ai[3] - smin) + 2; size_t a3 = smax - (h.m_ai[2] - smin) + 2; h.m_an[2] = h.m_ai[2] = a2; h.m_an[3] = h.m_ai[3] = a3; } } rv.m_QueryStrand = m_strand; rv.m_SubjStrand = ctg_strand; rv.m_qmin = qmin; rv.m_smin = smin; rv.m_smax = smax; rv.m_mrnasize = mrna_size; // shift hits so that they originate from qmin, smin; // also make them zero-based for(size_t i = 0, n = hits->size(); i < n; ++i) { CHit& h = (*hits)[i]; h.m_an[0] = h.m_ai[0] -= qmin + 1; h.m_an[1] = h.m_ai[1] -= qmin + 1; h.m_an[2] = h.m_ai[2] -= smin + 1; h.m_an[3] = h.m_ai[3] -= smin + 1; } x_SetPattern( hits ); x_Run(&m_mrna.front(), &m_genomic.front()); const size_t seg_dim = m_segments.size(); if(seg_dim == 0) { NCBI_THROW( CAlgoAlignException, eNoData, "No alignment found."); } // try to extend the last segment into the PolyA area if(m_polya_start < kMax_UInt && seg_dim && m_segments[seg_dim-1].m_exon) { CSplign::SSegment& s = const_cast<CSplign::SSegment&>( m_segments[seg_dim-1]); const char* p0 = &m_mrna.front() + s.m_box[1] + 1; const char* q = &m_genomic.front() + s.m_box[3] + 1; const char* p = p0; const char* pe = &m_mrna.front() + mrna_size; const char* qe = &m_genomic.front() + m_genomic.size(); for(; p < pe && q < qe; ++p, ++q) { if(*p != 'A') break; if(*p != *q) break; } const size_t sh = p - p0; if(sh) { // resize s.m_box[1] += sh; s.m_box[3] += sh; s.m_details.append(sh, 'M'); s.RestoreIdentity(); // correct annotation const size_t ann_dim = s.m_annot.size(); if(ann_dim > 2 && s.m_annot[ann_dim - 3] == '>') { ++q; s.m_annot[ann_dim - 2] = q < qe? *q: ' '; ++q; s.m_annot[ann_dim - 1] = q < qe? *q: ' '; } m_polya_start += sh; } } int j = seg_dim - 1; // look for PolyA in trailing segments: // if a segment is mostly 'A's then we add it to PolyA for(; j >= 0; --j) { const CSplign::SSegment& s = m_segments[j]; const char* p0 = &m_mrna[qmin] + s.m_box[0]; const char* p1 = &m_mrna[qmin] + s.m_box[1] + 1; size_t count = 0; for(const char* pc = p0; pc != p1; ++pc) { if(*pc == 'A') ++count; } const size_t len = p1 - p0; double min_a_content = 0.799; // also check splices if(s.m_exon && j > 0 && m_segments[j-1].m_exon) { if(!IsConsensus(m_segments[j-1].GetDonor(), s.GetAcceptor())) { min_a_content = 0.599; } } if(!s.m_exon) { min_a_content = 0.599; } if(double(count)/len < min_a_content) { break; } } if(j >= 0 && j < int(seg_dim - 1)) { m_polya_start = m_segments[j].m_box[1] + 1; } // test if we have at least one exon bool some_exons = false; for(int i = 0; i <= j; ++i ) { if(m_segments[i].m_exon) { some_exons = true; break; } } if(!some_exons) { NCBI_THROW( CAlgoAlignException, eNoData, "No exons found above identity limit."); } m_segments.resize(j + 1); return rv;}void CSplign::x_Run(const char* Seq1, const char* Seq2){ deque<SSegment> segments; for(size_t i = 0, map_dim = m_alnmap.size(); i < map_dim; ++i) { const SAlnMapElem& zone = m_alnmap[i]; // setup sequences m_aligner->SetSequences(Seq1 + zone.m_box[0], zone.m_box[1] - zone.m_box[0] + 1, Seq2 + zone.m_box[2], zone.m_box[3] - zone.m_box[2] + 1, false); // prepare the pattern vector<size_t> pattern; if(zone.m_pattern_start >= 0) { copy(m_pattern.begin() + zone.m_pattern_start, m_pattern.begin() + zone.m_pattern_end + 1, back_inserter(pattern)); for(size_t j = 0, pt_dim = pattern.size(); j < pt_dim; j += 4) {//#define DBG_DUMP_PATTERN#ifdef DBG_DUMP_PATTERN cerr << pattern[j] << '\t' << pattern[j+1] << '\t' << pattern[j+2] << '\t' << pattern[j+3] << endl;#endif pattern[j] -= zone.m_box[0]; pattern[j+1] -= zone.m_box[0]; pattern[j+2] -= zone.m_box[2]; pattern[j+3] -= zone.m_box[2]; } if(pattern.size()) { m_aligner->SetPattern(pattern); } // setup esf m_aligner->SetEndSpaceFree(true, true, true, true); // align m_aligner->Run();// #define DBG_DUMP_TYPE2#ifdef DBG_DUMP_TYPE2 {{ CNWFormatter fmt (*m_aligner); string txt; fmt.AsText(&txt, CNWFormatter::eFormatType2); cerr << txt; }} #endif // create list of segments CNWFormatter formatter (*m_aligner); string exons; formatter.AsText(&exons, CNWFormatter::eFormatExonTableEx); CNcbiIstrstream iss_exons (exons.c_str()); while(iss_exons) { string id1, id2, txt, repr; size_t q0, q1, s0, s1, size; double idty; iss_exons >> id1 >> id2 >> idty >> size >> q0 >> q1 >> s0 >> s1 >> txt >> repr; if(!iss_exons) break; q0 += zone.m_box[0]; q1 += zone.m_box[0]; s0 += zone.m_box[2]; s1 += zone.m_box[2]; SSegment e; e.m_exon = true; e.m_idty = idty; e.m_len = size; e.m_box[0] = q0; e.m_box[1] = q1; e.m_box[2] = s0; e.m_box[3] = s1; e.m_annot = txt; e.m_details = repr; segments.push_back(e); } // append a gap if(i + 1 < map_dim) { SSegment g; g.m_exon = false; g.m_box[0] = zone.m_box[1] + 1; g.m_box[1] = m_alnmap[i+1].m_box[0] - 1; g.m_box[2] = zone.m_box[3] + 1; g.m_box[3] = m_alnmap[i+1].m_box[2] - 1; g.m_idty = 0; g.m_len = g.m_box[1] - g.m_box[0] + 1; g.m_annot = "<GAP>"; g.m_details.append(Seq1+g.m_box[0], g.m_box[1]-g.m_box[0]+1); segments.push_back(g); } } } // zone iterations end // segment-level postprocessing size_t seg_dim = segments.size(); if(seg_dim == 0) { return; } // First go from the ends and see if we // can improve boundary exons size_t k0 = 0; while(k0 < seg_dim) { SSegment& s = segments[k0]; if(s.m_exon) { if(s.m_idty < m_minidty || m_endgaps) { s.ImproveFromLeft(Seq1, Seq2); } if(s.m_idty >= m_minidty) { break; } } ++k0; } // fill the left-hand gap, if any if(segments[0].m_exon && segments[0].m_box[0] > 0) { segments.push_front(SSegment()); SSegment& g = segments.front(); g.m_exon = false; g.m_box[0] = 0; g.m_box[1] = segments[0].m_box[0] - 1; g.m_box[2] = 0; g.m_box[3] = segments[0].m_box[2] - 1; g.m_idty = 0; g.m_len = segments[0].m_box[0]; g.m_annot = "<GAP>"; g.m_details = string(Seq1 + g.m_box[0], g.m_box[1] + 1 - g.m_box[0]); ++seg_dim;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -