alnvec.cpp
来自「ncbi源码」· C++ 代码 · 共 1,047 行 · 第 1/3 页
CPP
1,047 行
// take care of the remaining coords if necessary if (record_coords) { // previous scrns TSeqPos pos_diff = aln_pos - scrn_pos; if (pos_diff > 0) { nscrns = pos_diff / scrn_width; if (pos_diff % scrn_width) { nscrns++; } for (int i = 0; i < nscrns; i++) { scrn_lefts->push_back(scrn_lft_seq_pos); scrn_rights->push_back(scrn_rgt_seq_pos); if (i == 0) { scrn_lft_seq_pos = scrn_rgt_seq_pos; } scrn_pos += scrn_width; } } } c_buff[aln_len] = '\0'; buffer = c_buff; delete [] c_buff; return buffer;}//// CreateConsensus()//// compute a consensus sequence given a particular alignment// the rules for a consensus are:// - a segment is consensus gap if > 50% of the sequences are gap at this// segment. 50% exactly is counted as sequence// - for a segment counted as sequence, for each position, the most// frequently occurring base is counted as consensus. in the case of// a tie, the consensus is considered muddied, and the consensus is// so marked//CRef<CDense_seg> CAlnVec::CreateConsensus(int& consensus_row) const{ if ( !m_DS ) { return CRef<CDense_seg>(); } int i; int j; // temporary storage for our consensus vector<string> consens(m_NumSegs); // determine what the number of segments required for a gapped consensus // segment is. this must be rounded to be at least 50%. int gap_seg_thresh = m_NumRows - m_NumRows / 2; for (j = 0; j < m_NumSegs; ++j) { // evaluate for gap / no gap int gap_count = 0; for (i = 0; i < m_NumRows; ++i) { if (m_Starts[ j*m_NumRows + i ] == -1) { ++gap_count; } } // check to make sure that this seg is not a consensus // gap seg if ( gap_count <= gap_seg_thresh ) { // we will build a segment with enough bases to match consens[j].resize(m_Lens[j]); // retrieve all sequences for this segment vector<string> segs(m_NumRows); for (i = 0; i < m_NumRows; ++i) { if (m_Starts[ j*m_NumRows + i ] != -1) { TSeqPos start = m_Starts[j*m_NumRows+i]; TSeqPos stop = start + m_Lens[j]; if (IsPositiveStrand(i)) { x_GetSeqVector(i).GetSeqData(start, stop, segs[i]); } else { CSeqVector & seq_vec = x_GetSeqVector(i); TSeqPos size = seq_vec.size(); seq_vec.GetSeqData(size - stop, size - start, segs[i]); } for (int c = 0; c < segs[i].length(); ++c) { segs[i][c] = FromIupac(segs[i][c]); } } } typedef multimap<int, unsigned char, greater<int> > TRevMap; // // evaluate for a consensus // for (i = 0; i < m_Lens[j]; ++i) { // first, we record which bases occur and how often // this is computed in NCBI4na notation int base_count[4]; base_count[0] = base_count[1] = base_count[2] = base_count[3] = 0; for (int row = 0; row < m_NumRows; ++row) { if (segs[row] != "") { for (int pos = 0; pos < 4; ++pos) { if (segs[row][i] & (1<<pos)) { ++base_count[ pos ]; } } } } // we create a sorted list (in descending order) of // frequencies of appearance to base // the frequency is "global" for this position: that is, // if 40% of the sequences are gapped, the highest frequency // any base can have is 0.6 TRevMap rev_map; for (int k = 0; k < 4; ++k) { // this gets around a potentially tricky idiosyncrasy // in some implementations of multimap. depending on // the library, the key may be const (or not) TRevMap::value_type p(base_count[k], (1<<k)); rev_map.insert(p); } // the base threshold for being considered unique is at least // 70% of the available sequences int base_thresh = ((m_NumRows - gap_count) * 7 + 5) / 10; // now, the first element here contains the best frequency // we scan for the appropriate bases if (rev_map.count(rev_map.begin()->first) == 1 && rev_map.begin()->first >= base_thresh) { consens[j][i] = ToIupac(rev_map.begin()->second); } else { // now we need to make some guesses based on IUPACna // notation int count; unsigned char c = 0x00; int freq = 0; TRevMap::iterator curr = rev_map.begin(); TRevMap::iterator prev = rev_map.begin(); for (count = 0; curr != rev_map.end() && (freq < base_thresh || prev->first == curr->first); ++curr, ++count) { prev = curr; freq += curr->first; c |= curr->second; } // // catchall // if (count > 2) { consens[j][i] = 'N'; } else { consens[j][i] = ToIupac(c); } } } } } // // now, create a new CDense_seg // we create a new CBioseq for our data, add it to our scope, and // copy the contents of the CDense_seg // string data; TSignedSeqPos total_bases = 0; CRef<CDense_seg> new_ds(new CDense_seg()); new_ds->SetDim(m_NumRows + 1); new_ds->SetNumseg(m_NumSegs); new_ds->SetLens() = m_Lens; new_ds->SetStarts().reserve(m_Starts.size() + m_NumSegs); if ( !m_Strands.empty() ) { new_ds->SetStrands().reserve(m_Strands.size() + m_NumSegs); } for (i = 0; i < consens.size(); ++i) { // copy the old entries for (j = 0; j < m_NumRows; ++j) { int idx = i * m_NumRows + j; new_ds->SetStarts().push_back(m_Starts[idx]); if ( !m_Strands.empty() ) { new_ds->SetStrands().push_back(m_Strands[idx]); } } // add our new entry // this places the consensus as the last sequence // it should preferably be the first, but this would mean adjusting // the bioseq handle and seqvector caches, and all row numbers would // shift if (consens[i].length() != 0) { new_ds->SetStarts().push_back(total_bases); } else { new_ds->SetStarts().push_back(-1); } if ( !m_Strands.empty() ) { new_ds->SetStrands().push_back(eNa_strand_unknown); } total_bases += consens[i].length(); data += consens[i]; } // copy our IDs for (i = 0; i < m_Ids.size(); ++i) { new_ds->SetIds().push_back(m_Ids[i]); } // now, we construct a new Bioseq and add it to our scope // this bioseq must have a local ID; it will be named "consensus" // once this is in, the Denseg should resolve all IDs correctly {{ CRef<CBioseq> bioseq(new CBioseq()); // construct a local sequence ID for this sequence CRef<CSeq_id> id(new CSeq_id()); bioseq->SetId().push_back(id); id->SetLocal().SetStr("consensus"); new_ds->SetIds().push_back(id); // add a description for this sequence CSeq_descr& desc = bioseq->SetDescr(); CRef<CSeqdesc> d(new CSeqdesc); desc.Set().push_back(d); d->SetComment("This is a generated consensus sequence"); // the main one: Seq-inst CSeq_inst& inst = bioseq->SetInst(); inst.SetRepr(CSeq_inst::eRepr_raw); inst.SetMol(CSeq_inst::eMol_na); inst.SetLength(data.length()); CSeq_data& seq_data = inst.SetSeq_data(); CIUPACna& na = seq_data.SetIupacna(); na = CIUPACna(data); // once we've created the bioseq, we need to add it to the // scope CRef<CSeq_entry> entry(new CSeq_entry()); entry->SetSeq(*bioseq); GetScope().AddTopLevelSeqEntry(*entry); }} consensus_row = new_ds->GetIds().size() - 1; return new_ds;}static SNCBIFullScoreMatrix s_FullScoreMatrix;int CAlnVec::CalculateScore(const string& s1, const string& s2, bool s1_is_prot, bool s2_is_prot){ // check the lengths if (s1_is_prot == s2_is_prot && s1.length() != s2.length()) { NCBI_THROW(CAlnException, eInvalidRequest, "CAlnVec::CalculateScore(): " "Strings should have equal lenghts."); } else if (s1.length() * (s1_is_prot ? 1 : 3) != s1.length() * (s1_is_prot ? 1 : 3)) { NCBI_THROW(CAlnException, eInvalidRequest, "CAlnVec::CalculateScore(): " "Strings lengths do not match."); } int score = 0; const char * res1 = s1.c_str(); const char * res2 = s2.c_str(); const char * end1 = res1 + s1.length(); const char * end2 = res2 + s2.length(); static bool s_FullScoreMatrixInitialized = false; if (s1_is_prot && s2_is_prot) { if ( !s_FullScoreMatrixInitialized ) { s_FullScoreMatrixInitialized = true; NCBISM_Unpack(&NCBISM_Blosum62, &s_FullScoreMatrix); } // use BLOSUM62 matrix for ( ; res1 != end1; res1++, res2++) { score += s_FullScoreMatrix.s[*res1][*res2]; } } else if ( !s1_is_prot && !s2_is_prot ) { // use match score/mismatch penalty for ( ; res1 != end1; res1++, res2++) { if (*res1 == *res2) { score += 1; } else { score -= 3; } } } else { string t; if (s1_is_prot) { TranslateNAToAA(s2, t); for ( ; res1 != end1; res1++, res2++) { score += s_FullScoreMatrix.s[*res1][*res2]; } } else { TranslateNAToAA(s1, t); for ( ; res2 != end2; res1++, res2++) { score += s_FullScoreMatrix.s[*res1][*res2]; } } } return score;}void CAlnVec::TranslateNAToAA(const string& na, string& aa){ if (na.size() % 3) { NCBI_THROW(CAlnException, eTranslateFailure, "CAlnVec::TranslateNAToAA(): " "NA size expected to be divisible by 3"); } const CTrans_table& tbl = CGen_code_table::GetTransTable(1); unsigned int i, j = 0, state = 0; aa.resize(na.size() / 3); string::const_iterator res = na.begin(); while (res != na.end()) { for (i = 0; i < 3; i++, res++) { state = tbl.NextCodonState(state, *res); } aa[j++] = tbl.GetCodonResidue(state); }}int CAlnVec::CalculateScore(TNumrow row1, TNumrow row2) const{ TNumrow numrows = m_NumRows; TNumrow index1 = row1, index2 = row2;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?