alnvec.cpp

来自「ncbi源码」· C++ 代码 · 共 1,047 行 · 第 1/3 页

CPP
1,047
字号
        // take care of the remaining coords if necessary    if (record_coords) {        // previous scrns        TSeqPos pos_diff = aln_pos - scrn_pos;        if (pos_diff > 0) {            nscrns = pos_diff / scrn_width;            if (pos_diff % scrn_width) {                nscrns++;            }            for (int i = 0; i < nscrns; i++) {                scrn_lefts->push_back(scrn_lft_seq_pos);                scrn_rights->push_back(scrn_rgt_seq_pos);                if (i == 0) {                    scrn_lft_seq_pos = scrn_rgt_seq_pos;                }                scrn_pos += scrn_width;            }        }    }    c_buff[aln_len] = '\0';    buffer = c_buff;    delete [] c_buff;    return buffer;}//// CreateConsensus()//// compute a consensus sequence given a particular alignment// the rules for a consensus are://   - a segment is consensus gap if > 50% of the sequences are gap at this//     segment.  50% exactly is counted as sequence//   - for a segment counted as sequence, for each position, the most//     frequently occurring base is counted as consensus.  in the case of//     a tie, the consensus is considered muddied, and the consensus is//     so marked//CRef<CDense_seg> CAlnVec::CreateConsensus(int& consensus_row) const{    if ( !m_DS ) {        return CRef<CDense_seg>();    }    int i;    int j;    // temporary storage for our consensus    vector<string> consens(m_NumSegs);    // determine what the number of segments required for a gapped consensus    // segment is.  this must be rounded to be at least 50%.    int gap_seg_thresh = m_NumRows - m_NumRows / 2;    for (j = 0;  j < m_NumSegs;  ++j) {        // evaluate for gap / no gap        int gap_count = 0;        for (i = 0;  i < m_NumRows;  ++i) {            if (m_Starts[ j*m_NumRows + i ] == -1) {                ++gap_count;            }        }        // check to make sure that this seg is not a consensus        // gap seg        if ( gap_count <= gap_seg_thresh ) {            // we will build a segment with enough bases to match            consens[j].resize(m_Lens[j]);            // retrieve all sequences for this segment            vector<string> segs(m_NumRows);            for (i = 0;  i < m_NumRows;  ++i) {                if (m_Starts[ j*m_NumRows + i ] != -1) {                    TSeqPos start = m_Starts[j*m_NumRows+i];                    TSeqPos stop  = start + m_Lens[j];                    if (IsPositiveStrand(i)) {                        x_GetSeqVector(i).GetSeqData(start, stop, segs[i]);                    } else {                        CSeqVector &  seq_vec = x_GetSeqVector(i);                        TSeqPos size = seq_vec.size();                        seq_vec.GetSeqData(size - stop, size - start, segs[i]);                    }                    for (int c = 0;  c < segs[i].length();  ++c) {                        segs[i][c] = FromIupac(segs[i][c]);                    }                }            }            typedef multimap<int, unsigned char, greater<int> > TRevMap;            //             // evaluate for a consensus            //            for (i = 0;  i < m_Lens[j];  ++i) {                // first, we record which bases occur and how often                // this is computed in NCBI4na notation                int base_count[4];                base_count[0] = base_count[1] =                    base_count[2] = base_count[3] = 0;                for (int row = 0;  row < m_NumRows;  ++row) {                    if (segs[row] != "") {                        for (int pos = 0;  pos < 4;  ++pos) {                            if (segs[row][i] & (1<<pos)) {                                ++base_count[ pos ];                            }                        }                    }                }                // we create a sorted list (in descending order) of                // frequencies of appearance to base                // the frequency is "global" for this position: that is,                // if 40% of the sequences are gapped, the highest frequency                // any base can have is 0.6                TRevMap rev_map;                for (int k = 0;  k < 4;  ++k) {                    // this gets around a potentially tricky idiosyncrasy                    // in some implementations of multimap.  depending on                    // the library, the key may be const (or not)                    TRevMap::value_type p(base_count[k], (1<<k));                    rev_map.insert(p);                }                // the base threshold for being considered unique is at least                // 70% of the available sequences                int base_thresh =                    ((m_NumRows - gap_count) * 7 + 5) / 10;                // now, the first element here contains the best frequency                // we scan for the appropriate bases                if (rev_map.count(rev_map.begin()->first) == 1 &&                    rev_map.begin()->first >= base_thresh) {                    consens[j][i] = ToIupac(rev_map.begin()->second);                } else {                    // now we need to make some guesses based on IUPACna                    // notation                    int               count;                    unsigned char     c    = 0x00;                    int               freq = 0;                    TRevMap::iterator curr = rev_map.begin();                    TRevMap::iterator prev = rev_map.begin();                    for (count = 0;                         curr != rev_map.end() &&                         (freq < base_thresh || prev->first == curr->first);                         ++curr, ++count) {                        prev = curr;                        freq += curr->first;                        c |= curr->second;                    }                    //                    // catchall                    //                    if (count > 2) {                        consens[j][i] = 'N';                    } else {                        consens[j][i] = ToIupac(c);                    }                }            }        }    }    //    // now, create a new CDense_seg    // we create a new CBioseq for our data, add it to our scope, and    // copy the contents of the CDense_seg    //    string data;    TSignedSeqPos total_bases = 0;    CRef<CDense_seg> new_ds(new CDense_seg());    new_ds->SetDim(m_NumRows + 1);    new_ds->SetNumseg(m_NumSegs);    new_ds->SetLens() = m_Lens;    new_ds->SetStarts().reserve(m_Starts.size() + m_NumSegs);    if ( !m_Strands.empty() ) {        new_ds->SetStrands().reserve(m_Strands.size() +                                     m_NumSegs);    }    for (i = 0;  i < consens.size();  ++i) {        // copy the old entries        for (j = 0;  j < m_NumRows;  ++j) {            int idx = i * m_NumRows + j;            new_ds->SetStarts().push_back(m_Starts[idx]);            if ( !m_Strands.empty() ) {                new_ds->SetStrands().push_back(m_Strands[idx]);            }        }        // add our new entry        // this places the consensus as the last sequence        // it should preferably be the first, but this would mean adjusting        // the bioseq handle and seqvector caches, and all row numbers would        // shift        if (consens[i].length() != 0) {            new_ds->SetStarts().push_back(total_bases);        } else {            new_ds->SetStarts().push_back(-1);        }                if ( !m_Strands.empty() ) {            new_ds->SetStrands().push_back(eNa_strand_unknown);        }        total_bases += consens[i].length();        data += consens[i];    }    // copy our IDs    for (i = 0;  i < m_Ids.size();  ++i) {        new_ds->SetIds().push_back(m_Ids[i]);    }    // now, we construct a new Bioseq and add it to our scope    // this bioseq must have a local ID; it will be named "consensus"    // once this is in, the Denseg should resolve all IDs correctly    {{         CRef<CBioseq> bioseq(new CBioseq());         // construct a local sequence ID for this sequence         CRef<CSeq_id> id(new CSeq_id());         bioseq->SetId().push_back(id);         id->SetLocal().SetStr("consensus");         new_ds->SetIds().push_back(id);         // add a description for this sequence         CSeq_descr& desc = bioseq->SetDescr();         CRef<CSeqdesc> d(new CSeqdesc);         desc.Set().push_back(d);         d->SetComment("This is a generated consensus sequence");         // the main one: Seq-inst         CSeq_inst& inst = bioseq->SetInst();         inst.SetRepr(CSeq_inst::eRepr_raw);         inst.SetMol(CSeq_inst::eMol_na);         inst.SetLength(data.length());         CSeq_data& seq_data = inst.SetSeq_data();         CIUPACna& na = seq_data.SetIupacna();         na = CIUPACna(data);         // once we've created the bioseq, we need to add it to the         // scope         CRef<CSeq_entry> entry(new CSeq_entry());         entry->SetSeq(*bioseq);         GetScope().AddTopLevelSeqEntry(*entry);    }}    consensus_row = new_ds->GetIds().size() - 1;    return new_ds;}static SNCBIFullScoreMatrix s_FullScoreMatrix;int CAlnVec::CalculateScore(const string& s1, const string& s2,                            bool s1_is_prot, bool s2_is_prot){    // check the lengths    if (s1_is_prot == s2_is_prot  &&  s1.length() != s2.length()) {        NCBI_THROW(CAlnException, eInvalidRequest,                   "CAlnVec::CalculateScore(): "                   "Strings should have equal lenghts.");    } else if (s1.length() * (s1_is_prot ? 1 : 3) !=               s1.length() * (s1_is_prot ? 1 : 3)) {        NCBI_THROW(CAlnException, eInvalidRequest,                   "CAlnVec::CalculateScore(): "                   "Strings lengths do not match.");    }            int score = 0;    const char * res1 = s1.c_str();    const char * res2 = s2.c_str();    const char * end1 = res1 + s1.length();    const char * end2 = res2 + s2.length();        static bool s_FullScoreMatrixInitialized = false;    if (s1_is_prot  &&  s2_is_prot) {        if ( !s_FullScoreMatrixInitialized ) {            s_FullScoreMatrixInitialized = true;            NCBISM_Unpack(&NCBISM_Blosum62, &s_FullScoreMatrix);        }                // use BLOSUM62 matrix        for ( ;  res1 != end1;  res1++, res2++) {            score += s_FullScoreMatrix.s[*res1][*res2];        }    } else if ( !s1_is_prot  &&  !s2_is_prot ) {        // use match score/mismatch penalty        for ( ; res1 != end1;  res1++, res2++) {            if (*res1 == *res2) {                score += 1;            } else {                score -= 3;            }        }    } else {        string t;        if (s1_is_prot) {            TranslateNAToAA(s2, t);            for ( ;  res1 != end1;  res1++, res2++) {                score += s_FullScoreMatrix.s[*res1][*res2];            }        } else {            TranslateNAToAA(s1, t);            for ( ;  res2 != end2;  res1++, res2++) {                score += s_FullScoreMatrix.s[*res1][*res2];            }        }    }    return score;}void CAlnVec::TranslateNAToAA(const string& na, string& aa){    if (na.size() % 3) {        NCBI_THROW(CAlnException, eTranslateFailure,                   "CAlnVec::TranslateNAToAA(): "                   "NA size expected to be divisible by 3");    }    const CTrans_table& tbl = CGen_code_table::GetTransTable(1);    unsigned int i, j = 0, state = 0;    aa.resize(na.size() / 3);    string::const_iterator res = na.begin();    while (res != na.end()) {        for (i = 0; i < 3; i++, res++) {            state = tbl.NextCodonState(state, *res);        }        aa[j++] = tbl.GetCodonResidue(state);    }}int CAlnVec::CalculateScore(TNumrow row1, TNumrow row2) const{    TNumrow       numrows = m_NumRows;    TNumrow       index1 = row1, index2 = row2;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?