validerror_bioseq.cpp

来自「ncbi源码」· C++ 代码 · 共 1,861 行 · 第 1/5 页

CPP
1,861
字号
            if (tech == CMolInfo::eTech_htgs_0  ||                tech == CMolInfo::eTech_htgs_1  ||                tech == CMolInfo::eTech_htgs_2)            {                PostErr(eDiag_Warning, eErr_SEQ_INST_LongHtgsSequence,                    "Phase 0, 1 or 2 HTGS sequence exceeds 350kbp limit",                    seq);            } else if (tech == CMolInfo::eTech_htgs_3) {                PostErr(eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp,                    "Phase 3 HTGS sequence exceeds 350kbp limit", seq);            } else if (tech == CMolInfo::eTech_wgs) {                PostErr(eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp,                    "WGS sequence exceeds 350kbp limit", seq);            } else {                PostErr (eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp,                    "Length of sequence exceeds 350kbp limit", seq);            }        }  else {            PostErr (eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp,                "Length of sequence exceeds 350kbp limit", seq);        }    }}// Assumes that seq is segmented and has Seq-ext datavoid CValidError_bioseq::ValidateSeqParts(const CBioseq& seq){    // Get parent CSeq_entry of seq and then find the next    // CSeq_entry in the set. This CSeq_entry should be a CBioseq_set    // of class parts.    const CSeq_entry* parent = seq.GetParentEntry();    if (!parent) {        return;    }    if ( !parent->IsSet() ) {        return;    }    // Loop through seq_set looking for the bioseq, when found, the next    // CSeq_entry should have the parts.    CBioseq_set::TSeq_set::const_iterator se_parts =         parent->GetSet().GetSeq_set().end();    ITERATE ( CBioseq_set::TSeq_set, it, parent->GetSet().GetSeq_set() ) {        if ( parent == &(**it) ) {            se_parts = ++it;            break;        }    }    if ( se_parts == parent->GetSet().GetSeq_set().end() ) {        return;    }    // Check that se_parts is a CBioseq_set of class parts    if ( !(*se_parts)->IsSet()  ||  !(*se_parts)->GetSet().IsSetClass()  ||         (*se_parts)->GetSet().GetClass() != CBioseq_set::eClass_parts ) {        return;    }    const CSeg_ext::Tdata& locs = seq.GetInst().GetExt().GetSeg().Get();    const CBioseq_set::TSeq_set& parts = (*se_parts)->GetSet().GetSeq_set();    // Make sure the number of locations (excluding null locations)    // match the number of parts    size_t nulls = 0;    ITERATE ( CSeg_ext::Tdata, loc, locs ) {        if ( (*loc)->IsNull() ) {            nulls++;        }    }    if ( locs.size() - nulls < parts.size() ) {         PostErr(eDiag_Error, eErr_SEQ_INST_PartsOutOfOrder,            "Parts set contains too many Bioseqs", seq);         return;    } else if ( locs.size() - nulls > parts.size() ) {        PostErr(eDiag_Error, eErr_SEQ_INST_PartsOutOfOrder,            "Parts set does not contain enough Bioseqs", seq);        return;    }    // Now, simultaneously loop through the parts of se_parts and CSeq_locs of     // seq's CSseq-ext. If don't compare, post error.    size_t size = locs.size();  // == parts.size()    CSeg_ext::Tdata::const_iterator loc_it = locs.begin();    CBioseq_set::TSeq_set::const_iterator part_it = parts.begin();    for ( size_t i = 0; i < size; ++i ) {        try {            if ( (*loc_it)->IsNull() ) {                continue;            }            if ( !(*part_it)->IsSet() ) {                PostErr(eDiag_Error, eErr_SEQ_INST_PartsOutOfOrder,                    "Parts set component is not Bioseq", seq);                return;            }            const CSeq_id& loc_id = GetId(**loc_it, m_Scope);            if ( !IsIdIn(loc_id, (*part_it)->GetSeq()) ) {                PostErr(eDiag_Error, eErr_SEQ_INST_PartsOutOfOrder,                    "Segmented bioseq seq_ext does not correspond to parts"                    "packaging order", seq);            }                        // advance both iterators            ++loc_it;            ++part_it;        } catch (const CNotUnique&) {            ERR_POST("Seq-loc not for unique sequence");            return;        } catch (...) {            ERR_POST("Unknown error");            return;        }    }}// Assumes seq is an amino acid sequencevoid CValidError_bioseq::ValidateProteinTitle(const CBioseq& seq){    const CSeq_id* id = seq.GetFirstId();    if (!id) {        return;    }    CBioseq_Handle hnd = m_Scope->GetBioseqHandle(*id);    string title_no_recon = GetTitle(hnd);    string title_recon = GetTitle(hnd, fGetTitle_Reconstruct);    if ( title_no_recon != title_recon ) {        PostErr(eDiag_Warning, eErr_SEQ_DESCR_InconsistentProteinTitle,            "Instantiated protein title does not match automatically "            "generated title. Instantiated: " + title_no_recon +             " Generated: " + title_recon, seq);    }}void CValidError_bioseq::ValidateNs(const CBioseq& seq){    try {        const CSeq_inst& inst = seq.GetInst();        CSeq_inst::TRepr repr =             inst.CanGetRepr() ? inst.GetRepr() : CSeq_inst::eRepr_not_set;        CSeq_inst::TLength len = inst.CanGetLength() ? inst.GetLength() : 0;        if ( (repr == CSeq_inst::eRepr_raw  ||               (repr == CSeq_inst::eRepr_delta  &&  x_IsDeltaLitOnly(inst))) &&             len > 10 ) {                        CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);            if ( !bsh ) {                return;            }                        CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);                        EDiagSev sev = eDiag_Warning;            string sequence;                        // check for Ns at the beginning of the sequence            if ( (vec[0] == 'N')  ||  (vec[0] == 'n') ) {                // if 10 or more Ns flag as error (except for NC),                 // otherwise just warning                vec.GetSeqData(0, 10, sequence);                if ( !m_Imp.IsNC()  &&                     (NStr::CompareNocase(sequence, "NNNNNNNNNN") == 0) ) {                    sev = eDiag_Error;                }                PostErr(sev, eErr_SEQ_INST_TerminalNs,                     "N at beginning of sequence", seq);            }                        // check for Ns at the end of the sequence            sev = eDiag_Warning;            if ( (vec[vec.size() - 1] == 'N')  ||  (vec[vec.size() - 1] == 'n') ) {                // if 10 or more Ns flag as error (except for NC),                 // otherwise just warning                vec.GetSeqData(vec.size() - 10, vec.size() , sequence);                if ( !m_Imp.IsNC()  &&                     (NStr::CompareNocase(sequence, "NNNNNNNNNN") == 0) ) {                    sev = eDiag_Error;                }                PostErr(sev, eErr_SEQ_INST_TerminalNs,                     "N at end of sequence", seq);            }        }    } catch ( exception& ) {        // just ignore, and continue with the validation process.    }}// Assumes that seq is eRepr_raw or eRepr_instvoid CValidError_bioseq::ValidateRawConst(const CBioseq& seq){    const CSeq_inst& inst = seq.GetInst();    const CEnumeratedTypeValues* tv = CSeq_inst::GetTypeInfo_enum_ERepr();    const string& rpr = tv->FindName(inst.GetRepr(), true);    if (inst.IsSetFuzz()) {        PostErr(eDiag_Error, eErr_SEQ_INST_FuzzyLen,            "Fuzzy length on " + rpr + "Bioseq", seq);    }    if (!inst.IsSetLength()  ||  inst.GetLength() == 0) {        string len = inst.IsSetLength() ?            NStr::IntToString(inst.GetLength()) : "0";        PostErr(eDiag_Error, eErr_SEQ_INST_InvalidLen,                 "Invalid Bioseq length [" + len + "]", seq);    }    CSeq_data::E_Choice seqtyp = inst.IsSetSeq_data() ?        inst.GetSeq_data().Which() : CSeq_data::e_not_set;    switch (seqtyp) {    case CSeq_data::e_Iupacna:    case CSeq_data::e_Ncbi2na:    case CSeq_data::e_Ncbi4na:    case CSeq_data::e_Ncbi8na:    case CSeq_data::e_Ncbipna:        if (inst.IsAa()) {            PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidAlphabet,                     "Using a nucleic acid alphabet on a protein sequence",                     seq);            return;        }        break;    case CSeq_data::e_Iupacaa:    case CSeq_data::e_Ncbi8aa:    case CSeq_data::e_Ncbieaa:    case CSeq_data::e_Ncbipaa:    case CSeq_data::e_Ncbistdaa:        if (inst.IsNa()) {            PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidAlphabet,                     "Using a protein alphabet on a nucleic acid",                     seq);            return;        }        break;    default:        PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidAlphabet,                 "Sequence alphabet not set",                 seq);        return;    }    bool check_alphabet = false;    unsigned int factor = 1;    switch (seqtyp) {    case CSeq_data::e_Iupacaa:    case CSeq_data::e_Iupacna:    case CSeq_data::e_Ncbieaa:    case CSeq_data::e_Ncbistdaa:        check_alphabet = true;        break;    case CSeq_data::e_Ncbi8na:    case CSeq_data::e_Ncbi8aa:        break;    case CSeq_data::e_Ncbi4na:        factor = 2;        break;    case CSeq_data::e_Ncbi2na:        factor = 4;        break;    case CSeq_data::e_Ncbipna:        factor = 5;        break;    case CSeq_data::e_Ncbipaa:        factor = 30;        break;    default:        // Logically, should not occur        PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidAlphabet,                 "Sequence alphabet not set",                 seq);        return;    }    TSeqPos calc_len = inst.IsSetLength() ? inst.GetLength() : 0;        switch (seqtyp) {    case CSeq_data::e_Ncbipna:    case CSeq_data::e_Ncbipaa:        calc_len *= factor;        break;    default:        if (calc_len % factor) {            calc_len += factor;        }        calc_len /= factor;    }    string s_len = NStr::UIntToString(calc_len);    TSeqPos data_len = GetDataLen(inst);    string data_len_str = NStr::IntToString(data_len);    if (calc_len > data_len) {        PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong,                 "Bioseq.seq_data too short [" + data_len_str +                 "] for given length [" + s_len + "]", seq);        return;    } else if (calc_len < data_len) {        PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong,                 "Bioseq.seq_data is larger [" + data_len_str +                 "] than given length [" + s_len + "]", seq);    }    unsigned char termination;    unsigned int trailingX = 0;    size_t terminations = 0;    if (check_alphabet) {        switch (seqtyp) {        case CSeq_data::e_Iupacaa:        case CSeq_data::e_Ncbieaa:            termination = '*';            break;        case CSeq_data::e_Ncbistdaa:            termination = 25;            break;        default:            termination = '\0';        }        CSeqVector sv =             m_Scope->GetBioseqHandle(seq).GetSeqVector();        size_t bad_cnt = 0;        TSeqPos pos = 0;        for ( CSeqVector_CI sv_iter(sv); sv_iter; ++sv_iter ) {            CSeqVector::TResidue res = *sv_iter;            if ( !IsResidue(res) ) {                if ( ++bad_cnt > 10 ) {                    PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidResidue,                        "More than 10 invalid residues. Checking stopped",                        seq);                    return;                } else {                    string msg = "Invalid residue [";                    if ( seqtyp == CSeq_data::e_Ncbistdaa ) {                        msg += NStr::UIntToString(res);                    } else {                        msg += res;                    }                    msg += "] in position [" + NStr::UIntToString(pos) + "]";                    PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidResidue,                         msg, seq);                }            } else if ( res == termination ) {                terminations++;                trailingX = 0;            } else if ( res == 'X' ) {                trailingX++;            } else {                trailingX = 0;            }            ++pos;        }        if ( trailingX > 0 && !SuppressTrailingXMsg(seq) ) {            // Suppress if cds ends in "*" or 3' partial            string msg = "Sequence ends in " +                NStr::IntToString(trailingX) + " trailing X";            if ( trailingX > 1 ) {                msg += "s";            }            PostErr(eDiag_Warning, eErr_SEQ_INST_TrailingX, msg, seq);        }        if (terminations) {            // Post error indicating terminations found in protein sequence            // First get gene label            string glbl;            seq.GetLabel(&glbl, CBioseq::eContent);            if ( IsBlankString(glbl) ) {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?