validerror_bioseq.cpp

来自「ncbi源码」· C++ 代码 · 共 1,861 行 · 第 1/5 页

CPP
1,861
字号
                glbl = "gene?";            }            string plbl;            const CBioseq* nuc = GetNucleotideParent(seq, m_Scope);            if ( nuc ) {                nuc->GetLabel(&plbl, CBioseq::eContent);            }            if ( IsBlankString(plbl) ) {                plbl = "prot?";            }            PostErr(eDiag_Error, eErr_SEQ_INST_StopInProtein,                NStr::IntToString(terminations) +                " termination symbols in protein sequence (" +                glbl + string(" - ") + plbl + ")", seq);            if (!bad_cnt) {                return;            }        }        return;    }}// Assumes seq is eRepr_seg or eRepr_refvoid CValidError_bioseq::ValidateSegRef(const CBioseq& seq){    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);    const CSeq_inst& inst = seq.GetInst();    // Validate extension data -- wrap in CSeq_loc_mix for convenience    CSeq_loc loc;    if ( GetLocFromSeq(seq, &loc) ) {        m_Imp.ValidateSeqLoc(loc, bsh, "Segmented Bioseq", seq);    }    // Validate Length    try {        TSeqPos loclen = GetLength(loc, m_Scope);        TSeqPos seqlen = inst.IsSetLength() ? inst.GetLength() : 0;        if (seqlen > loclen) {            PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong,                "Bioseq.seq_data too short [" + NStr::IntToString(loclen) +                "] for given length [" + NStr::IntToString(seqlen) + "]",                seq);        } else if (seqlen < loclen) {            PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong,                "Bioseq.seq_data is larger [" + NStr::IntToString(loclen) +                "] than given length [" + NStr::IntToString(seqlen) + "]",                seq);        }    } catch (const CNoLength&) {        ERR_POST(Critical << "Unable to calculate length: ");    }    // Check for multiple references to the same Bioseq    if (inst.IsSetExt()  &&  inst.GetExt().IsSeg()) {        const list< CRef<CSeq_loc> >& locs = inst.GetExt().GetSeg().Get();        ITERATE(list< CRef<CSeq_loc> >, i1, locs) {           if (!IsOneBioseq(**i1, m_Scope)) {                continue;            }            const CSeq_id& id1 = GetId(**i1);            list< CRef<CSeq_loc> >::const_iterator i2 = i1;            for (++i2; i2 != locs.end(); ++i2) {                if (!IsOneBioseq(**i2, m_Scope)) {                    continue;                }                const CSeq_id& id2 = GetId(**i2);                if (IsSameBioseq(id1, id2, m_Scope)) {                    CNcbiOstrstream os;                    os << id1.DumpAsFasta();                    string sid(os.str());                    if ((**i1).IsWhole()  &&  (**i2).IsWhole()) {                        PostErr(eDiag_Error,                            eErr_SEQ_INST_DuplicateSegmentReferences,                            "Segmented sequence has multiple references to " +                            sid, seq);                    } else {                        PostErr(eDiag_Warning,                            eErr_SEQ_INST_DuplicateSegmentReferences,                            "Segmented sequence has multiple references to " +                            sid + " that are not e_Whole", seq);                    }                }            }        }    }    // Check that partial sequence info on sequence segments is consistent with    // partial sequence info on sequence -- aa  sequences only    int partial = SeqLocPartialCheck(loc, m_Scope);    if (partial  &&  seq.IsAa()) {        bool got_partial = false;        CTypeConstIterator<CSeqdesc> sd(ConstBegin(seq.GetDescr()));        for (; sd; ++sd) {            if (!(*sd).IsModif()) {                continue;            }            ITERATE(list< EGIBB_mod >, md, (*sd).GetModif()) {                switch (*md) {                case eGIBB_mod_partial:                    got_partial = true;                    break;                case eGIBB_mod_no_left:                    if (partial & eSeqlocPartial_Start) {                        PostErr(eDiag_Error, eErr_SEQ_INST_PartialInconsistent,                            "GIBB-mod no-left inconsistent with segmented "                            "SeqLoc", seq);                    }                    got_partial = true;                    break;                case eGIBB_mod_no_right:                    if (partial & eSeqlocPartial_Stop) {                        PostErr(eDiag_Error, eErr_SEQ_INST_PartialInconsistent,                            "GIBB-mod no-right inconsistene with segmented "                            "SeqLoc", seq);                    }                    got_partial = true;                    break;                default:                    break;                }            }        }        if (!got_partial) {            PostErr(eDiag_Error, eErr_SEQ_INST_PartialInconsistent,                "Partial segmented sequence without GIBB-mod", seq);        }    }}// Assumes seq is a delta sequencevoid CValidError_bioseq::ValidateDelta(const CBioseq& seq){    const CSeq_inst& inst = seq.GetInst();    // Get CMolInfo and tech used for validating technique and gap positioning    const CMolInfo* mi = 0;    CSeqdesc_CI mi_desc(m_Scope->GetBioseqHandle(seq), CSeqdesc::e_Molinfo);    if ( mi_desc ) {        mi = &(mi_desc->GetMolinfo());    }    CMolInfo::TTech tech =         mi != 0 ? mi->GetTech() : CMolInfo::eTech_unknown;    if (!inst.IsSetExt()  ||  !inst.GetExt().IsDelta()  ||        inst.GetExt().GetDelta().Get().empty()) {        PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataLenWrong,            "No CDelta_ext data for delta Bioseq", seq);    }    TSeqPos len = 0;    TSeqPos seg = 0;    ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {        ++seg;        if ( !(*sg) ) {            PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataLenWrong,                "NULL pointer in delta seq_ext valnode (segment " +                NStr::IntToString(seg) + ")", seq);            continue;        }        switch ( (**sg).Which() ) {        case CDelta_seq::e_Loc:        {            const CSeq_loc& loc = (**sg).GetLoc();             try {                size_t loc_len = GetLength(loc, m_Scope);                len += loc_len;                if ( loc_len <= 10 ) {                    string loc_str;                    loc.GetLabel(&loc_str);                    if ( loc_str.empty() ) {                        loc_str = "?";                    }                    PostErr(eDiag_Warning, eErr_SEQ_INST_SeqLocLength,                        "Short length (" + NStr::IntToString(loc_len) +                         ") on seq-loc (" + loc_str + ") of delta seq_ext", seq);                }            } catch (const CNoLength&) {                string loc_str;                loc.GetLabel(&loc_str);                if ( loc_str.empty() ) {                    loc_str = "?";                }                PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataLenWrong,                    "No length for Seq-loc (" + loc_str + ") of delta seq-ext",                    seq);            }            break;        }        case CDelta_seq::e_Literal:        {            // The C toolkit code checks for valid alphabet here            // The C++ object serializaton will not load if invalid alphabet            // so no check needed here            const CSeq_literal& lit = (*sg)->GetLiteral();            TSeqPos start_len = len;            len += lit.CanGetLength() ? lit.GetLength() : 0;            // Check for invalid residues            if ( lit.CanGetSeq_data() ) {                const CSeq_data& data = lit.GetSeq_data();                vector<TSeqPos> badIdx;                CSeqportUtil::Validate(data, &badIdx);                const string* ss = 0;                switch (data.Which()) {                case CSeq_data::e_Iupacaa:                    ss = &data.GetIupacaa().Get();                    break;                case CSeq_data::e_Iupacna:                    ss = &data.GetIupacna().Get();                    break;                case CSeq_data::e_Ncbieaa:                    ss = &data.GetNcbieaa().Get();                    break;                case CSeq_data::e_Ncbistdaa:                    {                        const vector<char>& c = data.GetNcbistdaa().Get();                        ITERATE (vector<TSeqPos>, ci, badIdx) {                            PostErr(eDiag_Error, eErr_SEQ_INST_InvalidResidue,                                "Invalid residue [" +                                NStr::IntToString((int)c[*ci]) + "] in position " +                                NStr::IntToString(*ci), seq);                        }                        break;                    }                default:                    break;                }                if ( ss ) {                    ITERATE (vector<TSeqPos>, it, badIdx) {                        PostErr(eDiag_Error, eErr_SEQ_INST_InvalidResidue,                            "Invalid residue [" +                            ss->substr(*it, 1) + "] in position " +                            NStr::IntToString(*it), seq);                    }                }                                            // Count adjacent Ns in Seq-lit for htgs_1 and htgs_2                if ( tech == CMolInfo::eTech_htgs_1  ||                      tech == CMolInfo::eTech_htgs_2 ) {                    size_t adjacent_ns = x_CountAdjacentNs(lit);                    if ( adjacent_ns > scm_AdjacentNsThreshold ) {                        PostErr(eDiag_Warning, eErr_SEQ_INST_InternalNsInSeqLit,                            "Run of " + NStr::UIntToString(adjacent_ns) +                             " Ns in delta component " + NStr::UIntToString(seg) +                            " that starts at base " + NStr::UIntToString(start_len),                            seq);                    }                }            } else if ( !lit.CanGetLength()  ||  lit.GetLength() == 0 ) {                PostErr(eDiag_Error, eErr_SEQ_INST_SeqLitGapLength0,                    "Gap of length 0 in delta chain", seq);            }                        break;        }        default:            PostErr(eDiag_Error, eErr_SEQ_INST_ExtNotAllowed,                "CDelta_seq::Which() is e_not_set", seq);        }    }    if (inst.GetLength() > len) {        PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong,            "Bioseq.seq_data too short [" + NStr::IntToString(len) +            "] for given length [" + NStr::IntToString(inst.GetLength()) +            "]", seq);    } else if (inst.GetLength() < len) {        PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong,            "Bisoeq.seq_data is larger [" + NStr::IntToString(len) +            "] than given length [" + NStr::IntToString(inst.GetLength()) +            "]", seq);    }        // Validate technique    if ( mi  &&  !m_Imp.IsNT()  &&  !m_Imp.IsNC()  &&  !m_Imp.IsGPS() ) {        if (tech != CMolInfo::eTech_unknown   &&            tech != CMolInfo::eTech_standard  &&            tech != CMolInfo::eTech_wgs       &&            tech != CMolInfo::eTech_htgs_0    &&            tech != CMolInfo::eTech_htgs_1    &&            tech != CMolInfo::eTech_htgs_2    &&            tech != CMolInfo::eTech_htgs_3      ) {            const CEnumeratedTypeValues* tv =                CMolInfo::GetTypeInfo_enum_ETech();            const string& stech = tv->FindName(mi->GetTech(), true);            PostErr(eDiag_Error, eErr_SEQ_INST_BadDeltaSeq,                "Delta seq technique should not be " + stech, seq);        }    }    EDiagSev sev = eDiag_Error;    if ( tech != CMolInfo::eTech_htgs_0  &&         tech != CMolInfo::eTech_htgs_1  &&         tech != CMolInfo::eTech_htgs_2  &&         tech != CMolInfo::eTech_htgs_3  ) {        sev = eDiag_Warning;    }    // Validate positioning of gaps    CTypeConstIterator<CDelta_seq> ds(ConstBegin(inst));    if (ds  &&  (*ds).IsLiteral()  &&  !(*ds).GetLiteral().IsSetSeq_data()) {        PostErr(sev, eErr_SEQ_INST_BadDeltaSeq,            "First delta seq component is a gap", seq);    }    bool last_is_gap = false;    unsigned int num_adjacent_gaps = 0;    unsigned int num_gaps = 0;    for (++ds; ds; ++ds) {        if ((*ds).IsLiteral()) {            if (!(*ds).GetLiteral().IsSetSeq_data()) {                if (last_is_gap) {                    num_adjacent_gaps++;                }                last_is_gap = true;                num_gaps++;            } else {                last_is_gap = false;            }        } else {            last_is_gap = false;        }    }    if ( num_adjacent_gaps > 1 ) {        PostErr(eDiag_Error, eErr_SEQ_INST_BadDeltaSeq,            "There are " + NStr::IntToString(num_adjacent_gaps) +            " adjacent gaps in delta seq", seq);    } else if ( num_adjacent_gaps == 1 ) {        PostErr(eDiag_Error, eErr_SEQ_INST_BadDeltaSeq,            "There is one adjacent gap in delta seq", seq);    }    if (last_is_gap) {        PostErr(sev, eErr_SEQ_INST_BadDeltaSeq,            "Last delta seq component is a gap", seq);    }    if (num_gaps == 0  &&  mi) {        if ( tech == CMolInfo::eTech_htgs_2  &&             !GraphsOnBioseq(seq)  &&             !x_IsActiveFin(seq) ) {            PostErr(eDiag_Warning, eErr_SEQ_INST_BadDeltaSeq,                "HTGS 2 delta seq has no gaps and no graphs", seq);        }    }}bool CValidError_bioseq::ValidateRepr(const CSeq_inst& inst, const CBioseq&   seq){    bool rtn = true;    const CEnumeratedTypeValues* tv = CSeq_inst::GetTypeInfo_enum_ERepr();    const string& rpr = tv->FindName(inst.GetRepr(), true);    const string err0 = "Bioseq-ext not allowed on " + rpr + " Bioseq";    const string err1 = "Missing or incorrect Bioseq-ext on " + rpr + " Bioseq";    const string err2 = "Missing Seq-data on " + rpr + " Bioseq";    const string err3 = "Seq-data not allowed on " + rpr + " Bioseq";    switch (inst.GetRepr()) {    case CSeq_inst::eRepr_virtual:        if (inst.IsSetExt()) {            PostErr(eDiag_Error, eErr_SEQ_INST_ExtNotAllowed, err0, seq);            rtn = false;     

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?