validerror_bioseq.cpp
来自「ncbi源码」· C++ 代码 · 共 1,861 行 · 第 1/5 页
CPP
1,861 行
glbl = "gene?"; } string plbl; const CBioseq* nuc = GetNucleotideParent(seq, m_Scope); if ( nuc ) { nuc->GetLabel(&plbl, CBioseq::eContent); } if ( IsBlankString(plbl) ) { plbl = "prot?"; } PostErr(eDiag_Error, eErr_SEQ_INST_StopInProtein, NStr::IntToString(terminations) + " termination symbols in protein sequence (" + glbl + string(" - ") + plbl + ")", seq); if (!bad_cnt) { return; } } return; }}// Assumes seq is eRepr_seg or eRepr_refvoid CValidError_bioseq::ValidateSegRef(const CBioseq& seq){ CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq); const CSeq_inst& inst = seq.GetInst(); // Validate extension data -- wrap in CSeq_loc_mix for convenience CSeq_loc loc; if ( GetLocFromSeq(seq, &loc) ) { m_Imp.ValidateSeqLoc(loc, bsh, "Segmented Bioseq", seq); } // Validate Length try { TSeqPos loclen = GetLength(loc, m_Scope); TSeqPos seqlen = inst.IsSetLength() ? inst.GetLength() : 0; if (seqlen > loclen) { PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data too short [" + NStr::IntToString(loclen) + "] for given length [" + NStr::IntToString(seqlen) + "]", seq); } else if (seqlen < loclen) { PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data is larger [" + NStr::IntToString(loclen) + "] than given length [" + NStr::IntToString(seqlen) + "]", seq); } } catch (const CNoLength&) { ERR_POST(Critical << "Unable to calculate length: "); } // Check for multiple references to the same Bioseq if (inst.IsSetExt() && inst.GetExt().IsSeg()) { const list< CRef<CSeq_loc> >& locs = inst.GetExt().GetSeg().Get(); ITERATE(list< CRef<CSeq_loc> >, i1, locs) { if (!IsOneBioseq(**i1, m_Scope)) { continue; } const CSeq_id& id1 = GetId(**i1); list< CRef<CSeq_loc> >::const_iterator i2 = i1; for (++i2; i2 != locs.end(); ++i2) { if (!IsOneBioseq(**i2, m_Scope)) { continue; } const CSeq_id& id2 = GetId(**i2); if (IsSameBioseq(id1, id2, m_Scope)) { CNcbiOstrstream os; os << id1.DumpAsFasta(); string sid(os.str()); if ((**i1).IsWhole() && (**i2).IsWhole()) { PostErr(eDiag_Error, eErr_SEQ_INST_DuplicateSegmentReferences, "Segmented sequence has multiple references to " + sid, seq); } else { PostErr(eDiag_Warning, eErr_SEQ_INST_DuplicateSegmentReferences, "Segmented sequence has multiple references to " + sid + " that are not e_Whole", seq); } } } } } // Check that partial sequence info on sequence segments is consistent with // partial sequence info on sequence -- aa sequences only int partial = SeqLocPartialCheck(loc, m_Scope); if (partial && seq.IsAa()) { bool got_partial = false; CTypeConstIterator<CSeqdesc> sd(ConstBegin(seq.GetDescr())); for (; sd; ++sd) { if (!(*sd).IsModif()) { continue; } ITERATE(list< EGIBB_mod >, md, (*sd).GetModif()) { switch (*md) { case eGIBB_mod_partial: got_partial = true; break; case eGIBB_mod_no_left: if (partial & eSeqlocPartial_Start) { PostErr(eDiag_Error, eErr_SEQ_INST_PartialInconsistent, "GIBB-mod no-left inconsistent with segmented " "SeqLoc", seq); } got_partial = true; break; case eGIBB_mod_no_right: if (partial & eSeqlocPartial_Stop) { PostErr(eDiag_Error, eErr_SEQ_INST_PartialInconsistent, "GIBB-mod no-right inconsistene with segmented " "SeqLoc", seq); } got_partial = true; break; default: break; } } } if (!got_partial) { PostErr(eDiag_Error, eErr_SEQ_INST_PartialInconsistent, "Partial segmented sequence without GIBB-mod", seq); } }}// Assumes seq is a delta sequencevoid CValidError_bioseq::ValidateDelta(const CBioseq& seq){ const CSeq_inst& inst = seq.GetInst(); // Get CMolInfo and tech used for validating technique and gap positioning const CMolInfo* mi = 0; CSeqdesc_CI mi_desc(m_Scope->GetBioseqHandle(seq), CSeqdesc::e_Molinfo); if ( mi_desc ) { mi = &(mi_desc->GetMolinfo()); } CMolInfo::TTech tech = mi != 0 ? mi->GetTech() : CMolInfo::eTech_unknown; if (!inst.IsSetExt() || !inst.GetExt().IsDelta() || inst.GetExt().GetDelta().Get().empty()) { PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataLenWrong, "No CDelta_ext data for delta Bioseq", seq); } TSeqPos len = 0; TSeqPos seg = 0; ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) { ++seg; if ( !(*sg) ) { PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataLenWrong, "NULL pointer in delta seq_ext valnode (segment " + NStr::IntToString(seg) + ")", seq); continue; } switch ( (**sg).Which() ) { case CDelta_seq::e_Loc: { const CSeq_loc& loc = (**sg).GetLoc(); try { size_t loc_len = GetLength(loc, m_Scope); len += loc_len; if ( loc_len <= 10 ) { string loc_str; loc.GetLabel(&loc_str); if ( loc_str.empty() ) { loc_str = "?"; } PostErr(eDiag_Warning, eErr_SEQ_INST_SeqLocLength, "Short length (" + NStr::IntToString(loc_len) + ") on seq-loc (" + loc_str + ") of delta seq_ext", seq); } } catch (const CNoLength&) { string loc_str; loc.GetLabel(&loc_str); if ( loc_str.empty() ) { loc_str = "?"; } PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataLenWrong, "No length for Seq-loc (" + loc_str + ") of delta seq-ext", seq); } break; } case CDelta_seq::e_Literal: { // The C toolkit code checks for valid alphabet here // The C++ object serializaton will not load if invalid alphabet // so no check needed here const CSeq_literal& lit = (*sg)->GetLiteral(); TSeqPos start_len = len; len += lit.CanGetLength() ? lit.GetLength() : 0; // Check for invalid residues if ( lit.CanGetSeq_data() ) { const CSeq_data& data = lit.GetSeq_data(); vector<TSeqPos> badIdx; CSeqportUtil::Validate(data, &badIdx); const string* ss = 0; switch (data.Which()) { case CSeq_data::e_Iupacaa: ss = &data.GetIupacaa().Get(); break; case CSeq_data::e_Iupacna: ss = &data.GetIupacna().Get(); break; case CSeq_data::e_Ncbieaa: ss = &data.GetNcbieaa().Get(); break; case CSeq_data::e_Ncbistdaa: { const vector<char>& c = data.GetNcbistdaa().Get(); ITERATE (vector<TSeqPos>, ci, badIdx) { PostErr(eDiag_Error, eErr_SEQ_INST_InvalidResidue, "Invalid residue [" + NStr::IntToString((int)c[*ci]) + "] in position " + NStr::IntToString(*ci), seq); } break; } default: break; } if ( ss ) { ITERATE (vector<TSeqPos>, it, badIdx) { PostErr(eDiag_Error, eErr_SEQ_INST_InvalidResidue, "Invalid residue [" + ss->substr(*it, 1) + "] in position " + NStr::IntToString(*it), seq); } } // Count adjacent Ns in Seq-lit for htgs_1 and htgs_2 if ( tech == CMolInfo::eTech_htgs_1 || tech == CMolInfo::eTech_htgs_2 ) { size_t adjacent_ns = x_CountAdjacentNs(lit); if ( adjacent_ns > scm_AdjacentNsThreshold ) { PostErr(eDiag_Warning, eErr_SEQ_INST_InternalNsInSeqLit, "Run of " + NStr::UIntToString(adjacent_ns) + " Ns in delta component " + NStr::UIntToString(seg) + " that starts at base " + NStr::UIntToString(start_len), seq); } } } else if ( !lit.CanGetLength() || lit.GetLength() == 0 ) { PostErr(eDiag_Error, eErr_SEQ_INST_SeqLitGapLength0, "Gap of length 0 in delta chain", seq); } break; } default: PostErr(eDiag_Error, eErr_SEQ_INST_ExtNotAllowed, "CDelta_seq::Which() is e_not_set", seq); } } if (inst.GetLength() > len) { PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data too short [" + NStr::IntToString(len) + "] for given length [" + NStr::IntToString(inst.GetLength()) + "]", seq); } else if (inst.GetLength() < len) { PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong, "Bisoeq.seq_data is larger [" + NStr::IntToString(len) + "] than given length [" + NStr::IntToString(inst.GetLength()) + "]", seq); } // Validate technique if ( mi && !m_Imp.IsNT() && !m_Imp.IsNC() && !m_Imp.IsGPS() ) { if (tech != CMolInfo::eTech_unknown && tech != CMolInfo::eTech_standard && tech != CMolInfo::eTech_wgs && tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1 && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3 ) { const CEnumeratedTypeValues* tv = CMolInfo::GetTypeInfo_enum_ETech(); const string& stech = tv->FindName(mi->GetTech(), true); PostErr(eDiag_Error, eErr_SEQ_INST_BadDeltaSeq, "Delta seq technique should not be " + stech, seq); } } EDiagSev sev = eDiag_Error; if ( tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1 && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3 ) { sev = eDiag_Warning; } // Validate positioning of gaps CTypeConstIterator<CDelta_seq> ds(ConstBegin(inst)); if (ds && (*ds).IsLiteral() && !(*ds).GetLiteral().IsSetSeq_data()) { PostErr(sev, eErr_SEQ_INST_BadDeltaSeq, "First delta seq component is a gap", seq); } bool last_is_gap = false; unsigned int num_adjacent_gaps = 0; unsigned int num_gaps = 0; for (++ds; ds; ++ds) { if ((*ds).IsLiteral()) { if (!(*ds).GetLiteral().IsSetSeq_data()) { if (last_is_gap) { num_adjacent_gaps++; } last_is_gap = true; num_gaps++; } else { last_is_gap = false; } } else { last_is_gap = false; } } if ( num_adjacent_gaps > 1 ) { PostErr(eDiag_Error, eErr_SEQ_INST_BadDeltaSeq, "There are " + NStr::IntToString(num_adjacent_gaps) + " adjacent gaps in delta seq", seq); } else if ( num_adjacent_gaps == 1 ) { PostErr(eDiag_Error, eErr_SEQ_INST_BadDeltaSeq, "There is one adjacent gap in delta seq", seq); } if (last_is_gap) { PostErr(sev, eErr_SEQ_INST_BadDeltaSeq, "Last delta seq component is a gap", seq); } if (num_gaps == 0 && mi) { if ( tech == CMolInfo::eTech_htgs_2 && !GraphsOnBioseq(seq) && !x_IsActiveFin(seq) ) { PostErr(eDiag_Warning, eErr_SEQ_INST_BadDeltaSeq, "HTGS 2 delta seq has no gaps and no graphs", seq); } }}bool CValidError_bioseq::ValidateRepr(const CSeq_inst& inst, const CBioseq& seq){ bool rtn = true; const CEnumeratedTypeValues* tv = CSeq_inst::GetTypeInfo_enum_ERepr(); const string& rpr = tv->FindName(inst.GetRepr(), true); const string err0 = "Bioseq-ext not allowed on " + rpr + " Bioseq"; const string err1 = "Missing or incorrect Bioseq-ext on " + rpr + " Bioseq"; const string err2 = "Missing Seq-data on " + rpr + " Bioseq"; const string err3 = "Seq-data not allowed on " + rpr + " Bioseq"; switch (inst.GetRepr()) { case CSeq_inst::eRepr_virtual: if (inst.IsSetExt()) { PostErr(eDiag_Error, eErr_SEQ_INST_ExtNotAllowed, err0, seq); rtn = false;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?