validerror_bioseq.cpp
来自「ncbi源码」· C++ 代码 · 共 1,861 行 · 第 1/5 页
CPP
1,861 行
case CSeq_id::e_Gi: if ((*k)->GetGi() <= 0) { PostErr(eDiag_Critical, eErr_SEQ_INST_ZeroGiNumber, "Invalid GI number", seq); } gi_count++; break; case CSeq_id::e_General: break; default: break; } } CTypeConstIterator<CMolInfo> mi(ConstBegin(seq)); if ( is_wgs ) { if ( !mi || !mi->CanGetTech() || mi->GetTech() != CMolInfo::eTech_wgs ) { PostErr(eDiag_Error, eErr_SEQ_DESCR_Inconsistent, "WGS accession should have Mol-info.tech of wgs", seq); } } else if ( mi && mi->CanGetTech() && mi->GetTech() == CMolInfo::eTech_wgs && is_gb_embl_ddbj ) { PostErr(eDiag_Error, eErr_SEQ_DESCR_Inconsistent, "Mol-info.tech of wgs should have WGS accession", seq); } // Check that a sequence with a gi number has exactly one accession if ( gi_count > 0 && accn_count == 0 && !m_Imp.IsPDB() && repr != CSeq_inst::eRepr_virtual ) { PostErr(eDiag_Error, eErr_SEQ_INST_GiWithoutAccession, "No accession on sequence with gi number", seq); } if (gi_count > 0 && accn_count > 1) { PostErr(eDiag_Error, eErr_SEQ_INST_MultipleAccessions, "Multiple accessions on sequence with gi number", seq); } // Protein specific checks if ( seq.IsAa() ) { ITERATE( CBioseq::TId, id, seq.GetId() ) { switch ( (*id)->Which() ) { case CSeq_id::e_Genbank: case CSeq_id::e_Embl: case CSeq_id::e_Ddbj: case CSeq_id::e_Tpg: case CSeq_id::e_Tpe: case CSeq_id::e_Tpd: { const CTextseq_id* tsid = (*id)->GetTextseq_Id(); if ( tsid != NULL ) { if ( !tsid->IsSetAccession() && tsid->IsSetName() ) { if ( m_Imp.IsNucAcc(tsid->GetName()) ) { PostErr(eDiag_Warning, eErr_SEQ_INST_BadSeqIdFormat, "Protein bioseq has Textseq-id 'name' that" "looks like it is derived from a nucleotide" "accession", seq); } } } } break; default: break; } } } if ( m_Imp.IsValidateIdSet() ) { ValidateIDSetAgainstDb(seq); } // C toolkit ensures that there is exactly one CBioseq for a CSeq_id // Not done here because object manager will not allow // the same Seq-id on multiple Bioseqs}bool CValidError_bioseq::IsHistAssemblyMissing(const CBioseq& seq){ const CSeq_inst& inst = seq.GetInst(); CSeq_inst::TRepr repr = inst.CanGetRepr() ? inst.GetRepr() : CSeq_inst::eRepr_not_set; if ( !inst.CanGetHist() || !inst.GetHist().CanGetAssembly() ) { if ( seq.IsNa() && repr != CSeq_inst::eRepr_seg ) { return true; } } return false;}void CValidError_bioseq::ValidateSecondaryAccConflict(const string &primary_acc, const CBioseq &seq, int choice){ CSeq_descr_CI ds(m_Scope->GetBioseqHandle(seq)); CSeqdesc_CI sd(ds, static_cast<CSeqdesc::E_Choice>(choice)); for (; sd; ++sd) { const list< string > *extra_acc = 0; if ( choice == CSeqdesc::e_Genbank && sd->GetGenbank().IsSetExtra_accessions() ) { extra_acc = &(sd->GetGenbank().GetExtra_accessions()); } else if ( choice == CSeqdesc::e_Embl && sd->GetEmbl().IsSetExtra_acc() ) { extra_acc = &(sd->GetEmbl().GetExtra_acc()); } if ( extra_acc ) { ITERATE( list<string>, acc, *extra_acc ) { if ( NStr::CompareNocase(primary_acc, *acc) == 0 ) { // If the same post error PostErr(eDiag_Error, eErr_SEQ_INST_BadSecondaryAccn, primary_acc + " used for both primary and" " secondary accession", seq); } } } }}void CValidError_bioseq::ValidateInst(const CBioseq& seq){ const CSeq_inst& inst = seq.GetInst(); // Check representation if ( !ValidateRepr(inst, seq) ) { return; } // Check molecule, topology, and strand const CSeq_inst::EMol& mol = inst.GetMol(); switch (mol) { case CSeq_inst::eMol_na: PostErr(eDiag_Error, eErr_SEQ_INST_MolNuclAcid, "Bioseq.mol is type na", seq); break; case CSeq_inst::eMol_aa: if ( inst.IsSetTopology() && inst.GetTopology() != CSeq_inst::eTopology_not_set && inst.GetTopology() != CSeq_inst::eTopology_linear ) { PostErr(eDiag_Error, eErr_SEQ_INST_CircularProtein, "Non-linear topology set on protein", seq); } if ( inst.IsSetStrand() && inst.GetStrand() != CSeq_inst::eStrand_ss ) { PostErr(eDiag_Error, eErr_SEQ_INST_DSProtein, "Protein not single stranded", seq); } break; case CSeq_inst::eMol_not_set: PostErr(eDiag_Error, eErr_SEQ_INST_MolNotSet, "Bioseq.mol not set", seq); break; case CSeq_inst::eMol_other: PostErr(eDiag_Error, eErr_SEQ_INST_MolOther, "Bioseq.mol is type other", seq); break; default: break; } CSeq_inst::ERepr rp = seq.GetInst().GetRepr(); if (rp == CSeq_inst::eRepr_raw || rp == CSeq_inst::eRepr_const) { // Validate raw and constructed sequences ValidateRawConst(seq); } if (rp == CSeq_inst::eRepr_seg || rp == CSeq_inst::eRepr_ref) { // Validate segmented and reference sequences ValidateSegRef(seq); } if (rp == CSeq_inst::eRepr_delta) { // Validate delta sequences ValidateDelta(seq); } if (rp == CSeq_inst::eRepr_seg && seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsSeg()) { // Validate part of segmented sequence ValidateSeqParts(seq); } if ( seq.IsAa() ) { // Validate protein title (amino acids only) ValidateProteinTitle(seq); } if ( seq.IsNa() ) { // check for N bases at start or stop of sequence ValidateNs(seq); } // Validate sequence length ValidateSeqLen(seq);}void CValidError_bioseq::ValidateBioseqContext(const CBioseq& seq){ CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq); // Get Molinfo CTypeConstIterator<CMolInfo> mi(ConstBegin(seq)); if ( mi ) { x_ValidateCompletness(seq, *mi); if ( mi->IsSetTech() ) { switch (mi->GetTech()) { case CMolInfo::eTech_sts: case CMolInfo::eTech_survey: case CMolInfo::eTech_wgs: case CMolInfo::eTech_htgs_0: case CMolInfo::eTech_htgs_1: case CMolInfo::eTech_htgs_2: case CMolInfo::eTech_htgs_3: if (mi->GetTech() == CMolInfo::eTech_sts && seq.GetInst().GetMol() == CSeq_inst::eMol_rna && mi->IsSetBiomol() && mi->GetBiomol() == CMolInfo::eBiomol_mRNA) { // !!! // Ok, there are some STS sequences derived from // cDNAs, so do not report these } else if (mi->IsSetBiomol() && mi->GetBiomol() != CMolInfo::eBiomol_genomic) { PostErr(eDiag_Error, eErr_SEQ_INST_ConflictingBiomolTech, "HTGS/STS/GSS/WGS sequence should be genomic", seq); } else if (seq.GetInst().GetMol() != CSeq_inst::eMol_dna && seq.GetInst().GetMol() != CSeq_inst::eMol_na) { PostErr(eDiag_Error, eErr_SEQ_INST_ConflictingBiomolTech, "HTGS/STS/GSS/WGS sequence should not be RNA", seq); } break; default: break; } } } // Check that proteins in nuc_prot set have a CdRegion if ( CdError(bsh) ) { PostErr(eDiag_Error, eErr_SEQ_PKG_NoCdRegionPtr, "No CdRegion in nuc-prot set points to this protein", seq); } // Check that gene on non-segmented sequence does not have // multiple intervals ValidateMultiIntervalGene(seq); ValidateSeqFeatContext(seq); // Check for duplicate features and overlapping peptide features. ValidateDupOrOverlapFeats(seq); // Check for colliding genes ValidateCollidingGenes(seq); if ( seq.IsSetDescr() ) { ValidateSeqDescContext(seq); } // make sure that there is a pub on this bioseq if ( !m_Imp.IsNoPubs() ) { CheckForPubOnBioseq(seq); } // make sure that there is a source on this bioseq if ( !m_Imp.IsNoBioSource() ) { CheckForBiosourceOnBioseq(seq); } // flag missing molinfo even if not in Sequin CheckForMolinfoOnBioseq(seq); ValidateGraphsOnBioseq(seq); CheckTpaHistory(seq); if ( IsMrna(bsh) ) { ValidatemRNABioseqContext(bsh); }}static bool s_EqualGene_ref(const CGene_ref& genomic, const CGene_ref& mrna){ bool locus = (!genomic.CanGetLocus() && !mrna.CanGetLocus()) || (genomic.CanGetLocus() && mrna.CanGetLocus() && genomic.GetLocus() == mrna.GetLocus()); bool allele = (!genomic.CanGetAllele() && !mrna.CanGetAllele()) || (genomic.CanGetAllele() && mrna.CanGetAllele() && genomic.GetAllele() == mrna.GetAllele()); bool desc = (!genomic.CanGetDesc() && !mrna.CanGetDesc()) || (genomic.CanGetDesc() && mrna.CanGetDesc() && genomic.GetDesc() == mrna.GetDesc()); bool locus_tag = (!genomic.CanGetLocus_tag() && !mrna.CanGetLocus_tag()) || (genomic.CanGetLocus_tag() && mrna.CanGetLocus_tag() && genomic.GetLocus_tag() == mrna.GetLocus_tag()); return locus && allele && desc && locus_tag;}void CValidError_bioseq::ValidatemRNABioseqContext(const CBioseq_Handle& seq){ // check that there is no conflict between the gene on the genomic // and the gene on the mrna. const CSeq_feat* mrna = GetmRNAForProduct(seq); const CGene_ref* genomicgrp = 0; if ( mrna != 0 ) { genomicgrp = mrna->GetGeneXref(); if ( genomicgrp == 0 ) { const CSeq_feat* gene = GetOverlappingGene(mrna->GetLocation(), *m_Scope); if ( gene != 0 ) { genomicgrp = &gene->GetData().GetGene(); } } if ( genomicgrp != 0 ) { CFeat_CI mrna_gene(seq, 0, 0, CSeqFeatData::e_Gene); if ( mrna_gene ) { const CGene_ref& mrnagrp = mrna_gene->GetData().GetGene(); if ( !s_EqualGene_ref(*genomicgrp, mrnagrp) ) { PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenesInconsistent, "Gene on mRNA bioseq does not match gene on genomic bioseq", mrna_gene->GetOriginalFeature()); } } } }}void CValidError_bioseq::ValidateHistory(const CBioseq& seq){ if ( !seq.GetInst().IsSetHist() ) { return; } int gi = 0; ITERATE( CBioseq::TId, id, seq.GetId() ) { if ( (*id)->IsGi() ) { gi = (*id)->GetGi(); break; } } if ( gi == 0 ) { return; } const CSeq_hist& hist = seq.GetInst().GetHist(); if ( hist.IsSetReplaced_by() ) { const CSeq_hist_rec& rec = hist.GetReplaced_by(); ITERATE( CSeq_hist_rec::TIds, id, rec.GetIds() ) { if ( (*id)->IsGi() ) { if ( gi == (*id)->GetGi() ) { PostErr(eDiag_Error, eErr_SEQ_INST_HistoryGiCollision, "Replaced by gi (" +
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?