validerror_bioseq.cpp

来自「ncbi源码」· C++ 代码 · 共 1,861 行 · 第 1/5 页

CPP
1,861
字号
        case CSeq_id::e_Gi:            if ((*k)->GetGi() <= 0) {                PostErr(eDiag_Critical, eErr_SEQ_INST_ZeroGiNumber,                         "Invalid GI number", seq);            }            gi_count++;            break;        case CSeq_id::e_General:            break;        default:            break;        }    }    CTypeConstIterator<CMolInfo> mi(ConstBegin(seq));    if ( is_wgs ) {        if ( !mi  ||  !mi->CanGetTech()  ||              mi->GetTech() != CMolInfo::eTech_wgs ) {            PostErr(eDiag_Error, eErr_SEQ_DESCR_Inconsistent,                 "WGS accession should have Mol-info.tech of wgs", seq);        }    } else if ( mi  &&  mi->CanGetTech()  &&                  mi->GetTech() == CMolInfo::eTech_wgs  &&  is_gb_embl_ddbj ) {        PostErr(eDiag_Error, eErr_SEQ_DESCR_Inconsistent,            "Mol-info.tech of wgs should have WGS accession", seq);    }    // Check that a sequence with a gi number has exactly one accession    if ( gi_count > 0  &&  accn_count == 0  &&  !m_Imp.IsPDB()  &&           repr != CSeq_inst::eRepr_virtual ) {        PostErr(eDiag_Error, eErr_SEQ_INST_GiWithoutAccession,            "No accession on sequence with gi number", seq);    }    if (gi_count > 0  &&  accn_count > 1) {        PostErr(eDiag_Error, eErr_SEQ_INST_MultipleAccessions,            "Multiple accessions on sequence with gi number", seq);    }    // Protein specific checks    if ( seq.IsAa() ) {        ITERATE( CBioseq::TId, id, seq.GetId() ) {            switch ( (*id)->Which() ) {            case CSeq_id::e_Genbank:            case CSeq_id::e_Embl:            case CSeq_id::e_Ddbj:            case CSeq_id::e_Tpg:            case CSeq_id::e_Tpe:            case CSeq_id::e_Tpd:                {                    const CTextseq_id* tsid = (*id)->GetTextseq_Id();                    if ( tsid != NULL ) {                        if ( !tsid->IsSetAccession()  &&  tsid->IsSetName() ) {                            if ( m_Imp.IsNucAcc(tsid->GetName()) ) {                                PostErr(eDiag_Warning, eErr_SEQ_INST_BadSeqIdFormat,                                    "Protein bioseq has Textseq-id 'name' that"                                    "looks like it is derived from a nucleotide"                                    "accession", seq);                            }                        }                    }                }                break;            default:                break;            }        }    }    if ( m_Imp.IsValidateIdSet() ) {        ValidateIDSetAgainstDb(seq);    }    // C toolkit ensures that there is exactly one CBioseq for a CSeq_id    // Not done here because object manager will not allow    // the same Seq-id on multiple Bioseqs}bool CValidError_bioseq::IsHistAssemblyMissing(const CBioseq& seq){    const CSeq_inst& inst = seq.GetInst();    CSeq_inst::TRepr repr = inst.CanGetRepr() ?        inst.GetRepr() : CSeq_inst::eRepr_not_set;    if ( !inst.CanGetHist()  ||  !inst.GetHist().CanGetAssembly() ) {        if ( seq.IsNa()  &&  repr != CSeq_inst::eRepr_seg ) {            return true;        }    }    return false;}void CValidError_bioseq::ValidateSecondaryAccConflict(const string &primary_acc, const CBioseq &seq, int choice){    CSeq_descr_CI ds(m_Scope->GetBioseqHandle(seq));    CSeqdesc_CI sd(ds, static_cast<CSeqdesc::E_Choice>(choice));    for (; sd; ++sd) {        const list< string > *extra_acc = 0;        if ( choice == CSeqdesc::e_Genbank  &&            sd->GetGenbank().IsSetExtra_accessions() ) {            extra_acc = &(sd->GetGenbank().GetExtra_accessions());        } else if ( choice == CSeqdesc::e_Embl  &&            sd->GetEmbl().IsSetExtra_acc() ) {            extra_acc = &(sd->GetEmbl().GetExtra_acc());        }        if ( extra_acc ) {            ITERATE( list<string>, acc, *extra_acc ) {                if ( NStr::CompareNocase(primary_acc, *acc) == 0 ) {                    // If the same post error                    PostErr(eDiag_Error,                        eErr_SEQ_INST_BadSecondaryAccn,                        primary_acc + " used for both primary and"                        " secondary accession", seq);                }            }        }    }}void CValidError_bioseq::ValidateInst(const CBioseq& seq){    const CSeq_inst& inst = seq.GetInst();    // Check representation    if ( !ValidateRepr(inst, seq) ) {        return;    }    // Check molecule, topology, and strand    const CSeq_inst::EMol& mol = inst.GetMol();    switch (mol) {        case CSeq_inst::eMol_na:            PostErr(eDiag_Error, eErr_SEQ_INST_MolNuclAcid,                     "Bioseq.mol is type na", seq);            break;        case CSeq_inst::eMol_aa:            if ( inst.IsSetTopology()  &&                 inst.GetTopology() != CSeq_inst::eTopology_not_set  &&                 inst.GetTopology() != CSeq_inst::eTopology_linear ) {                PostErr(eDiag_Error, eErr_SEQ_INST_CircularProtein,                         "Non-linear topology set on protein", seq);            }            if ( inst.IsSetStrand()  &&                 inst.GetStrand() != CSeq_inst::eStrand_ss ) {                PostErr(eDiag_Error, eErr_SEQ_INST_DSProtein,                         "Protein not single stranded", seq);            }            break;        case CSeq_inst::eMol_not_set:            PostErr(eDiag_Error, eErr_SEQ_INST_MolNotSet, "Bioseq.mol not set",                seq);            break;        case CSeq_inst::eMol_other:            PostErr(eDiag_Error, eErr_SEQ_INST_MolOther,                     "Bioseq.mol is type other", seq);            break;        default:            break;    }    CSeq_inst::ERepr rp = seq.GetInst().GetRepr();    if (rp == CSeq_inst::eRepr_raw  ||  rp == CSeq_inst::eRepr_const) {            // Validate raw and constructed sequences        ValidateRawConst(seq);    }    if (rp == CSeq_inst::eRepr_seg  ||  rp == CSeq_inst::eRepr_ref) {        // Validate segmented and reference sequences        ValidateSegRef(seq);    }    if (rp == CSeq_inst::eRepr_delta) {        // Validate delta sequences        ValidateDelta(seq);    }    if (rp == CSeq_inst::eRepr_seg  &&  seq.GetInst().IsSetExt()  &&        seq.GetInst().GetExt().IsSeg()) {        // Validate part of segmented sequence        ValidateSeqParts(seq);    }        if ( seq.IsAa() ) {        // Validate protein title (amino acids only)        ValidateProteinTitle(seq);    }        if ( seq.IsNa() ) {        // check for N bases at start or stop of sequence        ValidateNs(seq);    }    // Validate sequence length    ValidateSeqLen(seq);}void CValidError_bioseq::ValidateBioseqContext(const CBioseq& seq){    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);    // Get Molinfo    CTypeConstIterator<CMolInfo> mi(ConstBegin(seq));    if ( mi ) {        x_ValidateCompletness(seq, *mi);        if ( mi->IsSetTech() ) {            switch (mi->GetTech()) {            case CMolInfo::eTech_sts:            case CMolInfo::eTech_survey:            case CMolInfo::eTech_wgs:            case CMolInfo::eTech_htgs_0:            case CMolInfo::eTech_htgs_1:            case CMolInfo::eTech_htgs_2:            case CMolInfo::eTech_htgs_3:                if (mi->GetTech() == CMolInfo::eTech_sts  &&                    seq.GetInst().GetMol() == CSeq_inst::eMol_rna  &&                    mi->IsSetBiomol()  &&                    mi->GetBiomol() == CMolInfo::eBiomol_mRNA) {                    // !!!                    // Ok, there are some STS sequences derived from                     // cDNAs, so do not report these                } else if (mi->IsSetBiomol()  &&                    mi->GetBiomol() != CMolInfo::eBiomol_genomic) {                    PostErr(eDiag_Error, eErr_SEQ_INST_ConflictingBiomolTech,                        "HTGS/STS/GSS/WGS sequence should be genomic", seq);                } else if (seq.GetInst().GetMol() != CSeq_inst::eMol_dna  &&                    seq.GetInst().GetMol() != CSeq_inst::eMol_na) {                    PostErr(eDiag_Error, eErr_SEQ_INST_ConflictingBiomolTech,                        "HTGS/STS/GSS/WGS sequence should not be RNA", seq);                }                break;            default:                break;            }        }    }            // Check that proteins in nuc_prot set have a CdRegion    if ( CdError(bsh) ) {        PostErr(eDiag_Error, eErr_SEQ_PKG_NoCdRegionPtr,            "No CdRegion in nuc-prot set points to this protein",             seq);    }    // Check that gene on non-segmented sequence does not have    // multiple intervals    ValidateMultiIntervalGene(seq);    ValidateSeqFeatContext(seq);    // Check for duplicate features and overlapping peptide features.    ValidateDupOrOverlapFeats(seq);    // Check for colliding genes    ValidateCollidingGenes(seq);    if ( seq.IsSetDescr() ) {        ValidateSeqDescContext(seq);    }    // make sure that there is a pub on this bioseq    if ( !m_Imp.IsNoPubs() ) {          CheckForPubOnBioseq(seq);    }    // make sure that there is a source on this bioseq    if ( !m_Imp.IsNoBioSource() ) {         CheckForBiosourceOnBioseq(seq);    }        // flag missing molinfo even if not in Sequin    CheckForMolinfoOnBioseq(seq);    ValidateGraphsOnBioseq(seq);    CheckTpaHistory(seq);        if ( IsMrna(bsh) ) {        ValidatemRNABioseqContext(bsh);    }}static bool s_EqualGene_ref(const CGene_ref& genomic, const CGene_ref& mrna){    bool locus = (!genomic.CanGetLocus()  &&  !mrna.CanGetLocus())  ||        (genomic.CanGetLocus()  &&  mrna.CanGetLocus()  &&        genomic.GetLocus() == mrna.GetLocus());    bool allele = (!genomic.CanGetAllele()  &&  !mrna.CanGetAllele())  ||        (genomic.CanGetAllele()  &&  mrna.CanGetAllele()  &&        genomic.GetAllele() == mrna.GetAllele());    bool desc = (!genomic.CanGetDesc()  &&  !mrna.CanGetDesc())  ||        (genomic.CanGetDesc()  &&  mrna.CanGetDesc()  &&        genomic.GetDesc() == mrna.GetDesc());    bool locus_tag = (!genomic.CanGetLocus_tag()  &&  !mrna.CanGetLocus_tag())  ||        (genomic.CanGetLocus_tag()  &&  mrna.CanGetLocus_tag()  &&        genomic.GetLocus_tag() == mrna.GetLocus_tag());    return locus  &&  allele  &&  desc  && locus_tag;}void CValidError_bioseq::ValidatemRNABioseqContext(const CBioseq_Handle& seq){    // check that there is no conflict between the gene on the genomic     // and the gene on the mrna.    const CSeq_feat* mrna = GetmRNAForProduct(seq);    const CGene_ref* genomicgrp = 0;    if ( mrna != 0 ) {        genomicgrp = mrna->GetGeneXref();        if ( genomicgrp == 0 ) {            const CSeq_feat* gene =                 GetOverlappingGene(mrna->GetLocation(), *m_Scope);            if ( gene != 0 ) {                genomicgrp = &gene->GetData().GetGene();            }        }        if ( genomicgrp != 0 ) {            CFeat_CI mrna_gene(seq, 0, 0, CSeqFeatData::e_Gene);            if ( mrna_gene ) {                const CGene_ref& mrnagrp = mrna_gene->GetData().GetGene();                if ( !s_EqualGene_ref(*genomicgrp, mrnagrp) ) {                    PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenesInconsistent,                        "Gene on mRNA bioseq does not match gene on genomic bioseq",                        mrna_gene->GetOriginalFeature());                }            }        }    }}void CValidError_bioseq::ValidateHistory(const CBioseq& seq){    if ( !seq.GetInst().IsSetHist() ) {        return;    }        int gi = 0;    ITERATE( CBioseq::TId, id, seq.GetId() ) {        if ( (*id)->IsGi() ) {            gi = (*id)->GetGi();            break;        }    }    if ( gi == 0 ) {        return;    }    const CSeq_hist& hist = seq.GetInst().GetHist();    if ( hist.IsSetReplaced_by() ) {        const CSeq_hist_rec& rec = hist.GetReplaced_by();        ITERATE( CSeq_hist_rec::TIds, id, rec.GetIds() ) {            if ( (*id)->IsGi() ) {                if ( gi == (*id)->GetGi() ) {                    PostErr(eDiag_Error, eErr_SEQ_INST_HistoryGiCollision,                        "Replaced by gi (" + 

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?