validerror_bioseq.cpp

来自「ncbi源码」· C++ 代码 · 共 1,861 行 · 第 1/5 页

CPP
1,861
字号
/* * =========================================================================== * PRODUCTION $Log: validerror_bioseq.cpp,v $ * PRODUCTION Revision 1000.4  2004/06/01 19:47:52  gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.79 * PRODUCTION * =========================================================================== *//*  $Id: validerror_bioseq.cpp,v 1000.4 2004/06/01 19:47:52 gouriano Exp $ * =========================================================================== * *                            PUBLIC DOMAIN NOTICE *               National Center for Biotechnology Information * *  This software/database is a "United States Government Work" under the *  terms of the United States Copyright Act.  It was written as part of *  the author's official duties as a United States Government employee and *  thus cannot be copyrighted.  This software/database is freely available *  to the public for use. The National Library of Medicine and the U.S. *  Government have not placed any restriction on its use or reproduction. * *  Although all reasonable efforts have been taken to ensure the accuracy *  and reliability of the software and data, the NLM and the U.S. *  Government do not and cannot warrant the performance or results that *  may be obtained by using this software or data. The NLM and the U.S. *  Government disclaim all warranties, express or implied, including *  warranties of performance, merchantability or fitness for any particular *  purpose. * *  Please cite the author in any work or product based on this material. * * =========================================================================== * * Author:  Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat ...... * * File Description: *   validation of bioseq  *   ....... * */#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <corelib/ncbistr.hpp>#include <corelib/ncbitime.hpp>#include "validatorp.hpp"#include "utilities.hpp"#include <serial/enumvalues.hpp>#include <serial/iterator.hpp>#include <objects/general/Date.hpp>#include <objects/general/Dbtag.hpp>#include <objects/general/Object_id.hpp>#include <objects/general/User_object.hpp>#include <objects/seqloc/Seq_loc.hpp>#include <objects/seqloc/Textseq_id.hpp>#include <objects/seq/Annotdesc.hpp>#include <objects/seq/Annot_descr.hpp>#include <objects/seq/Bioseq.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/MolInfo.hpp>#include <objects/seq/Delta_ext.hpp>#include <objects/seq/Delta_seq.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seq_ext.hpp>#include <objects/seq/Seg_ext.hpp>#include <objects/seq/Seq_hist.hpp>#include <objects/seq/Seq_hist_rec.hpp>#include <objects/seq/Seq_literal.hpp>#include <objects/seq/seqport_util.hpp>#include <objects/seq/IUPACaa.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/NCBI2na.hpp>#include <objects/seq/NCBI4na.hpp>#include <objects/seq/NCBI8aa.hpp>#include <objects/seq/NCBI8na.hpp>#include <objects/seq/NCBIeaa.hpp>#include <objects/seq/NCBIpaa.hpp>#include <objects/seq/NCBIpna.hpp>#include <objects/seq/NCBIstdaa.hpp>#include <objects/seq/GIBB_mol.hpp>#include <objects/seqfeat/Seq_feat.hpp>#include <objects/seqfeat/BioSource.hpp>#include <objects/seqfeat/Cdregion.hpp>#include <objects/seqfeat/Imp_feat.hpp>#include <objects/seqfeat/Org_ref.hpp>#include <objects/seqfeat/RNA_ref.hpp>#include <objects/seqfeat/OrgName.hpp>#include <objects/seqblock/GB_block.hpp>#include <objects/seqblock/EMBL_block.hpp>#include <objects/seqset/Seq_entry.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objects/seqres/Seq_graph.hpp>#include <objects/seqres/Real_graph.hpp>#include <objects/seqres/Int_graph.hpp>#include <objects/seqres/Byte_graph.hpp>#include <objmgr/seq_descr_ci.hpp>#include <objmgr/feat_ci.hpp>#include <objmgr/graph_ci.hpp>#include <objmgr/scope.hpp>#include <objmgr/seqdesc_ci.hpp>#include <objmgr/seq_vector.hpp>#include <objmgr/seq_vector_ci.hpp>#include <objmgr/util/sequence.hpp>#include <objmgr/util/feature.hpp>#include <objmgr/bioseq_handle.hpp>#include <objmgr/seq_entry_handle.hpp>#include <objmgr/seq_entry_ci.hpp>#include <objmgr/annot_selector.hpp>#include <objmgr/seq_feat_handle.hpp>#include <objmgr/seq_annot_handle.hpp>BEGIN_NCBI_SCOPEBEGIN_SCOPE(objects)BEGIN_SCOPE(validator)USING_SCOPE(sequence);USING_SCOPE(feature);// Maximum number of adjacent Ns in a Seq_litconst size_t CValidError_bioseq::scm_AdjacentNsThreshold = 80;// =============================================================================//                                     Public// =============================================================================CValidError_bioseq::CValidError_bioseq(CValidError_imp& imp) :    CValidError_base(imp),    m_TpaWithHistory(0), m_TpaWithoutHistory(0){}CValidError_bioseq::~CValidError_bioseq(void){}void CValidError_bioseq::ValidateSeqIds(const CBioseq& seq){    // Ensure that CBioseq has at least one CSeq_id    if ( seq.GetId().empty() ) {        PostErr(eDiag_Critical, eErr_SEQ_INST_NoIdOnBioseq,                 "No ids on a Bioseq", seq);        return;    }    CSeq_inst::ERepr repr = seq.GetInst().GetRepr();    // Loop thru CSeq_ids for this CBioseq. Determine if seq has    // gi, NT, or NC. Check that the same CSeq_id not included more    // than once.    bool has_gi = false;    ITERATE( CBioseq::TId, i, seq.GetId() ) {        // Check that no two CSeq_ids for same CBioseq are same type        CBioseq::TId::const_iterator j;        for (j = i, ++j; j != seq.GetId().end(); ++j) {            if ((**i).Compare(**j) != CSeq_id::e_DIFF) {                CNcbiOstrstream os;                os << "Conflicting ids on a Bioseq: (";                (**i).WriteAsFasta(os);                os << " - ";                (**j).WriteAsFasta(os);                os << ")";                PostErr(eDiag_Error, eErr_SEQ_INST_ConflictingIdsOnBioseq,                    CNcbiOstrstreamToString (os) /* os.str() */, seq);            }        }        CConstRef<CBioseq> core = m_Scope->GetBioseqHandle(**i).GetBioseqCore();        if ( !core ) {            if ( !m_Imp.IsPatent() ) {                PostErr(eDiag_Error, eErr_SEQ_INST_IdOnMultipleBioseqs,                    "BioseqFind (" + (*i)->AsFastaString() +                     ") unable to find itself - possible internal error", seq);            }        } else if ( core.GetPointer() != &seq ) {            PostErr(eDiag_Error, eErr_SEQ_INST_IdOnMultipleBioseqs,                "SeqID " + (*i)->AsFastaString() +                 " is present on multiple Bioseqs in record", seq);        }        if ( (*i)->IsGi() ) {            has_gi = true;        }    }    // Loop thru CSeq_ids to check formatting    bool is_wgs = false;    bool is_gb_embl_ddbj = false;    unsigned int gi_count = 0;    unsigned int accn_count = 0;    ITERATE (CBioseq::TId, k, seq.GetId()) {        const CTextseq_id* tsid = (*k)->GetTextseq_Id();        switch ((**k).Which()) {        case CSeq_id::e_Tpg:        case CSeq_id::e_Tpe:        case CSeq_id::e_Tpd:            if ( IsHistAssemblyMissing(seq)  &&  seq.IsNa() ) {                PostErr(eDiag_Error, eErr_SEQ_INST_HistAssemblyMissing,                    "TPA record " + (*k)->AsFastaString() +                    " should have Seq-hist.assembly for PRIMARY block",                     seq);            }        // Fall thru         case CSeq_id::e_Genbank:        case CSeq_id::e_Embl:        case CSeq_id::e_Ddbj:            if ( tsid  &&  tsid->IsSetAccession() ) {                const string& acc = tsid->GetAccession();                unsigned int num_digits = 0;                unsigned int num_letters = 0;                bool letter_after_digit = false;                bool bad_id_chars = false;                                           is_wgs = acc.length() == 12  ||  acc.length() == 13;                ITERATE(string, s, acc) {                    if (isupper(*s)) {                        num_letters++;                        if (num_digits > 0) {                            letter_after_digit = true;                        }                    } else if (isdigit(*s)) {                        num_digits++;                    } else {                        bad_id_chars = true;                    }                }                is_gb_embl_ddbj =                     (**k).IsGenbank()  ||  (**k).IsEmbl()  ||  (**k).IsDdbj();                if ( letter_after_digit || bad_id_chars ) {                    PostErr(eDiag_Critical, eErr_SEQ_INST_BadSeqIdFormat,                        "Bad accession: " + acc, seq);                } else if (num_letters == 1 && num_digits == 5 && seq.IsNa()) {                } else if (num_letters == 2 && num_digits == 6 && seq.IsNa()) {                } else if (num_letters == 3 && num_digits == 5 && seq.IsAa()) {                } else if (num_letters == 2 && num_digits == 6 && seq.IsAa() &&                    repr == CSeq_inst::eRepr_seg) {                } else if ( num_letters == 4  &&                             (num_digits == 8  ||  num_digits == 9)  &&                             seq.IsNa()  &&                            is_gb_embl_ddbj ) {                } else {                    PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat,                        "Bad accession: " + acc, seq);                }                                    // Check for secondary conflicts                if ( seq.GetFirstId() ) {                    ValidateSecondaryAccConflict(acc, seq, CSeqdesc::e_Genbank);                    ValidateSecondaryAccConflict(acc, seq, CSeqdesc::e_Embl);                }                if ( has_gi ) {                    if ( tsid->IsSetVersion()  &&  tsid->GetVersion() == 0 ) {                        PostErr(eDiag_Critical, eErr_SEQ_INST_BadSeqIdFormat,                            "Accession " + acc + " has 0 version", seq);                    }                }            }        // Fall thru        case CSeq_id::e_Other:            if ( tsid ) {                if ( tsid->IsSetName() ) {                    const string& name = tsid->GetName();                    ITERATE (string, s, name) {                        if (isspace(*s)) {                            PostErr(eDiag_Critical,                                eErr_SEQ_INST_SeqIdNameHasSpace,                                "Seq-id.name " + name + " should be a single "                                "word without any spaces", seq);                            break;                        }                    }                }                if ( tsid->IsSetAccession()  &&  (*k)->IsOther() ) {                    const string& acc = tsid->GetAccession();                    size_t num_letters = 0;                    size_t num_digits = 0;                    size_t num_underscores = 0;                    bool bad_id_chars = false;                    bool is_NZ = (NStr::CompareNocase(acc, 0, 3, "NZ_") == 0);                    size_t i = is_NZ ? 3 : 0;                    bool letter_after_digit = false;                    for ( ; i < acc.length(); ++i ) {                        if ( isupper(acc[i]) ) {                            num_letters++;                        } else if ( isdigit(acc[i]) ) {                            num_digits++;                        } else if ( acc[i] == '_' ) {                            num_underscores++;                            if ( num_digits > 0  ||  num_underscores > 1 ) {                                letter_after_digit = true;                            }                        } else {                            bad_id_chars = true;                        }                    }                    if ( letter_after_digit  ||  bad_id_chars ) {                        PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat,                            "Bad accession " + acc, seq);                    } else if ( is_NZ  &&  num_letters == 4  &&                         num_digits == 8  &&  num_underscores == 0 ) {                        // valid accession - do nothing!                    } else if ( num_letters == 2  &&                        (num_digits == 6  ||  num_digits == 8  || num_digits == 9)  &&                        num_underscores == 1 ) {                        // valid accession - do nothing!                    } else {                        PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat,                            "Bad accession " + acc, seq);                    }                }                if ( has_gi && !tsid->IsSetAccession() && tsid->IsSetName() ) {                    EDiagSev sev = eDiag_Critical;                    // Report ddbj segmented sequence missing accesions as                     // warning, not critical.                    if ( (*k)->IsDdbj()  &&  repr == CSeq_inst::eRepr_seg ) {                        sev = eDiag_Warning;                    }                    PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat,                        "Missing accession for " + tsid->GetName(), seq);                }            }            // Fall thru        case CSeq_id::e_Pir:        case CSeq_id::e_Swissprot:        case CSeq_id::e_Prf:            if ( tsid ) {                if ( seq.IsNa()  &&                       (!tsid->IsSetAccession() || tsid->GetAccession().empty())) {                    if ( repr != CSeq_inst::eRepr_seg  ||                        m_Imp.IsGI()) {                        if (!(**k).IsDdbj()  ||                            repr != CSeq_inst::eRepr_seg) {                            CNcbiOstrstream os;                            os << "Missing accession for " << (**k).DumpAsFasta();                            PostErr(eDiag_Error,                                eErr_SEQ_INST_BadSeqIdFormat,                                string(os.str()), seq);                        }                    }                }                accn_count++;            } else {                PostErr(eDiag_Critical, eErr_SEQ_INST_BadSeqIdFormat,                    "Seq-id type not handled", seq);            }            break;                case CSeq_id::e_Patent:            break;        case CSeq_id::e_Pdb:            break;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?