validerror_bioseq.cpp
来自「ncbi源码」· C++ 代码 · 共 1,861 行 · 第 1/5 页
CPP
1,861 行
/*
 * ===========================================================================
 * PRODUCTION $Log: validerror_bioseq.cpp,v $
 * PRODUCTION Revision 1000.4 2004/06/01 19:47:52 gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.79
 * PRODUCTION
 * ===========================================================================
 */
/* $Id: validerror_bioseq.cpp,v 1000.4 2004/06/01 19:47:52 gouriano Exp $
 * ===========================================================================
 *
 * PUBLIC DOMAIN NOTICE
 * National Center for Biotechnology Information
 *
 * This software/database is a "United States Government Work" under the
 * terms of the United States Copyright Act. It was written as part of
 * the author's official duties as a United States Government employee and
 * thus cannot be copyrighted. This software/database is freely available
 * to the public for use. The National Library of Medicine and the U.S.
 * Government have not placed any restriction on its use or reproduction.
 *
 * Although all reasonable efforts have been taken to ensure the accuracy
 * and reliability of the software and data, the NLM and the U.S.
 * Government do not and cannot warrant the performance or results that
 * may be obtained by using this software or data. The NLM and the U.S.
 * Government disclaim all warranties, express or implied, including
 * warranties of performance, merchantability or fitness for any particular
 * purpose.
 *
 * Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat ......
 *
 * File Description:
 * validation of bioseq
 * .......
* */#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <corelib/ncbistr.hpp>#include <corelib/ncbitime.hpp>#include "validatorp.hpp"#include "utilities.hpp"#include <serial/enumvalues.hpp>#include <serial/iterator.hpp>#include <objects/general/Date.hpp>#include <objects/general/Dbtag.hpp>#include <objects/general/Object_id.hpp>#include <objects/general/User_object.hpp>#include <objects/seqloc/Seq_loc.hpp>#include <objects/seqloc/Textseq_id.hpp>#include <objects/seq/Annotdesc.hpp>#include <objects/seq/Annot_descr.hpp>#include <objects/seq/Bioseq.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/MolInfo.hpp>#include <objects/seq/Delta_ext.hpp>#include <objects/seq/Delta_seq.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seq_ext.hpp>#include <objects/seq/Seg_ext.hpp>#include <objects/seq/Seq_hist.hpp>#include <objects/seq/Seq_hist_rec.hpp>#include <objects/seq/Seq_literal.hpp>#include <objects/seq/seqport_util.hpp>#include <objects/seq/IUPACaa.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/NCBI2na.hpp>#include <objects/seq/NCBI4na.hpp>#include <objects/seq/NCBI8aa.hpp>#include <objects/seq/NCBI8na.hpp>#include <objects/seq/NCBIeaa.hpp>#include <objects/seq/NCBIpaa.hpp>#include <objects/seq/NCBIpna.hpp>#include <objects/seq/NCBIstdaa.hpp>#include <objects/seq/GIBB_mol.hpp>#include <objects/seqfeat/Seq_feat.hpp>#include <objects/seqfeat/BioSource.hpp>#include <objects/seqfeat/Cdregion.hpp>#include <objects/seqfeat/Imp_feat.hpp>#include <objects/seqfeat/Org_ref.hpp>#include <objects/seqfeat/RNA_ref.hpp>#include <objects/seqfeat/OrgName.hpp>#include <objects/seqblock/GB_block.hpp>#include <objects/seqblock/EMBL_block.hpp>#include <objects/seqset/Seq_entry.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objects/seqres/Seq_graph.hpp>#include <objects/seqres/Real_graph.hpp>#include <objects/seqres/Int_graph.hpp>#include <objects/seqres/Byte_graph.hpp>#include <objmgr/seq_descr_ci.hpp>#include 
<objmgr/feat_ci.hpp>#include <objmgr/graph_ci.hpp>#include <objmgr/scope.hpp>#include <objmgr/seqdesc_ci.hpp>#include <objmgr/seq_vector.hpp>#include <objmgr/seq_vector_ci.hpp>#include <objmgr/util/sequence.hpp>#include <objmgr/util/feature.hpp>#include <objmgr/bioseq_handle.hpp>#include <objmgr/seq_entry_handle.hpp>#include <objmgr/seq_entry_ci.hpp>#include <objmgr/annot_selector.hpp>#include <objmgr/seq_feat_handle.hpp>#include <objmgr/seq_annot_handle.hpp>BEGIN_NCBI_SCOPEBEGIN_SCOPE(objects)BEGIN_SCOPE(validator)USING_SCOPE(sequence);USING_SCOPE(feature);// Maximum number of adjacent Ns in a Seq_litconst size_t CValidError_bioseq::scm_AdjacentNsThreshold = 80;// =============================================================================// Public// =============================================================================CValidError_bioseq::CValidError_bioseq(CValidError_imp& imp) : CValidError_base(imp), m_TpaWithHistory(0), m_TpaWithoutHistory(0){}CValidError_bioseq::~CValidError_bioseq(void){}void CValidError_bioseq::ValidateSeqIds(const CBioseq& seq){ // Ensure that CBioseq has at least one CSeq_id if ( seq.GetId().empty() ) { PostErr(eDiag_Critical, eErr_SEQ_INST_NoIdOnBioseq, "No ids on a Bioseq", seq); return; } CSeq_inst::ERepr repr = seq.GetInst().GetRepr(); // Loop thru CSeq_ids for this CBioseq. Determine if seq has // gi, NT, or NC. Check that the same CSeq_id not included more // than once. 
bool has_gi = false; ITERATE( CBioseq::TId, i, seq.GetId() ) { // Check that no two CSeq_ids for same CBioseq are same type CBioseq::TId::const_iterator j; for (j = i, ++j; j != seq.GetId().end(); ++j) { if ((**i).Compare(**j) != CSeq_id::e_DIFF) { CNcbiOstrstream os; os << "Conflicting ids on a Bioseq: ("; (**i).WriteAsFasta(os); os << " - "; (**j).WriteAsFasta(os); os << ")"; PostErr(eDiag_Error, eErr_SEQ_INST_ConflictingIdsOnBioseq, CNcbiOstrstreamToString (os) /* os.str() */, seq); } } CConstRef<CBioseq> core = m_Scope->GetBioseqHandle(**i).GetBioseqCore(); if ( !core ) { if ( !m_Imp.IsPatent() ) { PostErr(eDiag_Error, eErr_SEQ_INST_IdOnMultipleBioseqs, "BioseqFind (" + (*i)->AsFastaString() + ") unable to find itself - possible internal error", seq); } } else if ( core.GetPointer() != &seq ) { PostErr(eDiag_Error, eErr_SEQ_INST_IdOnMultipleBioseqs, "SeqID " + (*i)->AsFastaString() + " is present on multiple Bioseqs in record", seq); } if ( (*i)->IsGi() ) { has_gi = true; } } // Loop thru CSeq_ids to check formatting bool is_wgs = false; bool is_gb_embl_ddbj = false; unsigned int gi_count = 0; unsigned int accn_count = 0; ITERATE (CBioseq::TId, k, seq.GetId()) { const CTextseq_id* tsid = (*k)->GetTextseq_Id(); switch ((**k).Which()) { case CSeq_id::e_Tpg: case CSeq_id::e_Tpe: case CSeq_id::e_Tpd: if ( IsHistAssemblyMissing(seq) && seq.IsNa() ) { PostErr(eDiag_Error, eErr_SEQ_INST_HistAssemblyMissing, "TPA record " + (*k)->AsFastaString() + " should have Seq-hist.assembly for PRIMARY block", seq); } // Fall thru case CSeq_id::e_Genbank: case CSeq_id::e_Embl: case CSeq_id::e_Ddbj: if ( tsid && tsid->IsSetAccession() ) { const string& acc = tsid->GetAccession(); unsigned int num_digits = 0; unsigned int num_letters = 0; bool letter_after_digit = false; bool bad_id_chars = false; is_wgs = acc.length() == 12 || acc.length() == 13; ITERATE(string, s, acc) { if (isupper(*s)) { num_letters++; if (num_digits > 0) { letter_after_digit = true; } } else if (isdigit(*s)) { 
num_digits++; } else { bad_id_chars = true; } } is_gb_embl_ddbj = (**k).IsGenbank() || (**k).IsEmbl() || (**k).IsDdbj(); if ( letter_after_digit || bad_id_chars ) { PostErr(eDiag_Critical, eErr_SEQ_INST_BadSeqIdFormat, "Bad accession: " + acc, seq); } else if (num_letters == 1 && num_digits == 5 && seq.IsNa()) { } else if (num_letters == 2 && num_digits == 6 && seq.IsNa()) { } else if (num_letters == 3 && num_digits == 5 && seq.IsAa()) { } else if (num_letters == 2 && num_digits == 6 && seq.IsAa() && repr == CSeq_inst::eRepr_seg) { } else if ( num_letters == 4 && (num_digits == 8 || num_digits == 9) && seq.IsNa() && is_gb_embl_ddbj ) { } else { PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, "Bad accession: " + acc, seq); } // Check for secondary conflicts if ( seq.GetFirstId() ) { ValidateSecondaryAccConflict(acc, seq, CSeqdesc::e_Genbank); ValidateSecondaryAccConflict(acc, seq, CSeqdesc::e_Embl); } if ( has_gi ) { if ( tsid->IsSetVersion() && tsid->GetVersion() == 0 ) { PostErr(eDiag_Critical, eErr_SEQ_INST_BadSeqIdFormat, "Accession " + acc + " has 0 version", seq); } } } // Fall thru case CSeq_id::e_Other: if ( tsid ) { if ( tsid->IsSetName() ) { const string& name = tsid->GetName(); ITERATE (string, s, name) { if (isspace(*s)) { PostErr(eDiag_Critical, eErr_SEQ_INST_SeqIdNameHasSpace, "Seq-id.name " + name + " should be a single " "word without any spaces", seq); break; } } } if ( tsid->IsSetAccession() && (*k)->IsOther() ) { const string& acc = tsid->GetAccession(); size_t num_letters = 0; size_t num_digits = 0; size_t num_underscores = 0; bool bad_id_chars = false; bool is_NZ = (NStr::CompareNocase(acc, 0, 3, "NZ_") == 0); size_t i = is_NZ ? 
3 : 0; bool letter_after_digit = false; for ( ; i < acc.length(); ++i ) { if ( isupper(acc[i]) ) { num_letters++; } else if ( isdigit(acc[i]) ) { num_digits++; } else if ( acc[i] == '_' ) { num_underscores++; if ( num_digits > 0 || num_underscores > 1 ) { letter_after_digit = true; } } else { bad_id_chars = true; } } if ( letter_after_digit || bad_id_chars ) { PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, "Bad accession " + acc, seq); } else if ( is_NZ && num_letters == 4 && num_digits == 8 && num_underscores == 0 ) { // valid accession - do nothing! } else if ( num_letters == 2 && (num_digits == 6 || num_digits == 8 || num_digits == 9) && num_underscores == 1 ) { // valid accession - do nothing! } else { PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, "Bad accession " + acc, seq); } } if ( has_gi && !tsid->IsSetAccession() && tsid->IsSetName() ) { EDiagSev sev = eDiag_Critical; // Report ddbj segmented sequence missing accesions as // warning, not critical. if ( (*k)->IsDdbj() && repr == CSeq_inst::eRepr_seg ) { sev = eDiag_Warning; } PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "Missing accession for " + tsid->GetName(), seq); } } // Fall thru case CSeq_id::e_Pir: case CSeq_id::e_Swissprot: case CSeq_id::e_Prf: if ( tsid ) { if ( seq.IsNa() && (!tsid->IsSetAccession() || tsid->GetAccession().empty())) { if ( repr != CSeq_inst::eRepr_seg || m_Imp.IsGI()) { if (!(**k).IsDdbj() || repr != CSeq_inst::eRepr_seg) { CNcbiOstrstream os; os << "Missing accession for " << (**k).DumpAsFasta(); PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, string(os.str()), seq); } } } accn_count++; } else { PostErr(eDiag_Critical, eErr_SEQ_INST_BadSeqIdFormat, "Seq-id type not handled", seq); } break; case CSeq_id::e_Patent: break; case CSeq_id::e_Pdb: break;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?