validerror_bioseq.cpp
来自「ncbi源码」· C++ 代码 · 共 1,861 行 · 第 1/5 页
CPP
1,861 行
if (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 || tech == CMolInfo::eTech_htgs_2) { PostErr(eDiag_Warning, eErr_SEQ_INST_LongHtgsSequence, "Phase 0, 1 or 2 HTGS sequence exceeds 350kbp limit", seq); } else if (tech == CMolInfo::eTech_htgs_3) { PostErr(eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp, "Phase 3 HTGS sequence exceeds 350kbp limit", seq); } else if (tech == CMolInfo::eTech_wgs) { PostErr(eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp, "WGS sequence exceeds 350kbp limit", seq); } else { PostErr (eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp, "Length of sequence exceeds 350kbp limit", seq); } } else { PostErr (eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp, "Length of sequence exceeds 350kbp limit", seq); } }}// Assumes that seq is segmented and has Seq-ext datavoid CValidError_bioseq::ValidateSeqParts(const CBioseq& seq){ // Get parent CSeq_entry of seq and then find the next // CSeq_entry in the set. This CSeq_entry should be a CBioseq_set // of class parts. const CSeq_entry* parent = seq.GetParentEntry(); if (!parent) { return; } if ( !parent->IsSet() ) { return; } // Loop through seq_set looking for the bioseq, when found, the next // CSeq_entry should have the parts. CBioseq_set::TSeq_set::const_iterator se_parts = parent->GetSet().GetSeq_set().end(); ITERATE ( CBioseq_set::TSeq_set, it, parent->GetSet().GetSeq_set() ) { if ( parent == &(**it) ) { se_parts = ++it; break; } } if ( se_parts == parent->GetSet().GetSeq_set().end() ) { return; } // Check that se_parts is a CBioseq_set of class parts if ( !(*se_parts)->IsSet() || !(*se_parts)->GetSet().IsSetClass() || (*se_parts)->GetSet().GetClass() != CBioseq_set::eClass_parts ) { return; } const CSeg_ext::Tdata& locs = seq.GetInst().GetExt().GetSeg().Get(); const CBioseq_set::TSeq_set& parts = (*se_parts)->GetSet().GetSeq_set(); // Make sure the number of locations (excluding null locations) // match the number of parts size_t nulls = 0; ITERATE ( CSeg_ext::Tdata, loc, locs ) { if ( (*loc)->IsNull() ) { nulls++; } } if ( locs.size() - nulls < parts.size() ) { PostErr(eDiag_Error, eErr_SEQ_INST_PartsOutOfOrder, "Parts set contains too many Bioseqs", seq); return; } else if ( locs.size() - nulls > parts.size() ) { PostErr(eDiag_Error, eErr_SEQ_INST_PartsOutOfOrder, "Parts set does not contain enough Bioseqs", seq); return; } // Now, simultaneously loop through the parts of se_parts and CSeq_locs of // seq's CSseq-ext. If don't compare, post error. size_t size = locs.size(); // == parts.size() CSeg_ext::Tdata::const_iterator loc_it = locs.begin(); CBioseq_set::TSeq_set::const_iterator part_it = parts.begin(); for ( size_t i = 0; i < size; ++i ) { try { if ( (*loc_it)->IsNull() ) { continue; } if ( !(*part_it)->IsSet() ) { PostErr(eDiag_Error, eErr_SEQ_INST_PartsOutOfOrder, "Parts set component is not Bioseq", seq); return; } const CSeq_id& loc_id = GetId(**loc_it, m_Scope); if ( !IsIdIn(loc_id, (*part_it)->GetSeq()) ) { PostErr(eDiag_Error, eErr_SEQ_INST_PartsOutOfOrder, "Segmented bioseq seq_ext does not correspond to parts" "packaging order", seq); } // advance both iterators ++loc_it; ++part_it; } catch (const CNotUnique&) { ERR_POST("Seq-loc not for unique sequence"); return; } catch (...) { ERR_POST("Unknown error"); return; } }}// Assumes seq is an amino acid sequencevoid CValidError_bioseq::ValidateProteinTitle(const CBioseq& seq){ const CSeq_id* id = seq.GetFirstId(); if (!id) { return; } CBioseq_Handle hnd = m_Scope->GetBioseqHandle(*id); string title_no_recon = GetTitle(hnd); string title_recon = GetTitle(hnd, fGetTitle_Reconstruct); if ( title_no_recon != title_recon ) { PostErr(eDiag_Warning, eErr_SEQ_DESCR_InconsistentProteinTitle, "Instantiated protein title does not match automatically " "generated title. Instantiated: " + title_no_recon + " Generated: " + title_recon, seq); }}void CValidError_bioseq::ValidateNs(const CBioseq& seq){ try { const CSeq_inst& inst = seq.GetInst(); CSeq_inst::TRepr repr = inst.CanGetRepr() ? inst.GetRepr() : CSeq_inst::eRepr_not_set; CSeq_inst::TLength len = inst.CanGetLength() ? inst.GetLength() : 0; if ( (repr == CSeq_inst::eRepr_raw || (repr == CSeq_inst::eRepr_delta && x_IsDeltaLitOnly(inst))) && len > 10 ) { CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq); if ( !bsh ) { return; } CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac); EDiagSev sev = eDiag_Warning; string sequence; // check for Ns at the beginning of the sequence if ( (vec[0] == 'N') || (vec[0] == 'n') ) { // if 10 or more Ns flag as error (except for NC), // otherwise just warning vec.GetSeqData(0, 10, sequence); if ( !m_Imp.IsNC() && (NStr::CompareNocase(sequence, "NNNNNNNNNN") == 0) ) { sev = eDiag_Error; } PostErr(sev, eErr_SEQ_INST_TerminalNs, "N at beginning of sequence", seq); } // check for Ns at the end of the sequence sev = eDiag_Warning; if ( (vec[vec.size() - 1] == 'N') || (vec[vec.size() - 1] == 'n') ) { // if 10 or more Ns flag as error (except for NC), // otherwise just warning vec.GetSeqData(vec.size() - 10, vec.size() , sequence); if ( !m_Imp.IsNC() && (NStr::CompareNocase(sequence, "NNNNNNNNNN") == 0) ) { sev = eDiag_Error; } PostErr(sev, eErr_SEQ_INST_TerminalNs, "N at end of sequence", seq); } } } catch ( exception& ) { // just ignore, and continue with the validation process. }}// Assumes that seq is eRepr_raw or eRepr_instvoid CValidError_bioseq::ValidateRawConst(const CBioseq& seq){ const CSeq_inst& inst = seq.GetInst(); const CEnumeratedTypeValues* tv = CSeq_inst::GetTypeInfo_enum_ERepr(); const string& rpr = tv->FindName(inst.GetRepr(), true); if (inst.IsSetFuzz()) { PostErr(eDiag_Error, eErr_SEQ_INST_FuzzyLen, "Fuzzy length on " + rpr + "Bioseq", seq); } if (!inst.IsSetLength() || inst.GetLength() == 0) { string len = inst.IsSetLength() ? NStr::IntToString(inst.GetLength()) : "0"; PostErr(eDiag_Error, eErr_SEQ_INST_InvalidLen, "Invalid Bioseq length [" + len + "]", seq); } CSeq_data::E_Choice seqtyp = inst.IsSetSeq_data() ? inst.GetSeq_data().Which() : CSeq_data::e_not_set; switch (seqtyp) { case CSeq_data::e_Iupacna: case CSeq_data::e_Ncbi2na: case CSeq_data::e_Ncbi4na: case CSeq_data::e_Ncbi8na: case CSeq_data::e_Ncbipna: if (inst.IsAa()) { PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidAlphabet, "Using a nucleic acid alphabet on a protein sequence", seq); return; } break; case CSeq_data::e_Iupacaa: case CSeq_data::e_Ncbi8aa: case CSeq_data::e_Ncbieaa: case CSeq_data::e_Ncbipaa: case CSeq_data::e_Ncbistdaa: if (inst.IsNa()) { PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidAlphabet, "Using a protein alphabet on a nucleic acid", seq); return; } break; default: PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidAlphabet, "Sequence alphabet not set", seq); return; } bool check_alphabet = false; unsigned int factor = 1; switch (seqtyp) { case CSeq_data::e_Iupacaa: case CSeq_data::e_Iupacna: case CSeq_data::e_Ncbieaa: case CSeq_data::e_Ncbistdaa: check_alphabet = true; break; case CSeq_data::e_Ncbi8na: case CSeq_data::e_Ncbi8aa: break; case CSeq_data::e_Ncbi4na: factor = 2; break; case CSeq_data::e_Ncbi2na: factor = 4; break; case CSeq_data::e_Ncbipna: factor = 5; break; case CSeq_data::e_Ncbipaa: factor = 30; break; default: // Logically, should not occur PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidAlphabet, "Sequence alphabet not set", seq); return; } TSeqPos calc_len = inst.IsSetLength() ? inst.GetLength() : 0; switch (seqtyp) { case CSeq_data::e_Ncbipna: case CSeq_data::e_Ncbipaa: calc_len *= factor; break; default: if (calc_len % factor) { calc_len += factor; } calc_len /= factor; } string s_len = NStr::UIntToString(calc_len); TSeqPos data_len = GetDataLen(inst); string data_len_str = NStr::IntToString(data_len); if (calc_len > data_len) { PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data too short [" + data_len_str + "] for given length [" + s_len + "]", seq); return; } else if (calc_len < data_len) { PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data is larger [" + data_len_str + "] than given length [" + s_len + "]", seq); } unsigned char termination; unsigned int trailingX = 0; size_t terminations = 0; if (check_alphabet) { switch (seqtyp) { case CSeq_data::e_Iupacaa: case CSeq_data::e_Ncbieaa: termination = '*'; break; case CSeq_data::e_Ncbistdaa: termination = 25; break; default: termination = '\0'; } CSeqVector sv = m_Scope->GetBioseqHandle(seq).GetSeqVector(); size_t bad_cnt = 0; TSeqPos pos = 0; for ( CSeqVector_CI sv_iter(sv); sv_iter; ++sv_iter ) { CSeqVector::TResidue res = *sv_iter; if ( !IsResidue(res) ) { if ( ++bad_cnt > 10 ) { PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidResidue, "More than 10 invalid residues. Checking stopped", seq); return; } else { string msg = "Invalid residue ["; if ( seqtyp == CSeq_data::e_Ncbistdaa ) { msg += NStr::UIntToString(res); } else { msg += res; } msg += "] in position [" + NStr::UIntToString(pos) + "]"; PostErr(eDiag_Critical, eErr_SEQ_INST_InvalidResidue, msg, seq); } } else if ( res == termination ) { terminations++; trailingX = 0; } else if ( res == 'X' ) { trailingX++; } else { trailingX = 0; } ++pos; } if ( trailingX > 0 && !SuppressTrailingXMsg(seq) ) { // Suppress if cds ends in "*" or 3' partial string msg = "Sequence ends in " + NStr::IntToString(trailingX) + " trailing X"; if ( trailingX > 1 ) { msg += "s"; } PostErr(eDiag_Warning, eErr_SEQ_INST_TrailingX, msg, seq); } if (terminations) { // Post error indicating terminations found in protein sequence // First get gene label string glbl; seq.GetLabel(&glbl, CBioseq::eContent); if ( IsBlankString(glbl) ) {
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?