seqtitle.cpp
来自「ncbi源码」· C++ 代码 · 共 1,141 行 · 第 1/3 页
CPP
1,141 行
/*
 * ===========================================================================
 * PRODUCTION $Log: seqtitle.cpp,v $
 * PRODUCTION Revision 1000.3 2004/06/01 19:25:27 gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.37
 * PRODUCTION
 * ===========================================================================
 */
/* $Id: seqtitle.cpp,v 1000.3 2004/06/01 19:25:27 gouriano Exp $
* ===========================================================================
*
* PUBLIC DOMAIN NOTICE
* National Center for Biotechnology Information
*
* This software/database is a "United States Government Work" under the
* terms of the United States Copyright Act. It was written as part of
* the author's official duties as a United States Government employee and
* thus cannot be copyrighted. This software/database is freely available
* to the public for use. The National Library of Medicine and the U.S.
* Government have not placed any restriction on its use or reproduction.
*
* Although all reasonable efforts have been taken to ensure the accuracy
* and reliability of the software and data, the NLM and the U.S.
* Government do not and cannot warrant the performance or results that
* may be obtained by using this software or data. The NLM and the U.S.
* Government disclaim all warranties, express or implied, including
* warranties of performance, merchantability or fitness for any particular
* purpose.
*
* Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Author: Aaron Ucko
*
* File Description:
* Obtains or constructs a sequence's title.
(Corresponds to* CreateDefLine in the C toolkit.)*/#include <ncbi_pch.hpp>#include <serial/iterator.hpp>#include <objects/biblio/Id_pat.hpp>#include <objects/general/Dbtag.hpp>#include <objects/general/Object_id.hpp>#include <objects/seq/Bioseq.hpp>#include <objects/seq/Delta_ext.hpp>#include <objects/seq/Delta_seq.hpp>#include <objects/seq/MolInfo.hpp>#include <objects/seq/Seg_ext.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seq_ext.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seq_literal.hpp>#include <objects/seqblock/EMBL_block.hpp>#include <objects/seqblock/GB_block.hpp>#include <objects/seqblock/PDB_block.hpp>#include <objects/seqfeat/BioSource.hpp>#include <objects/seqfeat/Gene_ref.hpp>#include <objects/seqfeat/OrgMod.hpp>#include <objects/seqfeat/OrgName.hpp>#include <objects/seqfeat/Org_ref.hpp>#include <objects/seqfeat/Prot_ref.hpp>#include <objects/seqfeat/Seq_feat.hpp>#include <objects/seqfeat/SubSource.hpp>#include <objects/seqloc/PDB_seq_id.hpp>#include <objects/seqloc/PDB_seq_id.hpp>#include <objects/seqloc/Patent_seq_id.hpp>#include <objects/seqloc/Seq_id.hpp>#include <objects/seqloc/Seq_loc.hpp>#include <objects/seqloc/Seq_loc_mix.hpp>#include <objects/seqloc/Textseq_id.hpp>#include <objects/seqset/Seq_entry.hpp>#include <objmgr/scope.hpp>#include <objmgr/seqdesc_ci.hpp>#include <objmgr/feat_ci.hpp>#include <objmgr/util/feature.hpp>#include <objmgr/util/sequence.hpp>BEGIN_NCBI_SCOPEBEGIN_SCOPE(objects)BEGIN_SCOPE(sequence)static string s_TitleFromBioSource (const CBioSource& source, const string& suffix = kEmptyStr);static string s_TitleFromChromosome(const CBioSource& source, const CMolInfo& mol_info);static string s_TitleFromProtein (const CBioseq_Handle& handle, CScope& scope, string& organism);static string s_TitleFromSegment (const CBioseq_Handle& handle, CScope& scope); string GetTitle(const CBioseq_Handle& hnd, TGetTitleFlags flags){ string prefix, title, suffix; string organism; CBioseq_Handle::TBioseqCore 
core = hnd.GetBioseqCore(); CConstRef<CTextseq_id> tsid(NULL); CConstRef<CPDB_seq_id> pdb_id(NULL); CConstRef<CPatent_seq_id> pat_id(NULL); CConstRef<CDbtag> general_id(NULL); CConstRef<CBioSource> source(NULL); CConstRef<CMolInfo> mol_info(NULL); bool third_party = false; bool is_nc = false; bool is_nm = false; bool is_nr = false; bool wgs_master = false; CMolInfo::TTech tech = CMolInfo::eTech_unknown; bool htg_tech = false; bool use_biosrc = false; ITERATE (CBioseq::TId, id, core->GetId()) { if ( !tsid ) { tsid = (*id)->GetTextseq_Id(); } switch ((*id)->Which()) { case CSeq_id::e_Other: case CSeq_id::e_Genbank: case CSeq_id::e_Embl: case CSeq_id::e_Ddbj: { const CTextseq_id& t = *(*id)->GetTextseq_Id(); if (t.IsSetAccession()) { const string& acc = t.GetAccession(); CSeq_id::EAccessionInfo type = CSeq_id::IdentifyAccession(acc); if ((type & CSeq_id::eAcc_division_mask) == CSeq_id::eAcc_wgs && NStr::EndsWith(acc, "000000")) { wgs_master = true; } else if (type == CSeq_id::eAcc_refseq_chromosome) { is_nc = true; } else if (type == CSeq_id::eAcc_refseq_mrna) { is_nm = true; } else if (type == CSeq_id::eAcc_refseq_ncrna) { is_nr = true; } } break; } case CSeq_id::e_General: general_id = &(*id)->GetGeneral(); break; case CSeq_id::e_Tpg: case CSeq_id::e_Tpe: case CSeq_id::e_Tpd: third_party = true; break; case CSeq_id::e_Pdb: pdb_id = &(*id)->GetPdb(); break; case CSeq_id::e_Patent: pat_id = &(*id)->GetPatent(); break; default: break; } } { CSeqdesc_CI it(hnd, CSeqdesc::e_Source); if (it) { source = &it->GetSource(); } } { CSeqdesc_CI it(hnd, CSeqdesc::e_Molinfo); if (it) { mol_info = &it->GetMolinfo(); tech = mol_info->GetTech(); } } switch (tech) { case CMolInfo::eTech_htgs_0: case CMolInfo::eTech_htgs_1: case CMolInfo::eTech_htgs_2: // manufacture all titles for unfinished HTG sequences flags |= fGetTitle_Reconstruct; // fall through case CMolInfo::eTech_htgs_3: htg_tech = true; // fall through case CMolInfo::eTech_est: case CMolInfo::eTech_sts: case 
CMolInfo::eTech_survey: case CMolInfo::eTech_wgs: use_biosrc = true; default: break; } if (!(flags & fGetTitle_Reconstruct)) { // Ignore parents' titles for non-PDB proteins. if (core->GetInst().GetMol() == CSeq_inst::eMol_aa && pdb_id.IsNull()) { // Sun Workshop compiler does not call destructors of objects // created in for-loop initializers in case we use break to exit the loop // (08-apr-2002) CTypeConstIterator<CSeqdesc> it = ConstBegin(*core); for (; it; ++it) { if (it->IsTitle()) { title = it->GetTitle(); BREAK(it); } } } else { { CSeqdesc_CI it(hnd, CSeqdesc::e_Title); if (it) { title = it->GetTitle(); } } } } if (title.empty() && use_biosrc && source.NotEmpty()) { if (tech == CMolInfo::eTech_wgs && !wgs_master && general_id.NotEmpty() && general_id->GetTag().IsStr()) { title = s_TitleFromBioSource(*source, general_id->GetTag().GetStr()); } else { title = s_TitleFromBioSource(*source); } flags &= ~fGetTitle_Organism; } if (title.empty() && is_nc && source.NotEmpty()) { switch (mol_info->GetBiomol()) { case CMolInfo::eBiomol_genomic: case CMolInfo::eBiomol_other_genetic: title = s_TitleFromChromosome(*source, *mol_info); if (!title.empty()) { flags &= ~fGetTitle_Organism; } break; } } else if (title.empty() && is_nm && source.NotEmpty()) { unsigned int genes = 0, cdregions = 0, prots = 0; CConstRef<CSeq_feat> gene(0), cdregion(0); for (CFeat_CI it(hnd, 0, 0, CSeqFeatData::e_not_set); it; ++it) { switch (it->GetData().Which()) { case CSeqFeatData::e_Gene: ++genes; gene.Reset(&it->GetMappedFeature()); break; case CSeqFeatData::e_Cdregion: ++cdregions; cdregion.Reset(&it->GetMappedFeature()); break; case CSeqFeatData::e_Prot: ++prots; break; default: break; } } if (genes == 1 && cdregions == 1 // && prots >= 1 && source->GetOrg().IsSetTaxname()) { title = source->GetOrg().GetTaxname() + ' '; feature::GetLabel(*cdregion, &title, feature::eContent, &hnd.GetScope()); title += " ("; feature::GetLabel(*gene, &title, feature::eContent, &hnd.GetScope()); title += "), 
mRNA"; } } else if (title.empty() && is_nr && source.NotEmpty() && source->GetOrg().IsSetTaxname()) { for (CTypeConstIterator<CSeq_feat> it(hnd.GetTopLevelSeqEntry()); it; ++it) { if (it->GetData().IsGene()) { title = source->GetOrg().GetTaxname() + ' '; feature::GetLabel(*it, &title, feature::eContent); title += ", "; switch (mol_info->GetBiomol()) { case CMolInfo::eBiomol_pre_RNA: title += "precursorRNA"; break; case CMolInfo::eBiomol_mRNA: title += "mRNA"; break; case CMolInfo::eBiomol_rRNA: title += "rRNA"; break; case CMolInfo::eBiomol_tRNA: title += "tRNA"; break; case CMolInfo::eBiomol_snRNA: title += "snRNA"; break; case CMolInfo::eBiomol_scRNA: title += "scRNA"; break; case CMolInfo::eBiomol_cRNA: title += "cRNA"; break; case CMolInfo::eBiomol_snoRNA: title += "snoRNA"; break; default: title += "miscRNA"; break; } BREAK(it); } } } // originally further down, but moved up to match the C version while (NStr::EndsWith(title, ".") || NStr::EndsWith(title, " ")) { title.erase(title.end() - 1); } if (title.empty() && pdb_id.NotEmpty()) { CSeqdesc_CI it(hnd, CSeqdesc::e_Pdb); for (; it; ++it) { if ( !it->GetPdb().GetCompound().empty() ) { if (isprint(pdb_id->GetChain())) { title = string("Chain ") + (char)pdb_id->GetChain() + ", "; } title += it->GetPdb().GetCompound().front(); BREAK(it); } } } if (title.empty() && pat_id.NotEmpty()) { title = "Sequence " + NStr::IntToString(pat_id->GetSeqid()) + " from Patent " + pat_id->GetCit().GetCountry() + ' ' + pat_id->GetCit().GetId().GetNumber(); } if (title.empty() && core->GetInst().GetMol() == CSeq_inst::eMol_aa) { title = s_TitleFromProtein(hnd, hnd.GetScope(), organism); if ( !title.empty() ) { flags |= fGetTitle_Organism; } } if (title.empty() && !htg_tech && core->GetInst().GetRepr() == CSeq_inst::eRepr_seg) { title = s_TitleFromSegment(hnd, hnd.GetScope()); } if (title.empty() && !htg_tech && source.NotEmpty()) { title = s_TitleFromBioSource(*source); if (title.empty()) { title = "No definition line found"; } } 
if (third_party && !title.empty() && !NStr::StartsWith(title, "TPA: ", NStr::eNocase)) { prefix += "TPA: "; } switch (tech) { case CMolInfo::eTech_htgs_0: if (title.find("LOW-PASS") == NPOS) { suffix = ", LOW-PASS SEQUENCE SAMPLING"; } break; case CMolInfo::eTech_htgs_1: case CMolInfo::eTech_htgs_2: { bool is_draft = false; bool cancelled = false; const CGB_block::TKeywords* keywords = 0; for (CSeqdesc_CI gb(hnd, CSeqdesc::e_Genbank); gb; ++gb) { if (gb->GetGenbank().IsSetKeywords()) { keywords = &gb->GetGenbank().GetKeywords(); } BREAK(gb); } if ( !keywords ) { for (CSeqdesc_CI embl(hnd, CSeqdesc::e_Embl); embl; ++embl) { if (embl->GetEmbl().IsSetKeywords()) { keywords = &embl->GetEmbl().GetKeywords(); } BREAK(embl); } } if (keywords) { ITERATE (CGB_block::TKeywords, it, *keywords) { if (NStr::Compare(*it, "HTGS_DRAFT", NStr::eNocase) == 0) { is_draft = true; break; } else if (NStr::Compare(*it, "HTGS_CANCELLED", NStr::eNocase) == 0) { cancelled = true;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?