seqtitle.cpp

来自「ncbi源码」· C++ 代码 · 共 1,141 行 · 第 1/3 页

CPP
1,141
字号
/*
 * ===========================================================================
 * PRODUCTION $Log: seqtitle.cpp,v $
 * PRODUCTION Revision 1000.3  2004/06/01 19:25:27  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.37
 * PRODUCTION
 * ===========================================================================
 */

/*  $Id: seqtitle.cpp,v 1000.3 2004/06/01 19:25:27 gouriano Exp $
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Author:  Aaron Ucko
*
* File Description:
*   Obtains or constructs a sequence's title.  
(Corresponds to*   CreateDefLine in the C toolkit.)*/#include <ncbi_pch.hpp>#include <serial/iterator.hpp>#include <objects/biblio/Id_pat.hpp>#include <objects/general/Dbtag.hpp>#include <objects/general/Object_id.hpp>#include <objects/seq/Bioseq.hpp>#include <objects/seq/Delta_ext.hpp>#include <objects/seq/Delta_seq.hpp>#include <objects/seq/MolInfo.hpp>#include <objects/seq/Seg_ext.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seq_ext.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seq_literal.hpp>#include <objects/seqblock/EMBL_block.hpp>#include <objects/seqblock/GB_block.hpp>#include <objects/seqblock/PDB_block.hpp>#include <objects/seqfeat/BioSource.hpp>#include <objects/seqfeat/Gene_ref.hpp>#include <objects/seqfeat/OrgMod.hpp>#include <objects/seqfeat/OrgName.hpp>#include <objects/seqfeat/Org_ref.hpp>#include <objects/seqfeat/Prot_ref.hpp>#include <objects/seqfeat/Seq_feat.hpp>#include <objects/seqfeat/SubSource.hpp>#include <objects/seqloc/PDB_seq_id.hpp>#include <objects/seqloc/PDB_seq_id.hpp>#include <objects/seqloc/Patent_seq_id.hpp>#include <objects/seqloc/Seq_id.hpp>#include <objects/seqloc/Seq_loc.hpp>#include <objects/seqloc/Seq_loc_mix.hpp>#include <objects/seqloc/Textseq_id.hpp>#include <objects/seqset/Seq_entry.hpp>#include <objmgr/scope.hpp>#include <objmgr/seqdesc_ci.hpp>#include <objmgr/feat_ci.hpp>#include <objmgr/util/feature.hpp>#include <objmgr/util/sequence.hpp>BEGIN_NCBI_SCOPEBEGIN_SCOPE(objects)BEGIN_SCOPE(sequence)static string s_TitleFromBioSource (const CBioSource&    source,                                    const string&        suffix = kEmptyStr);static string s_TitleFromChromosome(const CBioSource&    source,                                    const CMolInfo&      mol_info);static string s_TitleFromProtein   (const CBioseq_Handle& handle,                                          CScope&        scope,                                          string&        organism);static string s_TitleFromSegment   
(const CBioseq_Handle& handle,                                          CScope&        scope);                                          string GetTitle(const CBioseq_Handle& hnd, TGetTitleFlags flags){    string                    prefix, title, suffix;    string                    organism;    CBioseq_Handle::TBioseqCore core        = hnd.GetBioseqCore();    CConstRef<CTextseq_id>    tsid(NULL);    CConstRef<CPDB_seq_id>    pdb_id(NULL);    CConstRef<CPatent_seq_id> pat_id(NULL);    CConstRef<CDbtag>         general_id(NULL);    CConstRef<CBioSource>     source(NULL);    CConstRef<CMolInfo>       mol_info(NULL);    bool                      third_party = false;    bool                      is_nc       = false;    bool                      is_nm       = false;    bool                      is_nr       = false;    bool                      wgs_master  = false;    CMolInfo::TTech           tech        = CMolInfo::eTech_unknown;    bool                      htg_tech    = false;    bool                      use_biosrc  = false;    ITERATE (CBioseq::TId, id, core->GetId()) {        if ( !tsid ) {            tsid = (*id)->GetTextseq_Id();        }        switch ((*id)->Which()) {        case CSeq_id::e_Other:        case CSeq_id::e_Genbank:        case CSeq_id::e_Embl:        case CSeq_id::e_Ddbj:        {            const CTextseq_id& t = *(*id)->GetTextseq_Id();            if (t.IsSetAccession()) {                const string& acc = t.GetAccession();                CSeq_id::EAccessionInfo type = CSeq_id::IdentifyAccession(acc);                if ((type & CSeq_id::eAcc_division_mask) == CSeq_id::eAcc_wgs                    &&  NStr::EndsWith(acc, "000000")) {                    wgs_master = true;                } else if (type == CSeq_id::eAcc_refseq_chromosome) {                    is_nc = true;                } else if (type == CSeq_id::eAcc_refseq_mrna) {                    is_nm = true;                } else if (type == CSeq_id::eAcc_refseq_ncrna) {                   
 is_nr = true;                }            }            break;        }        case CSeq_id::e_General:            general_id = &(*id)->GetGeneral();            break;        case CSeq_id::e_Tpg:        case CSeq_id::e_Tpe:        case CSeq_id::e_Tpd:            third_party = true;            break;        case CSeq_id::e_Pdb:            pdb_id = &(*id)->GetPdb();            break;        case CSeq_id::e_Patent:            pat_id = &(*id)->GetPatent();            break;        default:            break;        }    }        {        CSeqdesc_CI it(hnd, CSeqdesc::e_Source);        if (it) {            source = &it->GetSource();        }    }    {        CSeqdesc_CI it(hnd, CSeqdesc::e_Molinfo);        if (it) {            mol_info = &it->GetMolinfo();            tech = mol_info->GetTech();        }    }    switch (tech) {    case CMolInfo::eTech_htgs_0:    case CMolInfo::eTech_htgs_1:    case CMolInfo::eTech_htgs_2:        // manufacture all titles for unfinished HTG sequences        flags |= fGetTitle_Reconstruct;        // fall through    case CMolInfo::eTech_htgs_3:        htg_tech = true;        // fall through    case CMolInfo::eTech_est:    case CMolInfo::eTech_sts:    case CMolInfo::eTech_survey:    case CMolInfo::eTech_wgs:        use_biosrc = true;    default:        break;    }    if (!(flags & fGetTitle_Reconstruct)) {        // Ignore parents' titles for non-PDB proteins.        
if (core->GetInst().GetMol() == CSeq_inst::eMol_aa            &&  pdb_id.IsNull()) {            // Sun Workshop compiler does not call destructors of objects            // created in for-loop initializers in case we use break to exit the loop            // (08-apr-2002)            CTypeConstIterator<CSeqdesc> it = ConstBegin(*core);            for (; it;  ++it) {                if (it->IsTitle()) {                    title = it->GetTitle();                    BREAK(it);                }            }        } else {            {                CSeqdesc_CI it(hnd, CSeqdesc::e_Title);                if (it) {                    title = it->GetTitle();                }            }        }    }    if (title.empty()  &&  use_biosrc  &&  source.NotEmpty()) {        if (tech == CMolInfo::eTech_wgs  &&  !wgs_master            &&  general_id.NotEmpty()  &&  general_id->GetTag().IsStr()) {            title = s_TitleFromBioSource(*source,                                         general_id->GetTag().GetStr());        } else {            title = s_TitleFromBioSource(*source);        }        flags &= ~fGetTitle_Organism;    }    if (title.empty()  &&  is_nc  &&  source.NotEmpty()) {        switch (mol_info->GetBiomol()) {        case CMolInfo::eBiomol_genomic:        case CMolInfo::eBiomol_other_genetic:            title = s_TitleFromChromosome(*source, *mol_info);            if (!title.empty()) {                flags &= ~fGetTitle_Organism;            }            break;        }    } else if (title.empty()  &&  is_nm  &&  source.NotEmpty()) {        unsigned int         genes = 0, cdregions = 0, prots = 0;        CConstRef<CSeq_feat> gene(0),   cdregion(0);        for (CFeat_CI it(hnd, 0, 0, CSeqFeatData::e_not_set);             it;  ++it) {            switch (it->GetData().Which()) {            case CSeqFeatData::e_Gene:                ++genes;                gene.Reset(&it->GetMappedFeature());                break;            case CSeqFeatData::e_Cdregion:                
++cdregions;                cdregion.Reset(&it->GetMappedFeature());                break;            case CSeqFeatData::e_Prot:                ++prots;                break;            default:                break;            }        }        if (genes == 1  &&  cdregions == 1  // &&  prots >= 1            &&  source->GetOrg().IsSetTaxname()) {            title = source->GetOrg().GetTaxname() + ' ';            feature::GetLabel(*cdregion, &title, feature::eContent,                              &hnd.GetScope());            title += " (";            feature::GetLabel(*gene, &title, feature::eContent,                              &hnd.GetScope());            title += "), mRNA";        }    } else if (title.empty()  &&  is_nr  &&  source.NotEmpty()               &&  source->GetOrg().IsSetTaxname()) {        for (CTypeConstIterator<CSeq_feat> it(hnd.GetTopLevelSeqEntry());             it;  ++it) {            if (it->GetData().IsGene()) {                title = source->GetOrg().GetTaxname() + ' ';                feature::GetLabel(*it, &title, feature::eContent);                title += ", ";                switch (mol_info->GetBiomol()) {                case CMolInfo::eBiomol_pre_RNA: title += "precursorRNA"; break;                case CMolInfo::eBiomol_mRNA:    title += "mRNA";         break;                case CMolInfo::eBiomol_rRNA:    title += "rRNA";         break;                case CMolInfo::eBiomol_tRNA:    title += "tRNA";         break;                case CMolInfo::eBiomol_snRNA:   title += "snRNA";        break;                case CMolInfo::eBiomol_scRNA:   title += "scRNA";        break;                case CMolInfo::eBiomol_cRNA:    title += "cRNA";         break;                case CMolInfo::eBiomol_snoRNA:  title += "snoRNA";       break;                default:                        title += "miscRNA";      break;                }                BREAK(it);            }        }    }    // originally further down, but moved up to match the C 
version    while (NStr::EndsWith(title, ".")  ||  NStr::EndsWith(title, " ")) {        title.erase(title.end() - 1);    }    if (title.empty()  &&  pdb_id.NotEmpty()) {        CSeqdesc_CI it(hnd, CSeqdesc::e_Pdb);        for (;  it;  ++it) {            if ( !it->GetPdb().GetCompound().empty() ) {                if (isprint(pdb_id->GetChain())) {                    title = string("Chain ") + (char)pdb_id->GetChain() + ", ";                }                title += it->GetPdb().GetCompound().front();                BREAK(it);            }        }    }    if (title.empty()  &&  pat_id.NotEmpty()) {        title = "Sequence " + NStr::IntToString(pat_id->GetSeqid())            + " from Patent " + pat_id->GetCit().GetCountry()            + ' ' + pat_id->GetCit().GetId().GetNumber();    }    if (title.empty()  &&  core->GetInst().GetMol() == CSeq_inst::eMol_aa) {        title = s_TitleFromProtein(hnd, hnd.GetScope(), organism);        if ( !title.empty() ) {            flags |= fGetTitle_Organism;        }    }    if (title.empty()  &&  !htg_tech        &&  core->GetInst().GetRepr() == CSeq_inst::eRepr_seg) {        title = s_TitleFromSegment(hnd, hnd.GetScope());    }    if (title.empty()  &&  !htg_tech  &&  source.NotEmpty()) {        title = s_TitleFromBioSource(*source);        if (title.empty()) {            title = "No definition line found";        }    }    if (third_party  &&  !title.empty()        &&  !NStr::StartsWith(title, "TPA: ", NStr::eNocase)) {        prefix += "TPA: ";    }    switch (tech) {    case CMolInfo::eTech_htgs_0:        if (title.find("LOW-PASS") == NPOS) {            suffix = ", LOW-PASS SEQUENCE SAMPLING";        }        break;    case CMolInfo::eTech_htgs_1:    case CMolInfo::eTech_htgs_2:    {        bool is_draft  = false;        bool cancelled = false;        const CGB_block::TKeywords* keywords = 0;        for (CSeqdesc_CI gb(hnd, CSeqdesc::e_Genbank);  gb;  ++gb) {            if (gb->GetGenbank().IsSetKeywords()) {                
keywords = &gb->GetGenbank().GetKeywords();            }            BREAK(gb);        }        if ( !keywords ) {            for (CSeqdesc_CI embl(hnd, CSeqdesc::e_Embl);  embl;  ++embl) {                if (embl->GetEmbl().IsSetKeywords()) {                    keywords = &embl->GetEmbl().GetKeywords();                }                BREAK(embl);            }        }        if (keywords) {            ITERATE (CGB_block::TKeywords, it, *keywords) {                if (NStr::Compare(*it, "HTGS_DRAFT", NStr::eNocase) == 0) {                    is_draft = true;                    break;                } else if (NStr::Compare(*it, "HTGS_CANCELLED", NStr::eNocase)                           == 0) {                    cancelled = true;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?