首页 › 资源下载 › 生物技术 › ncbi源码 › 源码查看
sequence_set.cpp

来自「ncbi源码」· C++ 代码 · 共 881 行 · 第 1/3 页
CPP
881 行
/* * =========================================================================== * PRODUCTION $Log: sequence_set.cpp,v $ * PRODUCTION Revision 1000.4  2004/06/01 18:29:07  gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.68 * PRODUCTION * =========================================================================== *//*  $Id: sequence_set.cpp,v 1000.4 2004/06/01 18:29:07 gouriano Exp $* ===========================================================================**                            PUBLIC DOMAIN NOTICE*               National Center for Biotechnology Information**  This software/database is a "United States Government Work" under the*  terms of the United States Copyright Act.  It was written as part of*  the author's official duties as a United States Government employee and*  thus cannot be copyrighted.  This software/database is freely available*  to the public for use. The National Library of Medicine and the U.S.*  Government have not placed any restriction on its use or reproduction.**  Although all reasonable efforts have been taken to ensure the accuracy*  and reliability of the software and data, the NLM and the U.S.*  Government do not and cannot warrant the performance or results that*  may be obtained by using this software or data. The NLM and the U.S.*  Government disclaim all warranties, express or implied, including*  warranties of performance, merchantability or fitness for any particular*  purpose.**  Please cite the author in any work or product based on this material.** ===========================================================================** Authors:  Paul Thiessen** File Description:*      Classes to hold sets of sequences** ===========================================================================*/#ifdef _MSC_VER#pragma warning(disable:4018)   // disable signed/unsigned mismatch warning in MSVC#endif#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <corelib/ncbistre.hpp>#include <corelib/ncbistl.hpp>#include <objects/seqloc/Seq_id.hpp>#include <objects/seqloc/PDB_seq_id.hpp>#include <objects/seqloc/PDB_mol_id.hpp>#include <objects/general/Object_id.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seq_data.hpp>#include <objects/seq/NCBIeaa.hpp>#include <objects/seq/IUPACaa.hpp>#include <objects/seq/NCBIstdaa.hpp>#include <objects/seq/NCBI4na.hpp>#include <objects/seq/NCBI8na.hpp>#include <objects/seq/NCBI2na.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/Seq_annot.hpp>#include <objects/general/Dbtag.hpp>#include <objects/seqloc/Textseq_id.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seqdesc.hpp>#include <objects/seqblock/PDB_block.hpp>#include <objects/seqfeat/BioSource.hpp>#include <objects/seqfeat/Org_ref.hpp>#include <regex.h>  // regex from C-toolkit#include "sequence_set.hpp"#include "molecule.hpp"#include "structure_set.hpp"#include "cn3d_tools.hpp"#include "molecule_identifier.hpp"#include "messenger.hpp"USING_NCBI_SCOPE;USING_SCOPE(objects);BEGIN_SCOPE(Cn3D)static void UnpackSeqSet(CBioseq_set& bss, SequenceSet *parent, SequenceSet::SequenceList& seqlist){    CBioseq_set::TSeq_set::iterator q, qe = bss.SetSeq_set().end();    for (q=bss.SetSeq_set().begin(); q!=qe; ++q) {        if (q->GetObject().IsSeq()) {            // only store amino acid or nucleotide sequences            if (q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_aa &&                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_dna &&                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_rna &&                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_na)                continue;            const Sequence *sequence = new Sequence(parent, q->GetObject().SetSeq());            seqlist.push_back(sequence);        } else { // Bioseq-set            UnpackSeqSet(q->GetObject().SetSet(), parent, seqlist);        }    }}static void UnpackSeqEntry(CSeq_entry& seqEntry, SequenceSet *parent, SequenceSet::SequenceList& seqlist){    if (seqEntry.IsSeq()) {        const Sequence *sequence = new Sequence(parent, seqEntry.SetSeq());        seqlist.push_back(sequence);    } else { // Bioseq-set        UnpackSeqSet(seqEntry.SetSet(), parent, seqlist);    }}SequenceSet::SequenceSet(StructureBase *parent, SeqEntryList& seqEntries) :    StructureBase(parent){    SeqEntryList::iterator s, se = seqEntries.end();    for (s=seqEntries.begin(); s!=se; ++s)        UnpackSeqEntry(s->GetObject(), this, sequences);    TRACEMSG("number of sequences: " << sequences.size());}#define FIRSTOF2(byte) (((byte) & 0xF0) >> 4)#define SECONDOF2(byte) ((byte) & 0x0F)static void StringFrom4na(const vector< char >& vec, string *str, bool isDNA){    if (SECONDOF2(vec.back()) > 0)        str->resize(vec.size() * 2);    else        str->resize(vec.size() * 2 - 1);    // first, extract 4-bit values    int i;    for (i=0; i<vec.size(); ++i) {        str->at(2*i) = FIRSTOF2(vec[i]);        if (SECONDOF2(vec[i]) > 0) str->at(2*i + 1) = SECONDOF2(vec[i]);    }    // then convert 4-bit values to ascii characters    for (i=0; i<str->size(); ++i) {        switch (str->at(i)) {            case 1: str->at(i) = 'A'; break;            case 2: str->at(i) = 'C'; break;            case 4: str->at(i) = 'G'; break;            case 8: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;            default:                str->at(i) = 'X';        }    }}#define FIRSTOF4(byte) (((byte) & 0xC0) >> 6)#define SECONDOF4(byte) (((byte) & 0x30) >> 4)#define THIRDOF4(byte) (((byte) & 0x0C) >> 2)#define FOURTHOF4(byte) ((byte) & 0x03)static void StringFrom2na(const vector< char >& vec, string *str, bool isDNA){    str->resize(vec.size() * 4);    // first, extract 4-bit values    int i;    for (i=0; i<vec.size(); ++i) {        str->at(4*i) = FIRSTOF4(vec[i]);        str->at(4*i + 1) = SECONDOF4(vec[i]);        str->at(4*i + 2) = THIRDOF4(vec[i]);        str->at(4*i + 3) = FOURTHOF4(vec[i]);    }    // then convert 4-bit values to ascii characters    for (i=0; i<str->size(); ++i) {        switch (str->at(i)) {            case 0: str->at(i) = 'A'; break;            case 1: str->at(i) = 'C'; break;            case 2: str->at(i) = 'G'; break;            case 3: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;        }    }}static void StringFromStdaa(const vector < char >& vec, string *str){    static const char *stdaaMap = "-ABCDEFGHIKLMNPQRSTVWXYZU*";    str->resize(vec.size());    for (int i=0; i<vec.size(); ++i)        str->at(i) = stdaaMap[vec[i]];}Sequence::Sequence(SequenceSet *parent, ncbi::objects::CBioseq& bioseq) :    StructureBase(parent), bioseqASN(&bioseq), molecule(NULL), isProtein(false){    int gi = MoleculeIdentifier::VALUE_NOT_SET, mmdbID = MoleculeIdentifier::VALUE_NOT_SET,        pdbChain = MoleculeIdentifier::VALUE_NOT_SET;    string pdbID, accession;    // get Seq-id info    CBioseq::TId::const_iterator s, se = bioseq.GetId().end();    for (s=bioseq.GetId().begin(); s!=se; ++s) {        if ((*s)->IsGi()) {            gi = (*s)->GetGi();        } else if ((*s)->IsPdb()) {            pdbID = (*s)->GetPdb().GetMol().Get();            if ((*s)->GetPdb().IsSetChain())                pdbChain = (*s)->GetPdb().GetChain();            else                pdbChain = ' ';        } else if ((*s)->IsLocal() && (*s)->GetLocal().IsStr()) {            accession = (*s)->GetLocal().GetStr();            // special case where local accession is actually a PDB chain + extra stuff            if (pdbID.size() == 0 && accession.size() >= 7 &&                    accession[4] == ' ' && accession[6] == ' ' && isalpha(accession[5])) {                pdbID = accession.substr(0, 4);                pdbChain = accession[5];                accession.erase();            }        } else if ((*s)->IsGenbank() && (*s)->GetGenbank().IsSetAccession()) {            accession = (*s)->GetGenbank().GetAccession();        } else if ((*s)->IsSwissprot() && (*s)->GetSwissprot().IsSetAccession()) {            accession = (*s)->GetSwissprot().GetAccession();        } else if ((*s)->IsOther() && (*s)->GetOther().IsSetAccession()) {            accession = (*s)->GetOther().GetAccession();        } else if ((*s)->IsEmbl() && (*s)->GetEmbl().IsSetAccession()) {            accession = (*s)->GetEmbl().GetAccession();        } else if ((*s)->IsDdbj() && (*s)->GetDdbj().IsSetAccession()) {            accession = (*s)->GetDdbj().GetAccession();        }    }    if (gi == MoleculeIdentifier::VALUE_NOT_SET && pdbID.size() == 0 && accession.size() == 0) {        ERRORMSG("Sequence::Sequence() - can't parse SeqId");        return;    }    if (bioseq.IsSetDescr()) {        string defline, taxid;        CSeq_descr::Tdata::const_iterator d, de = bioseq.GetDescr().Get().end();        for (d=bioseq.GetDescr().Get().begin(); d!=de; ++d) {            // get "defline" from title or compound            if ((*d)->IsTitle()) {              // prefer title over compound                defline = (*d)->GetTitle();            } else if (defline.size() == 0 && (*d)->IsPdb() && (*d)->GetPdb().GetCompound().size() > 0) {                defline = (*d)->GetPdb().GetCompound().front();            }            // get taxonomy            if ((*d)->IsSource()) {                if ((*d)->GetSource().GetOrg().IsSetTaxname())                    taxid = (*d)->GetSource().GetOrg().GetTaxname();                else if ((*d)->GetSource().GetOrg().IsSetCommon())                    taxid = (*d)->GetSource().GetOrg().GetCommon();            }        }        if (taxid.size() > 0)            description = string("[") + taxid + ']';        if (defline.size() > 0) {            if (taxid.size() > 0) description += ' ';            description += defline;        }    }    // get link to MMDB id - mainly for CDD's where Biostrucs have to be loaded separately    if (bioseq.IsSetAnnot()) {        CBioseq::TAnnot::const_iterator a, ae = bioseq.GetAnnot().end();        for (a=bioseq.GetAnnot().begin(); a!=ae; ++a) {            if (a->GetObject().GetData().IsIds()) {                CSeq_annot::C_Data::TIds::const_iterator i, ie = a->GetObject().GetData().GetIds().end();                for (i=a->GetObject().GetData().GetIds().begin(); i!=ie; ++i) {                    if (i->GetObject().IsGeneral() &&                        i->GetObject().GetGeneral().GetDb() == "mmdb" &&                        i->GetObject().GetGeneral().GetTag().IsId()) {                        mmdbID = i->GetObject().GetGeneral().GetTag().GetId();                        break;                    }                }                if (i != ie) break;            }        }    }    if (mmdbID != MoleculeIdentifier::VALUE_NOT_SET)        TRACEMSG("sequence gi " << gi << ", PDB '" << pdbID << "' chain '" << (char) pdbChain <<            "', is from MMDB id " << mmdbID);    // get sequence string    if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_raw && bioseq.GetInst().IsSetSeq_data()) {
sequence_set.cpp - 源码说明

本页面展示了「ncbi源码」中的 sequence_set.cpp 源码文件，采用 C++ 编程语言编写，共 881 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ncbi相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?