⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 su_sequence_set.cpp

📁 ncbi源码
💻 CPP
字号:
/* * =========================================================================== * PRODUCTION $Log: su_sequence_set.cpp,v $ * PRODUCTION Revision 1000.0  2004/06/01 18:14:43  gouriano * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3 * PRODUCTION * =========================================================================== *//*  $Id: su_sequence_set.cpp,v 1000.0 2004/06/01 18:14:43 gouriano Exp $* ===========================================================================**                            PUBLIC DOMAIN NOTICE*               National Center for Biotechnology Information**  This software/database is a "United States Government Work" under the*  terms of the United States Copyright Act.  It was written as part of*  the author's official duties as a United States Government employee and*  thus cannot be copyrighted.  This software/database is freely available*  to the public for use. The National Library of Medicine and the U.S.*  Government have not placed any restriction on its use or reproduction.**  Although all reasonable efforts have been taken to ensure the accuracy*  and reliability of the software and data, the NLM and the U.S.*  Government do not and cannot warrant the performance or results that*  may be obtained by using this software or data. The NLM and the U.S.*  Government disclaim all warranties, express or implied, including*  warranties of performance, merchantability or fitness for any particular*  purpose.**  Please cite the author in any work or product based on this material.** ===========================================================================** Authors:  Paul Thiessen** File Description:*      Classes to hold sets of sequences** ===========================================================================*/#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <corelib/ncbistre.hpp>#include <corelib/ncbistl.hpp>#include <vector>#include <map>#include <objects/seqloc/Seq_id.hpp>#include <objects/seqloc/PDB_seq_id.hpp>#include <objects/seqloc/PDB_mol_id.hpp>#include <objects/general/Object_id.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seq_data.hpp>#include <objects/seq/NCBIeaa.hpp>#include <objects/seq/IUPACaa.hpp>#include <objects/seq/NCBIstdaa.hpp>#include <objects/seq/NCBI4na.hpp>#include <objects/seq/NCBI8na.hpp>#include <objects/seq/NCBI2na.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/Seq_annot.hpp>#include <objects/general/Dbtag.hpp>#include <objects/seqloc/Textseq_id.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seqdesc.hpp>#include <objects/seqblock/PDB_block.hpp>#include <objects/seqfeat/BioSource.hpp>#include <objects/seqfeat/Org_ref.hpp>#include "su_sequence_set.hpp"#include "su_private.hpp"// borrow Cn3D's asn conversion functions#include "../src/app/cn3d/asn_converter.hpp"// C-toolkit stuff for BioseqPtr creation#include <objseq.h>#include <objalign.h>USING_NCBI_SCOPE;USING_SCOPE(objects);BEGIN_SCOPE(struct_util)// holds C Bioseqs associated with Sequencestypedef map < const Sequence *, Bioseq * > BioseqMap;static BioseqMap bioseqs;void AddCSeqId(SeqIdPtr *sid, const ncbi::objects::CSeq_id& cppid){    string err;    ValNode *vn = (SeqIdPtr) Cn3D::ConvertAsnFromCPPToC(cppid, (AsnReadFunc) SeqIdAsnRead, &err);    if (!vn || err.size() > 0) {        ERROR_MESSAGE("AddCSeqId() - ConvertAsnFromCPPToC() failed");        return;    }    ValNodeLink(sid, vn);}static void AddCSeqIdAll(SeqIdPtr *id, const Sequence& sequence){    CBioseq::TId::const_iterator i, ie = sequence.GetAllIdentifiers().end();    for (i=sequence.GetAllIdentifiers().begin(); i!=ie; ++i)        AddCSeqId(id, **i);}BioseqPtr GetOrCreateBioseq(const Sequence *sequence){    if (!sequence || !sequence->m_isProtein) {        ERROR_MESSAGE("GetOrCreateBioseq() - got non-protein or NULL Sequence");        return NULL;    }    // if already done    BioseqMap::const_iterator b = bioseqs.find(sequence);    if (b != bioseqs.end())        return b->second;    // create new Bioseq and fill it in from Sequence data    BioseqPtr bioseq = BioseqNew();    bioseq->mol = Seq_mol_aa;    bioseq->seq_data_type = Seq_code_ncbieaa;    bioseq->repr = Seq_repr_raw;    bioseq->length = sequence->Length();    bioseq->seq_data = BSNew(bioseq->length);    BSWrite(bioseq->seq_data, const_cast<char*>(sequence->m_sequenceString.c_str()), bioseq->length);    // create Seq-id    AddCSeqIdAll(&(bioseq->id), *sequence);    // store Bioseq    bioseqs[sequence] = bioseq;    return bioseq;}static void UnpackSeqSet(CBioseq_set& bss, SequenceSet::SequenceList& seqlist){    CBioseq_set::TSeq_set::iterator q, qe = bss.SetSeq_set().end();    for (q=bss.SetSeq_set().begin(); q!=qe; ++q) {        if (q->GetObject().IsSeq()) {            // only store amino acid or nucleotide sequences            if (q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_aa &&                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_dna &&                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_rna &&                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_na)                continue;            CRef < Sequence > sequence(new Sequence(q->GetObject().SetSeq()));            seqlist.push_back(sequence);        } else { // Bioseq-set            UnpackSeqSet(q->GetObject().SetSet(), seqlist);        }    }}static void UnpackSeqEntry(CSeq_entry& seqEntry, SequenceSet::SequenceList& seqlist){    if (seqEntry.IsSeq()) {        CRef < Sequence > sequence(new Sequence(seqEntry.SetSeq()));        seqlist.push_back(sequence);    } else { // Bioseq-set        UnpackSeqSet(seqEntry.SetSet(), seqlist);    }}SequenceSet::SequenceSet(SeqEntryList& seqEntries){    SeqEntryList::iterator s, se = seqEntries.end();    for (s=seqEntries.begin(); s!=se; ++s)        UnpackSeqEntry(s->GetObject(), m_sequences);    TRACE_MESSAGE("number of sequences: " << m_sequences.size());}#define FIRSTOF2(byte) (((byte) & 0xF0) >> 4)#define SECONDOF2(byte) ((byte) & 0x0F)static void StringFrom4na(const vector< char >& vec, string *str, bool isDNA){    if (SECONDOF2(vec.back()) > 0)        str->resize(vec.size() * 2);    else        str->resize(vec.size() * 2 - 1);    // first, extract 4-bit values    unsigned int i;    for (i=0; i<vec.size(); ++i) {        str->at(2*i) = FIRSTOF2(vec[i]);        if (SECONDOF2(vec[i]) > 0) str->at(2*i + 1) = SECONDOF2(vec[i]);    }    // then convert 4-bit values to ascii characters    for (i=0; i<str->size(); ++i) {        switch (str->at(i)) {            case 1: str->at(i) = 'A'; break;            case 2: str->at(i) = 'C'; break;            case 4: str->at(i) = 'G'; break;            case 8: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;            default:                str->at(i) = 'X';        }    }}#define FIRSTOF4(byte) (((byte) & 0xC0) >> 6)#define SECONDOF4(byte) (((byte) & 0x30) >> 4)#define THIRDOF4(byte) (((byte) & 0x0C) >> 2)#define FOURTHOF4(byte) ((byte) & 0x03)static void StringFrom2na(const vector< char >& vec, string *str, bool isDNA){    str->resize(vec.size() * 4);    // first, extract 4-bit values    unsigned int i;    for (i=0; i<vec.size(); ++i) {        str->at(4*i) = FIRSTOF4(vec[i]);        str->at(4*i + 1) = SECONDOF4(vec[i]);        str->at(4*i + 2) = THIRDOF4(vec[i]);        str->at(4*i + 3) = FOURTHOF4(vec[i]);    }    // then convert 4-bit values to ascii characters    for (i=0; i<str->size(); ++i) {        switch (str->at(i)) {            case 0: str->at(i) = 'A'; break;            case 1: str->at(i) = 'C'; break;            case 2: str->at(i) = 'G'; break;            case 3: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;        }    }}static void StringFromStdaa(const vector < char >& vec, string *str){    static const char *stdaaMap = "-ABCDEFGHIKLMNPQRSTVWXYZU*";    str->resize(vec.size());    for (unsigned int i=0; i<vec.size(); ++i)        str->at(i) = stdaaMap[vec[i]];}Sequence::Sequence(ncbi::objects::CBioseq& bioseq) :    m_bioseqASN(&bioseq), m_isProtein(false){    // fill out description    if (bioseq.IsSetDescr()) {        string defline, taxid;        CSeq_descr::Tdata::const_iterator d, de = bioseq.GetDescr().Get().end();        for (d=bioseq.GetDescr().Get().begin(); d!=de; ++d) {            // get "defline" from title or compound            if ((*d)->IsTitle()) {              // prefer title over compound                defline = (*d)->GetTitle();            } else if (defline.size() == 0 && (*d)->IsPdb() && (*d)->GetPdb().GetCompound().size() > 0) {                defline = (*d)->GetPdb().GetCompound().front();            }            // get taxonomy            if ((*d)->IsSource()) {                if ((*d)->GetSource().GetOrg().IsSetTaxname())                    taxid = (*d)->GetSource().GetOrg().GetTaxname();                else if ((*d)->GetSource().GetOrg().IsSetCommon())                    taxid = (*d)->GetSource().GetOrg().GetCommon();            }        }        if (taxid.size() > 0)            m_description = string("[") + taxid + ']';        if (defline.size() > 0) {            if (taxid.size() > 0)                m_description += ' ';            m_description += defline;        }    }    // get sequence string    if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_raw && bioseq.GetInst().IsSetSeq_data()) {        // protein formats        if (bioseq.GetInst().GetSeq_data().IsNcbieaa()) {            m_sequenceString = bioseq.GetInst().GetSeq_data().GetNcbieaa().Get();            m_isProtein = true;        } else if (bioseq.GetInst().GetSeq_data().IsIupacaa()) {            m_sequenceString = bioseq.GetInst().GetSeq_data().GetIupacaa().Get();            m_isProtein = true;        } else if (bioseq.GetInst().GetSeq_data().IsNcbistdaa()) {            StringFromStdaa(bioseq.GetInst().GetSeq_data().GetNcbistdaa().Get(), &m_sequenceString);            m_isProtein = true;        }        // nucleotide formats        else if (bioseq.GetInst().GetSeq_data().IsIupacna()) {            m_sequenceString = bioseq.GetInst().GetSeq_data().GetIupacna().Get();            // convert 'T' to 'U' for RNA            if (bioseq.GetInst().GetMol() == CSeq_inst::eMol_rna) {                for (unsigned int i=0; i<m_sequenceString.size(); ++i) {                    if (m_sequenceString[i] == 'T')                        m_sequenceString[i] = 'U';                }            }        } else if (bioseq.GetInst().GetSeq_data().IsNcbi4na()) {            StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi4na().Get(), &m_sequenceString,                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));        } else if (bioseq.GetInst().GetSeq_data().IsNcbi8na()) {  // same repr. for non-X as 4na            StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi8na().Get(), &m_sequenceString,                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));        } else if (bioseq.GetInst().GetSeq_data().IsNcbi2na()) {            StringFrom2na(bioseq.GetInst().GetSeq_data().GetNcbi2na().Get(), &m_sequenceString,                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));            if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() < m_sequenceString.length())                m_sequenceString.resize(bioseq.GetInst().GetLength());        }        else            THROW_MESSAGE("Sequence::Sequence(): confused by sequence format");        // check length        if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() != m_sequenceString.length())            THROW_MESSAGE("Sequence::Sequence() - sequence string length mismatch");        // force uppercase        for (unsigned int i=0; i<m_sequenceString.size(); ++i)            m_sequenceString[i] = toupper(m_sequenceString[i]);    } else        THROW_MESSAGE("Sequence::Sequence(): confused by sequence representation");}Sequence::~Sequence(void){    BioseqMap::iterator b = bioseqs.find(this);    if (b != bioseqs.end()) {        BioseqFree(b->second);        bioseqs.erase(b);    }}#define RETURN_FIRST_SEQID_THAT_(is) \    for (i=m_bioseqASN->GetId().begin(); i!=ie; ++i) \        if ((*i)->is()) \            return **iconst CSeq_id& Sequence::GetPreferredIdentifier(void) const{    CBioseq::TId::const_iterator i, ie = m_bioseqASN->GetId().end();    // try to find one of these first    RETURN_FIRST_SEQID_THAT_(IsPdb);    RETURN_FIRST_SEQID_THAT_(IsGi);    // otherwise, just use the first one    return m_bioseqASN->GetId().front().GetObject();}bool Sequence::MatchesSeqId(const CSeq_id& seqID) const{    CBioseq::TId::const_iterator i, ie = m_bioseqASN->GetId().end();    for (i=m_bioseqASN->GetId().begin(); i!=ie; ++i) {        if (seqID.Match(**i))            return true;    }    return false;}string Sequence::IdentifierString(void) const{    return CSeq_id::GetStringDescr(*m_bioseqASN, CSeq_id::eFormat_FastA);}END_SCOPE(struct_util)/** ---------------------------------------------------------------------------* $Log: su_sequence_set.cpp,v $* Revision 1000.0  2004/06/01 18:14:43  gouriano* PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3** Revision 1.3  2004/05/28 09:46:57  thiessen* restructure C-toolkit header usage ; move C Bioseq storage into su_sequence_set** Revision 1.2  2004/05/25 15:52:18  thiessen* add BlockMultipleAlignment, IBM algorithm** Revision 1.1  2004/05/24 23:04:05  thiessen* initial checkin**/

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -