cav_seqset.cpp

来自「ncbi源码」· C++ 代码 · 共 418 行

CPP
418
字号
/* * =========================================================================== * PRODUCTION $Log: cav_seqset.cpp,v $ * PRODUCTION Revision 1000.2  2004/06/01 19:41:21  gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.5 * PRODUCTION * =========================================================================== *//*  $Id: cav_seqset.cpp,v 1000.2 2004/06/01 19:41:21 gouriano Exp $* ===========================================================================**                            PUBLIC DOMAIN NOTICE*               National Center for Biotechnology Information**  This software/database is a "United States Government Work" under the*  terms of the United States Copyright Act.  It was written as part of*  the author's official duties as a United States Government employee and*  thus cannot be copyrighted.  This software/database is freely available*  to the public for use. The National Library of Medicine and the U.S.*  Government have not placed any restriction on its use or reproduction.**  Although all reasonable efforts have been taken to ensure the accuracy*  and reliability of the software and data, the NLM and the U.S.*  Government do not and cannot warrant the performance or results that*  may be obtained by using this software or data. The NLM and the U.S.*  Government disclaim all warranties, express or implied, including*  warranties of performance, merchantability or fitness for any particular*  purpose.**  Please cite the author in any work or product based on this material.** ===========================================================================** Authors:  Paul Thiessen** File Description:*      Classes to hold sets of sequences** ===========================================================================*/#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <corelib/ncbistre.hpp>#include <objects/seq/Bioseq.hpp>#include <objects/seqloc/Seq_id.hpp>#include <objects/seqloc/PDB_seq_id.hpp>#include <objects/seqloc/PDB_mol_id.hpp>#include <objects/general/Object_id.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seq_data.hpp>#include <objects/seq/NCBIeaa.hpp>#include <objects/seq/IUPACaa.hpp>#include <objects/seq/NCBI4na.hpp>#include <objects/seq/NCBI8na.hpp>#include <objects/seq/NCBI2na.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/NCBIstdaa.hpp>#include <objects/seq/Seq_annot.hpp>#include <objects/general/Dbtag.hpp>#include <objects/seqloc/Textseq_id.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seqdesc.hpp>#include <objects/seqblock/PDB_block.hpp>#include <memory>#include <objtools/cddalignview/cav_seqset.hpp>#include <objtools/cddalignview/cddalignview.h>BEGIN_NCBI_SCOPEUSING_SCOPE(objects);void SequenceSet::UnpackSeqSet(const CBioseq_set& bss){    CBioseq_set::TSeq_set::const_iterator q, qe = bss.GetSeq_set().end();    for (q=bss.GetSeq_set().begin(); q!=qe; ++q) {        if (q->GetObject().IsSeq()) {            // only store amino acid or nucleotide sequences            if (q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_aa &&                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_dna &&                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_rna &&                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_na)                continue;            const Sequence *sequence = new Sequence(q->GetObject().GetSeq());            if (!sequence || sequence->Status() != CAV_SUCCESS) {                status = sequence->Status();                return;            }            sequences.push_back(sequence);        } else { // Bioseq-set            UnpackSeqSet(q->GetObject().GetSet());        }    }}void SequenceSet::UnpackSeqEntry(const CSeq_entry& seqEntry){    if (seqEntry.IsSeq()) {        const Sequence *sequence = new Sequence(seqEntry.GetSeq());        if (!sequence || sequence->Status() != CAV_SUCCESS) {            status = sequence->Status();            return;        }        sequences.push_back(sequence);    } else { // Bioseq-set        UnpackSeqSet(seqEntry.GetSet());    }}SequenceSet::SequenceSet(const CSeq_entry& seqEntry) :    master(NULL), status(CAV_SUCCESS){    UnpackSeqEntry(seqEntry);    ERR_POST(Info << "number of sequences: " << sequences.size());}SequenceSet::SequenceSet(const SeqEntryList& seqEntries) :    master(NULL), status(CAV_SUCCESS){    SeqEntryList::const_iterator s, se = seqEntries.end();    for (s=seqEntries.begin(); s!=se; ++s)        UnpackSeqEntry(s->GetObject());    ERR_POST(Info << "number of sequences: " << sequences.size());}SequenceSet::~SequenceSet(void){    SequenceList::iterator s, se = sequences.end();    for (s=sequences.begin(); s!=se; ++s) delete *s;}const int Sequence::NOT_SET = -1;#define FIRSTOF2(byte) (((byte) & 0xF0) >> 4)#define SECONDOF2(byte) ((byte) & 0x0F)static void StringFrom4na(const vector< char >& vec, string *str, bool isDNA){    if (SECONDOF2(vec.back()) > 0)        str->resize(vec.size() * 2);    else        str->resize(vec.size() * 2 - 1);    // first, extract 4-bit values    int i;    for (i=0; i<vec.size(); ++i) {        str->at(2*i) = FIRSTOF2(vec[i]);        if (SECONDOF2(vec[i]) > 0) str->at(2*i + 1) = SECONDOF2(vec[i]);    }    // then convert 4-bit values to ascii characters    for (i=0; i<str->size(); ++i) {        switch (str->at(i)) {            case 1: str->at(i) = 'A'; break;            case 2: str->at(i) = 'C'; break;            case 4: str->at(i) = 'G'; break;            case 8: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;            default:                str->at(i) = 'X';        }    }}#define FIRSTOF4(byte) (((byte) & 0xC0) >> 6)#define SECONDOF4(byte) (((byte) & 0x30) >> 4)#define THIRDOF4(byte) (((byte) & 0x0C) >> 2)#define FOURTHOF4(byte) ((byte) & 0x03)static void StringFrom2na(const vector< char >& vec, string *str, bool isDNA){    str->resize(vec.size() * 4);    // first, extract 4-bit values    int i;    for (i=0; i<vec.size(); ++i) {        str->at(4*i) = FIRSTOF4(vec[i]);        str->at(4*i + 1) = SECONDOF4(vec[i]);        str->at(4*i + 2) = THIRDOF4(vec[i]);        str->at(4*i + 3) = FOURTHOF4(vec[i]);    }    // then convert 4-bit values to ascii characters    for (i=0; i<str->size(); ++i) {        switch (str->at(i)) {            case 0: str->at(i) = 'A'; break;            case 1: str->at(i) = 'C'; break;            case 2: str->at(i) = 'G'; break;            case 3: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;        }    }}static void StringFromStdaa(const vector < char >& vec, std::string *str){    static const char *stdaaMap = "-ABCDEFGHIKLMNPQRSTVWXYZU*";    str->resize(vec.size());    for (int i=0; i<vec.size(); ++i)        str->at(i) = stdaaMap[vec[i]];}Sequence::Sequence(const CBioseq& bioseq) :    gi(NOT_SET), pdbChain(' '), mmdbLink(NOT_SET), status(CAV_ERROR_SEQUENCES),    bioseqASN(&bioseq){    // get Seq-id info    CBioseq::TId::const_iterator s, se = bioseq.GetId().end();    for (s=bioseq.GetId().begin(); s!=se; ++s) {        if (s->GetObject().IsGi()) {            gi = s->GetObject().GetGi();        } else if (s->GetObject().IsPdb()) {            pdbID = s->GetObject().GetPdb().GetMol().Get();            if (s->GetObject().GetPdb().IsSetChain())                pdbChain = s->GetObject().GetPdb().GetChain();        } else if (s->GetObject().IsLocal() && s->GetObject().GetLocal().IsStr()) {            pdbID = s->GetObject().GetLocal().GetStr();        } else if (s->GetObject().IsGenbank() && s->GetObject().GetGenbank().IsSetAccession()) {            accession = s->GetObject().GetGenbank().GetAccession();        } else if (s->GetObject().IsSwissprot() && s->GetObject().GetSwissprot().IsSetAccession()) {            accession = s->GetObject().GetSwissprot().GetAccession();        }    }    if (gi == NOT_SET && pdbID.size() == 0 && accession.size() == 0) {        ERR_POST(Error << "Sequence::Sequence() - can't parse SeqId");        return;    }    // try to get description from title or compound    if (bioseq.IsSetDescr()) {        CSeq_descr::Tdata::const_iterator d, de = bioseq.GetDescr().Get().end();        for (d=bioseq.GetDescr().Get().begin(); d!=de; ++d) {            if (d->GetObject().IsTitle()) {                description = d->GetObject().GetTitle();                break;            } else if (d->GetObject().IsPdb() && d->GetObject().GetPdb().GetCompound().size() > 0) {                description = d->GetObject().GetPdb().GetCompound().front();                break;            }        }    }    // get link to MMDB id - mainly for CDD's where Biostrucs have to be loaded separately    if (bioseq.IsSetAnnot()) {        CBioseq::TAnnot::const_iterator a, ae = bioseq.GetAnnot().end();        for (a=bioseq.GetAnnot().begin(); a!=ae; ++a) {            if (a->GetObject().GetData().IsIds()) {                CSeq_annot::C_Data::TIds::const_iterator i, ie = a->GetObject().GetData().GetIds().end();                for (i=a->GetObject().GetData().GetIds().begin(); i!=ie; ++i) {                    if (i->GetObject().IsGeneral() &&                        i->GetObject().GetGeneral().GetDb() == "mmdb" &&                        i->GetObject().GetGeneral().GetTag().IsId()) {                        mmdbLink = i->GetObject().GetGeneral().GetTag().GetId();                        break;                    }                }                if (i != ie) break;            }        }    }    if (mmdbLink != NOT_SET)        ERR_POST(Info << "sequence gi " << gi << ", PDB '" << pdbID << "' chain '" << (char) pdbChain <<            "', is from MMDB id " << mmdbLink);    // get sequence string    if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_raw && bioseq.GetInst().IsSetSeq_data()) {        // protein formats        if (bioseq.GetInst().GetSeq_data().IsNcbieaa()) {            sequenceString = bioseq.GetInst().GetSeq_data().GetNcbieaa().Get();        } else if (bioseq.GetInst().GetSeq_data().IsIupacaa()) {            sequenceString = bioseq.GetInst().GetSeq_data().GetIupacaa().Get();        } else if (bioseq.GetInst().GetSeq_data().IsNcbistdaa()) {            StringFromStdaa(bioseq.GetInst().GetSeq_data().GetNcbistdaa().Get(), &sequenceString);        }        // nucleotide formats        else if (bioseq.GetInst().GetSeq_data().IsIupacna()) {            sequenceString = bioseq.GetInst().GetSeq_data().GetIupacna().Get();        } else if (bioseq.GetInst().GetSeq_data().IsNcbi4na()) {            StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi4na().Get(), &sequenceString,                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));        } else if (bioseq.GetInst().GetSeq_data().IsNcbi8na()) {  // same repr. for non-X as 4na            StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi8na().Get(), &sequenceString,                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));        } else if (bioseq.GetInst().GetSeq_data().IsNcbi2na()) {            StringFrom2na(bioseq.GetInst().GetSeq_data().GetNcbi2na().Get(), &sequenceString,                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));            if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() < sequenceString.length())                sequenceString.resize(bioseq.GetInst().GetLength());        }        else {            ERR_POST(Critical << "Sequence::Sequence() - sequence " << gi                << ": confused by sequence string format");            return;        }        if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() != sequenceString.length()) {            ERR_POST(Critical << "Sequence::Sequence() - sequence string length mismatch");            return;        }    } else {        ERR_POST(Critical << "Sequence::Sequence() - sequence " << gi                << ": confused by sequence representation");        return;    }    status = CAV_SUCCESS;}CSeq_id * Sequence::CreateSeqId(void) const{    CSeq_id *sid = new CSeq_id();    if (pdbID.size() > 0) {        sid->SetPdb().SetMol().Set(pdbID);        if (pdbChain != ' ') sid->SetPdb().SetChain(pdbChain);    } else { // use gi        sid->SetGi(gi);    }    return sid;}string Sequence::GetTitle(void) const{    CNcbiOstrstream oss;    if (pdbID.size() > 0) {        oss << pdbID;        if (pdbChain != ' ') {            oss <<  '_' << (char) pdbChain;        }    } else if (gi != NOT_SET)        oss << "gi " << gi;    else if (accession.size() > 0)        oss << "acc " << accession;    else        oss << '?';    oss << '\0';	auto_ptr<char> chars(oss.str()); // deletes memory upon function return    return string(oss.str());}END_NCBI_SCOPE/** ---------------------------------------------------------------------------* $Log: cav_seqset.cpp,v $* Revision 1000.2  2004/06/01 19:41:21  gouriano* PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.5** Revision 1.5  2004/05/21 21:42:51  gorelenk* Added PCH ncbi_pch.hpp** Revision 1.4  2004/05/08 10:56:20  thiessen* better handling of repeated sequences** Revision 1.3  2004/03/15 18:51:27  thiessen* prefer prefix vs. postfix ++/-- operators** Revision 1.2  2003/06/02 16:06:41  dicuccio* Rearranged src/objects/ subtree.  This includes the following shifts:*     - src/objects/asn2asn --> arc/app/asn2asn*     - src/objects/testmedline --> src/objects/ncbimime/test*     - src/objects/objmgr --> src/objmgr*     - src/objects/util --> src/objmgr/util*     - src/objects/alnmgr --> src/objtools/alnmgr*     - src/objects/flat --> src/objtools/flat*     - src/objects/validator --> src/objtools/validator*     - src/objects/cddalignview --> src/objtools/cddalignview* In addition, libseq now includes six of the objects/seq... libs, and libmmdb* replaces the three libmmdb? libs.** Revision 1.1  2003/03/19 19:04:12  thiessen* move again** Revision 1.2  2003/03/19 16:06:28  thiessen* add <memory>** Revision 1.1  2003/03/19 05:33:43  thiessen* move to src/app/cddalignview** Revision 1.9  2003/02/03 17:52:03  thiessen* move CVS Log to end of file** Revision 1.8  2002/05/28 17:55:48  thiessen* cavpr** Revision 1.7  2001/07/12 21:32:13  thiessen* update for swissprot accessions** Revision 1.6  2001/01/29 18:13:34  thiessen* split into C-callable library + main** Revision 1.5  2001/01/25 00:51:20  thiessen* add command-line args; can read asn data from stdin** Revision 1.4  2001/01/23 20:42:01  thiessen* add description** Revision 1.3  2001/01/23 17:34:12  thiessen* add HTML output** Revision 1.2  2001/01/22 15:55:11  thiessen* correctly set up ncbi namespacing** Revision 1.1  2001/01/22 13:15:24  thiessen* initial checkin**/

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?