📄 sequence_set.cpp
字号:
/* * =========================================================================== * PRODUCTION $Log: sequence_set.cpp,v $ * PRODUCTION Revision 1000.4 2004/06/01 18:29:07 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.68 * PRODUCTION * =========================================================================== *//* $Id: sequence_set.cpp,v 1000.4 2004/06/01 18:29:07 gouriano Exp $* ===========================================================================** PUBLIC DOMAIN NOTICE* National Center for Biotechnology Information** This software/database is a "United States Government Work" under the* terms of the United States Copyright Act. It was written as part of* the author's official duties as a United States Government employee and* thus cannot be copyrighted. This software/database is freely available* to the public for use. The National Library of Medicine and the U.S.* Government have not placed any restriction on its use or reproduction.** Although all reasonable efforts have been taken to ensure the accuracy* and reliability of the software and data, the NLM and the U.S.* Government do not and cannot warrant the performance or results that* may be obtained by using this software or data. The NLM and the U.S.* Government disclaim all warranties, express or implied, including* warranties of performance, merchantability or fitness for any particular* purpose.** Please cite the author in any work or product based on this material.** ===========================================================================** Authors: Paul Thiessen** File Description:* Classes to hold sets of sequences** ===========================================================================*/#ifdef _MSC_VER#pragma warning(disable:4018) // disable signed/unsigned mismatch warning in MSVC#endif#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <corelib/ncbistre.hpp>#include <corelib/ncbistl.hpp>#include <objects/seqloc/Seq_id.hpp>#include <objects/seqloc/PDB_seq_id.hpp>#include <objects/seqloc/PDB_mol_id.hpp>#include <objects/general/Object_id.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seq_data.hpp>#include <objects/seq/NCBIeaa.hpp>#include <objects/seq/IUPACaa.hpp>#include <objects/seq/NCBIstdaa.hpp>#include <objects/seq/NCBI4na.hpp>#include <objects/seq/NCBI8na.hpp>#include <objects/seq/NCBI2na.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/Seq_annot.hpp>#include <objects/general/Dbtag.hpp>#include <objects/seqloc/Textseq_id.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seqdesc.hpp>#include <objects/seqblock/PDB_block.hpp>#include <objects/seqfeat/BioSource.hpp>#include <objects/seqfeat/Org_ref.hpp>#include <regex.h> // regex from C-toolkit#include "sequence_set.hpp"#include "molecule.hpp"#include "structure_set.hpp"#include "cn3d_tools.hpp"#include "molecule_identifier.hpp"#include "messenger.hpp"USING_NCBI_SCOPE;USING_SCOPE(objects);BEGIN_SCOPE(Cn3D)static void UnpackSeqSet(CBioseq_set& bss, SequenceSet *parent, SequenceSet::SequenceList& seqlist){ CBioseq_set::TSeq_set::iterator q, qe = bss.SetSeq_set().end(); for (q=bss.SetSeq_set().begin(); q!=qe; ++q) { if (q->GetObject().IsSeq()) { // only store amino acid or nucleotide sequences if (q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_aa && q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_dna && q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_rna && q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_na) continue; const Sequence *sequence = new Sequence(parent, q->GetObject().SetSeq()); seqlist.push_back(sequence); } else { // Bioseq-set UnpackSeqSet(q->GetObject().SetSet(), parent, seqlist); } }}static void UnpackSeqEntry(CSeq_entry& seqEntry, SequenceSet *parent, SequenceSet::SequenceList& seqlist){ if (seqEntry.IsSeq()) { const Sequence *sequence = new Sequence(parent, seqEntry.SetSeq()); seqlist.push_back(sequence); } else { // Bioseq-set UnpackSeqSet(seqEntry.SetSet(), parent, seqlist); }}SequenceSet::SequenceSet(StructureBase *parent, SeqEntryList& seqEntries) : StructureBase(parent){ SeqEntryList::iterator s, se = seqEntries.end(); for (s=seqEntries.begin(); s!=se; ++s) UnpackSeqEntry(s->GetObject(), this, sequences); TRACEMSG("number of sequences: " << sequences.size());}#define FIRSTOF2(byte) (((byte) & 0xF0) >> 4)#define SECONDOF2(byte) ((byte) & 0x0F)static void StringFrom4na(const vector< char >& vec, string *str, bool isDNA){ if (SECONDOF2(vec.back()) > 0) str->resize(vec.size() * 2); else str->resize(vec.size() * 2 - 1); // first, extract 4-bit values int i; for (i=0; i<vec.size(); ++i) { str->at(2*i) = FIRSTOF2(vec[i]); if (SECONDOF2(vec[i]) > 0) str->at(2*i + 1) = SECONDOF2(vec[i]); } // then convert 4-bit values to ascii characters for (i=0; i<str->size(); ++i) { switch (str->at(i)) { case 1: str->at(i) = 'A'; break; case 2: str->at(i) = 'C'; break; case 4: str->at(i) = 'G'; break; case 8: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break; default: str->at(i) = 'X'; } }}#define FIRSTOF4(byte) (((byte) & 0xC0) >> 6)#define SECONDOF4(byte) (((byte) & 0x30) >> 4)#define THIRDOF4(byte) (((byte) & 0x0C) >> 2)#define FOURTHOF4(byte) ((byte) & 0x03)static void StringFrom2na(const vector< char >& vec, string *str, bool isDNA){ str->resize(vec.size() * 4); // first, extract 4-bit values int i; for (i=0; i<vec.size(); ++i) { str->at(4*i) = FIRSTOF4(vec[i]); str->at(4*i + 1) = SECONDOF4(vec[i]); str->at(4*i + 2) = THIRDOF4(vec[i]); str->at(4*i + 3) = FOURTHOF4(vec[i]); } // then convert 4-bit values to ascii characters for (i=0; i<str->size(); ++i) { switch (str->at(i)) { case 0: str->at(i) = 'A'; break; case 1: str->at(i) = 'C'; break; case 2: str->at(i) = 'G'; break; case 3: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break; } }}static void StringFromStdaa(const vector < char >& vec, string *str){ static const char *stdaaMap = "-ABCDEFGHIKLMNPQRSTVWXYZU*"; str->resize(vec.size()); for (int i=0; i<vec.size(); ++i) str->at(i) = stdaaMap[vec[i]];}Sequence::Sequence(SequenceSet *parent, ncbi::objects::CBioseq& bioseq) : StructureBase(parent), bioseqASN(&bioseq), molecule(NULL), isProtein(false){ int gi = MoleculeIdentifier::VALUE_NOT_SET, mmdbID = MoleculeIdentifier::VALUE_NOT_SET, pdbChain = MoleculeIdentifier::VALUE_NOT_SET; string pdbID, accession; // get Seq-id info CBioseq::TId::const_iterator s, se = bioseq.GetId().end(); for (s=bioseq.GetId().begin(); s!=se; ++s) { if ((*s)->IsGi()) { gi = (*s)->GetGi(); } else if ((*s)->IsPdb()) { pdbID = (*s)->GetPdb().GetMol().Get(); if ((*s)->GetPdb().IsSetChain()) pdbChain = (*s)->GetPdb().GetChain(); else pdbChain = ' '; } else if ((*s)->IsLocal() && (*s)->GetLocal().IsStr()) { accession = (*s)->GetLocal().GetStr(); // special case where local accession is actually a PDB chain + extra stuff if (pdbID.size() == 0 && accession.size() >= 7 && accession[4] == ' ' && accession[6] == ' ' && isalpha(accession[5])) { pdbID = accession.substr(0, 4); pdbChain = accession[5]; accession.erase(); } } else if ((*s)->IsGenbank() && (*s)->GetGenbank().IsSetAccession()) { accession = (*s)->GetGenbank().GetAccession(); } else if ((*s)->IsSwissprot() && (*s)->GetSwissprot().IsSetAccession()) { accession = (*s)->GetSwissprot().GetAccession(); } else if ((*s)->IsOther() && (*s)->GetOther().IsSetAccession()) { accession = (*s)->GetOther().GetAccession(); } else if ((*s)->IsEmbl() && (*s)->GetEmbl().IsSetAccession()) { accession = (*s)->GetEmbl().GetAccession(); } else if ((*s)->IsDdbj() && (*s)->GetDdbj().IsSetAccession()) { accession = (*s)->GetDdbj().GetAccession(); } } if (gi == MoleculeIdentifier::VALUE_NOT_SET && pdbID.size() == 0 && accession.size() == 0) { ERRORMSG("Sequence::Sequence() - can't parse SeqId"); return; } if (bioseq.IsSetDescr()) { string defline, taxid; CSeq_descr::Tdata::const_iterator d, de = bioseq.GetDescr().Get().end(); for (d=bioseq.GetDescr().Get().begin(); d!=de; ++d) { // get "defline" from title or compound if ((*d)->IsTitle()) { // prefer title over compound defline = (*d)->GetTitle(); } else if (defline.size() == 0 && (*d)->IsPdb() && (*d)->GetPdb().GetCompound().size() > 0) { defline = (*d)->GetPdb().GetCompound().front(); } // get taxonomy if ((*d)->IsSource()) { if ((*d)->GetSource().GetOrg().IsSetTaxname()) taxid = (*d)->GetSource().GetOrg().GetTaxname(); else if ((*d)->GetSource().GetOrg().IsSetCommon()) taxid = (*d)->GetSource().GetOrg().GetCommon(); } } if (taxid.size() > 0) description = string("[") + taxid + ']'; if (defline.size() > 0) { if (taxid.size() > 0) description += ' '; description += defline; } } // get link to MMDB id - mainly for CDD's where Biostrucs have to be loaded separately if (bioseq.IsSetAnnot()) { CBioseq::TAnnot::const_iterator a, ae = bioseq.GetAnnot().end(); for (a=bioseq.GetAnnot().begin(); a!=ae; ++a) { if (a->GetObject().GetData().IsIds()) { CSeq_annot::C_Data::TIds::const_iterator i, ie = a->GetObject().GetData().GetIds().end(); for (i=a->GetObject().GetData().GetIds().begin(); i!=ie; ++i) { if (i->GetObject().IsGeneral() && i->GetObject().GetGeneral().GetDb() == "mmdb" && i->GetObject().GetGeneral().GetTag().IsId()) { mmdbID = i->GetObject().GetGeneral().GetTag().GetId(); break; } } if (i != ie) break; } } } if (mmdbID != MoleculeIdentifier::VALUE_NOT_SET) TRACEMSG("sequence gi " << gi << ", PDB '" << pdbID << "' chain '" << (char) pdbChain << "', is from MMDB id " << mmdbID); // get sequence string if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_raw && bioseq.GetInst().IsSetSeq_data()) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -