📄 su_sequence_set.cpp
字号:
/* * =========================================================================== * PRODUCTION $Log: su_sequence_set.cpp,v $ * PRODUCTION Revision 1000.0 2004/06/01 18:14:43 gouriano * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3 * PRODUCTION * =========================================================================== *//* $Id: su_sequence_set.cpp,v 1000.0 2004/06/01 18:14:43 gouriano Exp $* ===========================================================================** PUBLIC DOMAIN NOTICE* National Center for Biotechnology Information** This software/database is a "United States Government Work" under the* terms of the United States Copyright Act. It was written as part of* the author's official duties as a United States Government employee and* thus cannot be copyrighted. This software/database is freely available* to the public for use. The National Library of Medicine and the U.S.* Government have not placed any restriction on its use or reproduction.** Although all reasonable efforts have been taken to ensure the accuracy* and reliability of the software and data, the NLM and the U.S.* Government do not and cannot warrant the performance or results that* may be obtained by using this software or data. The NLM and the U.S.* Government disclaim all warranties, express or implied, including* warranties of performance, merchantability or fitness for any particular* purpose.** Please cite the author in any work or product based on this material.** ===========================================================================** Authors: Paul Thiessen** File Description:* Classes to hold sets of sequences** ===========================================================================*/#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <corelib/ncbistre.hpp>#include <corelib/ncbistl.hpp>#include <vector>#include <map>#include <objects/seqloc/Seq_id.hpp>#include <objects/seqloc/PDB_seq_id.hpp>#include <objects/seqloc/PDB_mol_id.hpp>#include <objects/general/Object_id.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seq_data.hpp>#include <objects/seq/NCBIeaa.hpp>#include <objects/seq/IUPACaa.hpp>#include <objects/seq/NCBIstdaa.hpp>#include <objects/seq/NCBI4na.hpp>#include <objects/seq/NCBI8na.hpp>#include <objects/seq/NCBI2na.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/Seq_annot.hpp>#include <objects/general/Dbtag.hpp>#include <objects/seqloc/Textseq_id.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seqdesc.hpp>#include <objects/seqblock/PDB_block.hpp>#include <objects/seqfeat/BioSource.hpp>#include <objects/seqfeat/Org_ref.hpp>#include "su_sequence_set.hpp"#include "su_private.hpp"// borrow Cn3D's asn conversion functions#include "../src/app/cn3d/asn_converter.hpp"// C-toolkit stuff for BioseqPtr creation#include <objseq.h>#include <objalign.h>USING_NCBI_SCOPE;USING_SCOPE(objects);BEGIN_SCOPE(struct_util)// holds C Bioseqs associated with Sequencestypedef map < const Sequence *, Bioseq * > BioseqMap;static BioseqMap bioseqs;void AddCSeqId(SeqIdPtr *sid, const ncbi::objects::CSeq_id& cppid){ string err; ValNode *vn = (SeqIdPtr) Cn3D::ConvertAsnFromCPPToC(cppid, (AsnReadFunc) SeqIdAsnRead, &err); if (!vn || err.size() > 0) { ERROR_MESSAGE("AddCSeqId() - ConvertAsnFromCPPToC() failed"); return; } ValNodeLink(sid, vn);}static void AddCSeqIdAll(SeqIdPtr *id, const Sequence& sequence){ CBioseq::TId::const_iterator i, ie = sequence.GetAllIdentifiers().end(); for (i=sequence.GetAllIdentifiers().begin(); i!=ie; ++i) AddCSeqId(id, **i);}BioseqPtr GetOrCreateBioseq(const Sequence *sequence){ if (!sequence || !sequence->m_isProtein) { ERROR_MESSAGE("GetOrCreateBioseq() - got non-protein or NULL Sequence"); return NULL; } // if already done BioseqMap::const_iterator b = bioseqs.find(sequence); if (b != bioseqs.end()) return b->second; // create new Bioseq and fill it in from Sequence data BioseqPtr bioseq = BioseqNew(); bioseq->mol = Seq_mol_aa; bioseq->seq_data_type = Seq_code_ncbieaa; bioseq->repr = Seq_repr_raw; bioseq->length = sequence->Length(); bioseq->seq_data = BSNew(bioseq->length); BSWrite(bioseq->seq_data, const_cast<char*>(sequence->m_sequenceString.c_str()), bioseq->length); // create Seq-id AddCSeqIdAll(&(bioseq->id), *sequence); // store Bioseq bioseqs[sequence] = bioseq; return bioseq;}static void UnpackSeqSet(CBioseq_set& bss, SequenceSet::SequenceList& seqlist){ CBioseq_set::TSeq_set::iterator q, qe = bss.SetSeq_set().end(); for (q=bss.SetSeq_set().begin(); q!=qe; ++q) { if (q->GetObject().IsSeq()) { // only store amino acid or nucleotide sequences if (q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_aa && q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_dna && q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_rna && q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_na) continue; CRef < Sequence > sequence(new Sequence(q->GetObject().SetSeq())); seqlist.push_back(sequence); } else { // Bioseq-set UnpackSeqSet(q->GetObject().SetSet(), seqlist); } }}static void UnpackSeqEntry(CSeq_entry& seqEntry, SequenceSet::SequenceList& seqlist){ if (seqEntry.IsSeq()) { CRef < Sequence > sequence(new Sequence(seqEntry.SetSeq())); seqlist.push_back(sequence); } else { // Bioseq-set UnpackSeqSet(seqEntry.SetSet(), seqlist); }}SequenceSet::SequenceSet(SeqEntryList& seqEntries){ SeqEntryList::iterator s, se = seqEntries.end(); for (s=seqEntries.begin(); s!=se; ++s) UnpackSeqEntry(s->GetObject(), m_sequences); TRACE_MESSAGE("number of sequences: " << m_sequences.size());}#define FIRSTOF2(byte) (((byte) & 0xF0) >> 4)#define SECONDOF2(byte) ((byte) & 0x0F)static void StringFrom4na(const vector< char >& vec, string *str, bool isDNA){ if (SECONDOF2(vec.back()) > 0) str->resize(vec.size() * 2); else str->resize(vec.size() * 2 - 1); // first, extract 4-bit values unsigned int i; for (i=0; i<vec.size(); ++i) { str->at(2*i) = FIRSTOF2(vec[i]); if (SECONDOF2(vec[i]) > 0) str->at(2*i + 1) = SECONDOF2(vec[i]); } // then convert 4-bit values to ascii characters for (i=0; i<str->size(); ++i) { switch (str->at(i)) { case 1: str->at(i) = 'A'; break; case 2: str->at(i) = 'C'; break; case 4: str->at(i) = 'G'; break; case 8: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break; default: str->at(i) = 'X'; } }}#define FIRSTOF4(byte) (((byte) & 0xC0) >> 6)#define SECONDOF4(byte) (((byte) & 0x30) >> 4)#define THIRDOF4(byte) (((byte) & 0x0C) >> 2)#define FOURTHOF4(byte) ((byte) & 0x03)static void StringFrom2na(const vector< char >& vec, string *str, bool isDNA){ str->resize(vec.size() * 4); // first, extract 4-bit values unsigned int i; for (i=0; i<vec.size(); ++i) { str->at(4*i) = FIRSTOF4(vec[i]); str->at(4*i + 1) = SECONDOF4(vec[i]); str->at(4*i + 2) = THIRDOF4(vec[i]); str->at(4*i + 3) = FOURTHOF4(vec[i]); } // then convert 4-bit values to ascii characters for (i=0; i<str->size(); ++i) { switch (str->at(i)) { case 0: str->at(i) = 'A'; break; case 1: str->at(i) = 'C'; break; case 2: str->at(i) = 'G'; break; case 3: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break; } }}static void StringFromStdaa(const vector < char >& vec, string *str){ static const char *stdaaMap = "-ABCDEFGHIKLMNPQRSTVWXYZU*"; str->resize(vec.size()); for (unsigned int i=0; i<vec.size(); ++i) str->at(i) = stdaaMap[vec[i]];}Sequence::Sequence(ncbi::objects::CBioseq& bioseq) : m_bioseqASN(&bioseq), m_isProtein(false){ // fill out description if (bioseq.IsSetDescr()) { string defline, taxid; CSeq_descr::Tdata::const_iterator d, de = bioseq.GetDescr().Get().end(); for (d=bioseq.GetDescr().Get().begin(); d!=de; ++d) { // get "defline" from title or compound if ((*d)->IsTitle()) { // prefer title over compound defline = (*d)->GetTitle(); } else if (defline.size() == 0 && (*d)->IsPdb() && (*d)->GetPdb().GetCompound().size() > 0) { defline = (*d)->GetPdb().GetCompound().front(); } // get taxonomy if ((*d)->IsSource()) { if ((*d)->GetSource().GetOrg().IsSetTaxname()) taxid = (*d)->GetSource().GetOrg().GetTaxname(); else if ((*d)->GetSource().GetOrg().IsSetCommon()) taxid = (*d)->GetSource().GetOrg().GetCommon(); } } if (taxid.size() > 0) m_description = string("[") + taxid + ']'; if (defline.size() > 0) { if (taxid.size() > 0) m_description += ' '; m_description += defline; } } // get sequence string if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_raw && bioseq.GetInst().IsSetSeq_data()) { // protein formats if (bioseq.GetInst().GetSeq_data().IsNcbieaa()) { m_sequenceString = bioseq.GetInst().GetSeq_data().GetNcbieaa().Get(); m_isProtein = true; } else if (bioseq.GetInst().GetSeq_data().IsIupacaa()) { m_sequenceString = bioseq.GetInst().GetSeq_data().GetIupacaa().Get(); m_isProtein = true; } else if (bioseq.GetInst().GetSeq_data().IsNcbistdaa()) { StringFromStdaa(bioseq.GetInst().GetSeq_data().GetNcbistdaa().Get(), &m_sequenceString); m_isProtein = true; } // nucleotide formats else if (bioseq.GetInst().GetSeq_data().IsIupacna()) { m_sequenceString = bioseq.GetInst().GetSeq_data().GetIupacna().Get(); // convert 'T' to 'U' for RNA if (bioseq.GetInst().GetMol() == CSeq_inst::eMol_rna) { for (unsigned int i=0; i<m_sequenceString.size(); ++i) { if (m_sequenceString[i] == 'T') m_sequenceString[i] = 'U'; } } } else if (bioseq.GetInst().GetSeq_data().IsNcbi4na()) { StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi4na().Get(), &m_sequenceString, (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna)); } else if (bioseq.GetInst().GetSeq_data().IsNcbi8na()) { // same repr. for non-X as 4na StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi8na().Get(), &m_sequenceString, (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna)); } else if (bioseq.GetInst().GetSeq_data().IsNcbi2na()) { StringFrom2na(bioseq.GetInst().GetSeq_data().GetNcbi2na().Get(), &m_sequenceString, (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna)); if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() < m_sequenceString.length()) m_sequenceString.resize(bioseq.GetInst().GetLength()); } else THROW_MESSAGE("Sequence::Sequence(): confused by sequence format"); // check length if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() != m_sequenceString.length()) THROW_MESSAGE("Sequence::Sequence() - sequence string length mismatch"); // force uppercase for (unsigned int i=0; i<m_sequenceString.size(); ++i) m_sequenceString[i] = toupper(m_sequenceString[i]); } else THROW_MESSAGE("Sequence::Sequence(): confused by sequence representation");}Sequence::~Sequence(void){ BioseqMap::iterator b = bioseqs.find(this); if (b != bioseqs.end()) { BioseqFree(b->second); bioseqs.erase(b); }}#define RETURN_FIRST_SEQID_THAT_(is) \ for (i=m_bioseqASN->GetId().begin(); i!=ie; ++i) \ if ((*i)->is()) \ return **iconst CSeq_id& Sequence::GetPreferredIdentifier(void) const{ CBioseq::TId::const_iterator i, ie = m_bioseqASN->GetId().end(); // try to find one of these first RETURN_FIRST_SEQID_THAT_(IsPdb); RETURN_FIRST_SEQID_THAT_(IsGi); // otherwise, just use the first one return m_bioseqASN->GetId().front().GetObject();}bool Sequence::MatchesSeqId(const CSeq_id& seqID) const{ CBioseq::TId::const_iterator i, ie = m_bioseqASN->GetId().end(); for (i=m_bioseqASN->GetId().begin(); i!=ie; ++i) { if (seqID.Match(**i)) return true; } return false;}string Sequence::IdentifierString(void) const{ return CSeq_id::GetStringDescr(*m_bioseqASN, CSeq_id::eFormat_FastA);}END_SCOPE(struct_util)/** ---------------------------------------------------------------------------* $Log: su_sequence_set.cpp,v $* Revision 1000.0 2004/06/01 18:14:43 gouriano* PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3** Revision 1.3 2004/05/28 09:46:57 thiessen* restructure C-toolkit header usage ; move C Bioseq storage into su_sequence_set** Revision 1.2 2004/05/25 15:52:18 thiessen* add BlockMultipleAlignment, IBM algorithm** Revision 1.1 2004/05/24 23:04:05 thiessen* initial checkin**/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -