📄 su_block_multiple_alignment.cpp
字号:
/* * =========================================================================== * PRODUCTION $Log: su_block_multiple_alignment.cpp,v $ * PRODUCTION Revision 1000.0 2004/06/01 18:14:28 gouriano * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.9 * PRODUCTION * =========================================================================== *//* $Id: su_block_multiple_alignment.cpp,v 1000.0 2004/06/01 18:14:28 gouriano Exp $* ===========================================================================** PUBLIC DOMAIN NOTICE* National Center for Biotechnology Information** This software/database is a "United States Government Work" under the* terms of the United States Copyright Act. It was written as part of* the author's official duties as a United States Government employee and* thus cannot be copyrighted. This software/database is freely available* to the public for use. The National Library of Medicine and the U.S.* Government have not placed any restriction on its use or reproduction.** Although all reasonable efforts have been taken to ensure the accuracy* and reliability of the software and data, the NLM and the U.S.* Government do not and cannot warrant the performance or results that* may be obtained by using this software or data. The NLM and the U.S.* Government disclaim all warranties, express or implied, including* warranties of performance, merchantability or fitness for any particular* purpose.** Please cite the author in any work or product based on this material.** ===========================================================================** Authors: Paul Thiessen** File Description:* Classes to hold alignment data** ===========================================================================*/#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <corelib/ncbi_limits.hpp>#include <util/tables/raw_scoremat.h>#include <objects/seqalign/Dense_diag.hpp>#include "su_block_multiple_alignment.hpp"#include "su_sequence_set.hpp"#include "su_private.hpp"// C-toolkit stuff for PSSM calculation#include <objalign.h>#include <blast.h>#include <blastkar.h>#include <cddutil.h>#include <thrdatd.h>#include <thrddecl.h>USING_NCBI_SCOPE;USING_SCOPE(objects);BEGIN_SCOPE(struct_util)// from su_sequence_set.cppextern BioseqPtr GetOrCreateBioseq(const Sequence *sequence);extern void AddCSeqId(SeqIdPtr *sid, const ncbi::objects::CSeq_id& cppid);static const int SCALING_FACTOR = 1000000;static const string ThreaderResidues = "ARNDCQEGHILKMFPSTWYV";static const string BLASTResidues = "-ABCDEFGHIKLMNPQRSTVWXYZU*";BlockMultipleAlignment::BlockMultipleAlignment(const SequenceList& sequenceList){ m_sequences = sequenceList; m_pssm = NULL; InitCache(); m_rowDoubles.resize(m_sequences.size(), 0.0); m_rowStrings.resize(m_sequences.size());}void BlockMultipleAlignment::InitCache(void){ m_cachePrevRow = eUndefined; m_cachePrevBlock = NULL; m_cacheBlockIterator = m_blocks.begin();}BlockMultipleAlignment::~BlockMultipleAlignment(void){ RemovePSSM();}BlockMultipleAlignment * BlockMultipleAlignment::Clone(void) const{ BlockMultipleAlignment *copy = new BlockMultipleAlignment(m_sequences); BlockList::const_iterator b, be = m_blocks.end(); for (b=m_blocks.begin(); b!=be; ++b) copy->m_blocks.push_back(CRef<Block>((*b)->Clone(copy))); copy->UpdateBlockMap(); copy->m_rowDoubles = m_rowDoubles; copy->m_rowStrings = m_rowStrings; return copy;}static inline unsigned int ScreenResidueCharacter(char original){ char ch = toupper(original); switch (ch) { case 'A': case 'R': case 'N': case 'D': case 'C': case 'Q': case 'E': case 'G': case 'H': case 'I': case 'L': case 'K': case 'M': case 'F': case 'P': case 'S': case 'T': case 'W': case 'Y': case 'V': case 'B': case 'Z': break; default: ch = 'X'; // make all but natural aa's just 'X' } return ch;}static int GetBLOSUM62Score(char a, char b){ static SNCBIFullScoreMatrix Blosum62Matrix; static bool unpacked = false; if (!unpacked) { NCBISM_Unpack(&NCBISM_Blosum62, &Blosum62Matrix); unpacked = true; } return Blosum62Matrix.s[ScreenResidueCharacter(a)][ScreenResidueCharacter(b)];}// creates a SeqAlign from a BlockMultipleAlignmentstatic SeqAlignPtr CreateCSeqAlign(const BlockMultipleAlignment& multiple){ // one SeqAlign (chained into a linked list) for each slave row SeqAlignPtr prevSap = NULL, firstSap = NULL; for (unsigned int row=1; row<multiple.NRows(); ++row) { SeqAlignPtr sap = SeqAlignNew(); if (prevSap) prevSap->next = sap; prevSap = sap; if (!firstSap) firstSap = sap; sap->type = SAT_PARTIAL; sap->dim = 2; sap->segtype = SAS_DENDIAG; DenseDiagPtr prevDd = NULL; BlockMultipleAlignment::UngappedAlignedBlockList blocks; multiple.GetUngappedAlignedBlocks(&blocks); BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator b, be = blocks.end(); for (b=blocks.begin(); b!=be; ++b) { DenseDiagPtr dd = DenseDiagNew(); if (prevDd) prevDd->next = dd; prevDd = dd; if (b == blocks.begin()) sap->segs = dd; dd->dim = 2; AddCSeqId(&(dd->id), multiple.GetSequenceOfRow(0)->GetPreferredIdentifier()); AddCSeqId(&(dd->id), multiple.GetSequenceOfRow(row)->GetPreferredIdentifier()); dd->len = (*b)->m_width; dd->starts = (Int4Ptr) MemNew(2 * sizeof(Int4)); const Block::Range *range = (*b)->GetRangeOfRow(0); dd->starts[0] = range->from; range = (*b)->GetRangeOfRow(row); dd->starts[1] = range->from; } } return firstSap;}static void CreateAllBioseqs(const BlockMultipleAlignment& multiple){ for (unsigned int row=0; row<multiple.NRows(); ++row) GetOrCreateBioseq(multiple.GetSequenceOfRow(row));}static Seq_Mtf * CreateSeqMtf(const BlockMultipleAlignment& multiple, double weightPSSM, BLAST_KarlinBlkPtr karlinBlock){ // special case for "PSSM" of single-row "alignment" - just use BLOSUM62 score if (multiple.NRows() == 1) { Seq_Mtf *seqMtf = NewSeqMtf(multiple.GetMaster()->Length(), ThreaderResidues.size()); for (unsigned int res=0; res<multiple.GetMaster()->Length(); ++res) for (unsigned int aa=0; aa<ThreaderResidues.size(); ++aa) seqMtf->ww[res][aa] = ThrdRound(weightPSSM * SCALING_FACTOR * GetBLOSUM62Score(multiple.GetMaster()->m_sequenceString[res], ThreaderResidues[aa])); WARNING_MESSAGE("Created Seq_Mtf (PSSM) from BLOSUM62 scores"); return seqMtf; } // convert all sequences to Bioseqs CreateAllBioseqs(multiple); // create SeqAlign from this BlockMultipleAlignment SeqAlignPtr seqAlign = CreateCSeqAlign(multiple); // "spread" unaligned residues between aligned blocks, for PSSM construction CddDegapSeqAlign(seqAlign); Seq_Mtf *seqMtf = NULL; for (int i=11; i>=1; --i) { // first try auto-determined pseudocount (-1); if fails, find higest <= 10 that works int pseudocount = (i == 11) ? -1 : i; seqMtf = CddDenDiagCposComp2KBP( GetOrCreateBioseq(multiple.GetMaster()), pseudocount, seqAlign, NULL, NULL, weightPSSM, SCALING_FACTOR, NULL, karlinBlock ); if (seqMtf) break; else WARNING_MESSAGE("Cannot use " << ((pseudocount == -1) ? "(empirical) " : "") << "pseudocount of " << pseudocount); } if (seqMtf) TRACE_MESSAGE("created Seq_Mtf (PSSM)"); else ERROR_MESSAGE("Cannot find any pseudocount that yields an acceptable PSSM!"); SeqAlignSetFree(seqAlign); return seqMtf;}// gives BLAST residue number for a character (or # for 'X' if char not found)int LookupBLASTResidueNumberFromCharacter(unsigned char r){ typedef map < unsigned char, int > Char2Int; static Char2Int charMap; if (charMap.size() == 0) { for (unsigned int i=0; i<BLASTResidues.size(); ++i) charMap[BLASTResidues[i]] = i; } Char2Int::const_iterator n = charMap.find(toupper(r)); if (n != charMap.end()) return n->second; else return charMap.find('X')->second;}// gives BLAST residue number for a threader residue number (or # for 'X' if char == -1)static int LookupBLASTResidueNumberFromThreaderResidueNumber(unsigned char r){ r = toupper(r); return LookupBLASTResidueNumberFromCharacter( (r < ThreaderResidues.size()) ? ThreaderResidues[r] : 'X');}static Int4 Round(double d){ if (d >= 0.0) return (Int4) (d + 0.5); else return (Int4) (d - 0.5);}const BLAST_Matrix * BlockMultipleAlignment::GetPSSM(void) const{ if (m_pssm) return m_pssm; // for now, use threader's SeqMtf BLAST_KarlinBlkPtr karlinBlock = BlastKarlinBlkCreate(); Seq_Mtf *seqMtf = CreateSeqMtf(*this, 1.0, karlinBlock); m_pssm = (BLAST_Matrix *) MemNew(sizeof(BLAST_Matrix)); m_pssm->is_prot = TRUE; m_pssm->name = StringSave("BLOSUM62"); m_pssm->karlinK = karlinBlock->K; m_pssm->rows = seqMtf->n + 1; m_pssm->columns = 26; int i, j; m_pssm->matrix = (Int4 **) MemNew(m_pssm->rows * sizeof(Int4 *)); for (i=0; i<m_pssm->rows; ++i) { m_pssm->matrix[i] = (Int4 *) MemNew(m_pssm->columns * sizeof(Int4)); // set scores from threader matrix if (i < seqMtf->n) { // initialize all rows with custom score, or BLAST_SCORE_MIN; to match what Aron's function creates for (j=0; j<m_pssm->columns; ++j) m_pssm->matrix[i][j] = (j == 21 ? -1 : (j == 25 ? -4 : BLAST_SCORE_MIN)); for (j=0; j<seqMtf->AlphabetSize; ++j) { m_pssm->matrix[i][LookupBLASTResidueNumberFromThreaderResidueNumber(j)] = Round(((double) seqMtf->ww[i][j]) / SCALING_FACTOR); } } else { // initialize last row with BLAST_SCORE_MIN for (j=0; j<m_pssm->columns; ++j) m_pssm->matrix[i][j] = BLAST_SCORE_MIN; } } m_pssm->posFreqs = NULL; FreeSeqMtf(seqMtf); BlastKarlinBlkDestruct(karlinBlock); return m_pssm;}void BlockMultipleAlignment::RemovePSSM(void) const{ if (m_pssm) { BLAST_MatrixDestruct(m_pssm); m_pssm = NULL; }}bool BlockMultipleAlignment::CheckAlignedBlock(const Block *block) const{ if (!block || !block->IsAligned()) { ERROR_MESSAGE("CheckAlignedBlock() - checks aligned blocks only"); return false; } if (block->NSequences() != m_sequences.size()) { ERROR_MESSAGE("CheckAlignedBlock() - block size mismatch"); return false; } // make sure ranges are reasonable for each sequence unsigned int row; const Block *prevBlock = GetBlockBefore(block), *nextBlock = GetBlockAfter(block); const Block::Range *range, *prevRange = NULL, *nextRange = NULL; SequenceList::const_iterator sequence = m_sequences.begin(); for (row=0; row<block->NSequences(); ++row, ++sequence) { range = block->GetRangeOfRow(row); if (prevBlock) prevRange = prevBlock->GetRangeOfRow(row); if (nextBlock) nextRange = nextBlock->GetRangeOfRow(row); if (range->to - range->from + 1 != block->m_width || // check block m_width (prevRange && range->from <= prevRange->to) || // check for range overlap (nextRange && range->to >= nextRange->from) || // check for range overlap range->from > range->to || // check range values range->to >= (*sequence)->Length()) // check bounds of end { ERROR_MESSAGE("CheckAlignedBlock() - range error"); return false; } } return true;}bool BlockMultipleAlignment::AddAlignedBlockAtEnd(UngappedAlignedBlock *newBlock){ m_blocks.push_back(CRef<Block>(newBlock)); return CheckAlignedBlock(newBlock);}UnalignedBlock * BlockMultipleAlignment:: CreateNewUnalignedBlockBetween(const Block *leftBlock, const Block *rightBlock){ if ((leftBlock && !leftBlock->IsAligned()) || (rightBlock && !rightBlock->IsAligned())) { ERROR_MESSAGE("CreateNewUnalignedBlockBetween() - passed an unaligned block"); return NULL; } unsigned int row, from, to, length; SequenceList::const_iterator s, se = m_sequences.end(); UnalignedBlock *newBlock = new UnalignedBlock(this); newBlock->m_width = 0; for (row=0, s=m_sequences.begin(); s!=se; ++row, ++s) { if (leftBlock) from = leftBlock->GetRangeOfRow(row)->to + 1; else from = 0; if (rightBlock) to = rightBlock->GetRangeOfRow(row)->from - 1; else to = (*s)->Length() - 1; newBlock->SetRangeOfRow(row, from, to); length = to - from + 1; if ((to - from + 1) < 0) { // just to make sure... ERROR_MESSAGE("CreateNewUnalignedBlockBetween() - unaligned length < 0"); return NULL; } if (length > newBlock->m_width) newBlock->m_width = length; } if (newBlock->m_width == 0) { delete newBlock; return NULL; } else return newBlock;}bool BlockMultipleAlignment::AddUnalignedBlocks(void){ BlockList::iterator a, ae = m_blocks.end(); const Block *alignedBlock = NULL, *prevAlignedBlock = NULL; Block *newUnalignedBlock; // unaligned m_blocks to the left of each aligned block for (a=m_blocks.begin(); a!=ae; ++a) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -