📄 cn3d_blast.cpp
字号:
/* * =========================================================================== * PRODUCTION $Log: cn3d_blast.cpp,v $ * PRODUCTION Revision 1000.2 2004/06/01 18:28:09 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.34 * PRODUCTION * =========================================================================== *//* $Id: cn3d_blast.cpp,v 1000.2 2004/06/01 18:28:09 gouriano Exp $* ===========================================================================** PUBLIC DOMAIN NOTICE* National Center for Biotechnology Information** This software/database is a "United States Government Work" under the* terms of the United States Copyright Act. It was written as part of* the author's official duties as a United States Government employee and* thus cannot be copyrighted. This software/database is freely available* to the public for use. The National Library of Medicine and the U.S.* Government have not placed any restriction on its use or reproduction.** Although all reasonable efforts have been taken to ensure the accuracy* and reliability of the software and data, the NLM and the U.S.* Government do not and cannot warrant the performance or results that* may be obtained by using this software or data. The NLM and the U.S.* Government disclaim all warranties, express or implied, including* warranties of performance, merchantability or fitness for any particular* purpose.** Please cite the author in any work or product based on this material.** ===========================================================================** Authors: Paul Thiessen** File Description:* module for aligning with BLAST and related algorithms** ===========================================================================*/#ifdef _MSC_VER#pragma warning(disable:4018) // disable signed/unsigned mismatch warning in MSVC#endif#include <ncbi_pch.hpp>#include <corelib/ncbistd.hpp>#include <objects/seqalign/Seq_align.hpp>#include <objects/seqalign/Dense_seg.hpp>#include <objects/seqalign/Score.hpp>#include <objects/general/Object_id.hpp>#ifdef __WXMSW__#include <windows.h>#include <wx/msw/winundef.h>#endif#include <wx/wx.h>// C stuff#include <objseq.h>#include <blast.h>#include <blastkar.h>#include "cn3d_blast.hpp"#include "structure_set.hpp"#include "sequence_set.hpp"#include "block_multiple_alignment.hpp"#include "cn3d_tools.hpp"#include "asn_converter.hpp"#include "molecule_identifier.hpp"#include "cn3d_threader.hpp"// hack so I can catch memory leaks specific to this module, at the line where allocation occurs#ifdef _DEBUG#ifdef MemNew#undef MemNew#endif#define MemNew(sz) memset(malloc(sz), 0, (sz))#endifUSING_NCBI_SCOPE;USING_SCOPE(objects);BEGIN_SCOPE(Cn3D)const string BLASTer::BLASTResidues = "-ABCDEFGHIKLMNPQRSTVWXYZU*";// gives BLAST residue number for a character (or # for 'X' if char not found)int LookupBLASTResidueNumberFromCharacter(char r){ typedef map < char, int > Char2Int; static Char2Int charMap; if (charMap.size() == 0) { for (int i=0; i<BLASTer::BLASTResidues.size(); ++i) charMap[BLASTer::BLASTResidues[i]] = i; } Char2Int::const_iterator n = charMap.find(toupper(r)); if (n != charMap.end()) return n->second; else return charMap.find('X')->second;}// gives BLAST residue number for a threader residue number (or # for 'X' if char == -1)int LookupBLASTResidueNumberFromThreaderResidueNumber(char r){ r = toupper(r); return LookupBLASTResidueNumberFromCharacter( (r >= 0 && r < Threader::ThreaderResidues.size()) ? Threader::ThreaderResidues[r] : 'X');}#ifdef _DEBUG#define PRINT_PSSM // for testing/debugging#endifstatic BLAST_OptionsBlkPtr CreateBlastOptionsBlk(void){ BLAST_OptionsBlkPtr bob = BLASTOptionNew("blastp", true); bob->db_length = 2717223; // size of CDD database v1.62 bob->scalingFactor = 1.0; return bob;}static void SetEffectiveSearchSpaceSize(BLAST_OptionsBlkPtr bob, Int4 queryLength){ bob->searchsp_eff = BLASTCalculateSearchSpace(bob, 1, bob->db_length, queryLength);}void BLASTer::CreateNewPairwiseAlignmentsByBlast(const BlockMultipleAlignment *multiple, const AlignmentList& toRealign, AlignmentList *newAlignments, bool usePSSM){ if (usePSSM && !multiple) { ERRORMSG("usePSSM true, but NULL multiple alignment"); return; } // set up BLAST stuff BLAST_OptionsBlkPtr options = CreateBlastOptionsBlk(); if (!options) { ERRORMSG("BLASTOptionNew() failed"); return; } // create Seq-loc intervals SeqLocPtr masterSeqLoc = (SeqLocPtr) MemNew(sizeof(SeqLoc)); masterSeqLoc->choice = SEQLOC_INT; SeqIntPtr masterSeqInt = SeqIntNew(); masterSeqLoc->data.ptrvalue = masterSeqInt; if (multiple) { BlockMultipleAlignment::UngappedAlignedBlockList uaBlocks; multiple->GetUngappedAlignedBlocks(&uaBlocks); if (uaBlocks.size() > 0) { int excess = 0; if (!RegistryGetInteger(REG_ADVANCED_SECTION, REG_FOOTPRINT_RES, &excess)) WARNINGMSG("Can't get footprint excess residues from registry"); masterSeqInt->from = uaBlocks.front()->GetRangeOfRow(0)->from - excess; if (masterSeqInt->from < 0) masterSeqInt->from = 0; masterSeqInt->to = uaBlocks.back()->GetRangeOfRow(0)->to + excess; if (masterSeqInt->to >= multiple->GetMaster()->Length()) masterSeqInt->to = multiple->GetMaster()->Length() - 1; } else { masterSeqInt->from = 0; masterSeqInt->to = multiple->GetMaster()->Length() - 1; } } SeqLocPtr slaveSeqLoc = (SeqLocPtr) MemNew(sizeof(SeqLoc)); slaveSeqLoc->choice = SEQLOC_INT; SeqIntPtr slaveSeqInt = SeqIntNew(); slaveSeqLoc->data.ptrvalue = slaveSeqInt; // create BLAST_Matrix if using PSSM from multiple const BLAST_Matrix *BLASTmatrix = NULL; if (usePSSM) BLASTmatrix = multiple->GetPSSM(); string err; AlignmentList::const_iterator s, se = toRealign.end(); for (s=toRealign.begin(); s!=se; ++s) { if (multiple && (*s)->GetMaster() != multiple->GetMaster()) ERRORMSG("master sequence mismatch"); // get C Bioseq for master const Sequence *masterSeq = multiple ? multiple->GetMaster() : (*s)->GetMaster(); Bioseq *masterBioseq = masterSeq->parentSet->GetOrCreateBioseq(masterSeq); masterSeqInt->id = masterBioseq->id; // override master alignment interval if specified if (!multiple) { if ((*s)->alignMasterFrom >= 0 && (*s)->alignMasterFrom < masterSeq->Length() && (*s)->alignMasterTo >= 0 && (*s)->alignMasterTo < masterSeq->Length() && (*s)->alignMasterFrom <= (*s)->alignMasterTo) { masterSeqInt->from = (*s)->alignMasterFrom; masterSeqInt->to = (*s)->alignMasterTo; } else { masterSeqInt->from = 0; masterSeqInt->to = masterSeq->Length() - 1; } } // get C Bioseq for slave of each incoming (pairwise) alignment const Sequence *slaveSeq = (*s)->GetSequenceOfRow(1); Bioseq *slaveBioseq = slaveSeq->parentSet->GetOrCreateBioseq(slaveSeq); // setup Seq-loc interval for slave slaveSeqInt->id = slaveBioseq->id; slaveSeqInt->from = ((*s)->alignSlaveFrom >= 0 && (*s)->alignSlaveFrom < slaveSeq->Length()) ? (*s)->alignSlaveFrom : 0; slaveSeqInt->to = ((*s)->alignSlaveTo >= 0 && (*s)->alignSlaveTo < slaveSeq->Length()) ? (*s)->alignSlaveTo : slaveSeq->Length() - 1; INFOMSG("for BLAST: master range " << (masterSeqInt->from + 1) << " to " << (masterSeqInt->to + 1) << ", slave range " << (slaveSeqInt->from + 1) << " to " << (slaveSeqInt->to + 1)); // set search space size SetEffectiveSearchSpaceSize(options, slaveSeqInt->to - slaveSeqInt->from + 1); INFOMSG("effective search space size: " << ((int) options->searchsp_eff)); // actually do the BLAST alignment SeqAlign *salp = NULL; if (usePSSM) { INFOMSG("calling BlastTwoSequencesByLocWithCallback(); PSSM db_length " << (int) options->db_length << ", K " << BLASTmatrix->karlinK); salp = BlastTwoSequencesByLocWithCallback( masterSeqLoc, slaveSeqLoc, "blastp", options, NULL, NULL, NULL, const_cast<BLAST_Matrix*>(BLASTmatrix)); } else { INFOMSG("calling BlastTwoSequencesByLoc()"); salp = BlastTwoSequencesByLoc(masterSeqLoc, slaveSeqLoc, "blastp", options); } // create new alignment structure BlockMultipleAlignment::SequenceList *seqs = new BlockMultipleAlignment::SequenceList(2); (*seqs)[0] = masterSeq; (*seqs)[1] = slaveSeq; BlockMultipleAlignment *newAlignment = new BlockMultipleAlignment(seqs, slaveSeq->parentSet->alignmentManager); newAlignment->SetRowDouble(0, kMax_Double); newAlignment->SetRowDouble(1, kMax_Double); // process the result if (!salp) { WARNINGMSG("BLAST failed to find a significant alignment"); } else { // convert C SeqAlign to C++ for convenience#ifdef _DEBUG AsnIoPtr aip = AsnIoOpen("seqalign.txt", "w"); SeqAlignAsnWrite(salp, aip, NULL); AsnIoFree(aip, true);#endif CSeq_align sa; bool okay = ConvertAsnFromCToCPP(salp, (AsnWriteFunc) SeqAlignAsnWrite, &sa, &err); SeqAlignSetFree(salp); if (!okay) { ERRORMSG("Conversion of SeqAlign to C++ object failed"); continue; } // make sure the structure of this SeqAlign is what we expect if (!sa.IsSetDim() || sa.GetDim() != 2 || !sa.GetSegs().IsDenseg() || sa.GetSegs().GetDenseg().GetDim() != 2 || !masterSeq->identifier->MatchesSeqId(*(sa.GetSegs().GetDenseg().GetIds().front())) || !slaveSeq->identifier->MatchesSeqId(*(sa.GetSegs().GetDenseg().GetIds().back()))) { ERRORMSG("Confused by BLAST result format"); continue; } // fill out with blocks from BLAST alignment CDense_seg::TStarts::const_iterator iStart = sa.GetSegs().GetDenseg().GetStarts().begin(); CDense_seg::TLens::const_iterator iLen = sa.GetSegs().GetDenseg().GetLens().begin();#ifdef _DEBUG int raw_pssm = 0, raw_bl62 = 0;#endif for (int i=0; i<sa.GetSegs().GetDenseg().GetNumseg(); ++i) { int masterStart = *(iStart++), slaveStart = *(iStart++), len = *(iLen++); if (masterStart >= 0 && slaveStart >= 0 && len > 0) { UngappedAlignedBlock *newBlock = new UngappedAlignedBlock(newAlignment); newBlock->SetRangeOfRow(0, masterStart, masterStart + len - 1); newBlock->SetRangeOfRow(1, slaveStart, slaveStart + len - 1); newBlock->width = len; newAlignment->AddAlignedBlockAtEnd(newBlock);#ifdef _DEBUG // calculate score manually, to compare with that returned by BLAST for (int j=0; j<len; ++j) { if (usePSSM) raw_pssm += BLASTmatrix->matrix [masterStart + j] [LookupBLASTResidueNumberFromCharacter( slaveSeq->sequenceString[slaveStart + j])]; raw_bl62 += GetBLOSUM62Score( masterSeq->sequenceString[masterStart + j], slaveSeq->sequenceString[slaveStart + j]); }#endif }#ifdef _DEBUG else if ((masterStart < 0 || slaveStart < 0) && len > 0) { int gap = options->gap_open + options->gap_extend * len; if (usePSSM) raw_pssm -= gap; raw_bl62 -= gap; }#endif }#ifdef _DEBUG TRACEMSG("Calculated raw score by PSSM: " << raw_pssm);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -