📄 splign_app.cpp
字号:
/* * =========================================================================== * PRODUCTION $Log: splign_app.cpp,v $ * PRODUCTION Revision 1000.6 2004/06/01 18:05:26 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.24 * PRODUCTION * =========================================================================== *//* $Id: splign_app.cpp,v 1000.6 2004/06/01 18:05:26 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Author: Yuri Kapustin * * File Description: Splign application * */#include <ncbi_pch.hpp>#include "splign_app.hpp"#include "splign_app_exception.hpp"#include <corelib/ncbistd.hpp>#include <serial/objostrasn.hpp>#include <serial/serial.hpp>#include <algo/align/nw_spliced_aligner16.hpp>#include <algo/align/nw_spliced_aligner32.hpp>#include <algo/align/splign/splign.hpp>#include <algo/align/splign/splign_simple.hpp>#include <algo/align/splign/splign_formatter.hpp>#include <objmgr/object_manager.hpp>#include <objmgr/scope.hpp>#include <objects/seq/Bioseq.hpp>#include <objects/seqalign/Seq_align.hpp>#include <objects/seqloc/Seq_loc.hpp>#include <algo/blast/api/bl2seq.hpp>#include <iostream>#include <memory>BEGIN_NCBI_SCOPEUSING_SCOPE(objects);const char kQuality_high[] = "high";const char kQuality_low[] = "low";void CSplignApp::Init(){ HideStdArgs( fHideLogfile | fHideConffile | fHideVersion); auto_ptr<CArgDescriptions> argdescr(new CArgDescriptions); string program_name ("Splign v.1.06");#ifdef GENOME_PIPELINE program_name += 'p';#endif argdescr->SetUsageContext(GetArguments().GetProgramName(), program_name); argdescr->AddOptionalKey ("index", "index", "Batch mode index file (use -mkidx to generate).", CArgDescriptions::eString); argdescr->AddFlag ("mkidx", "Generate batch mode index and quit.", true); argdescr->AddOptionalKey ("hits", "hits", "Batch mode hit file. " "This file defines the set of sequences to align and " "is also used to guide alignments.", CArgDescriptions::eString); argdescr->AddOptionalKey ("query", "query", "FastA file with the spliced sequence. Use in pairwise mode only.", CArgDescriptions::eString); argdescr->AddOptionalKey ("subj", "subj", "FastA file with the genomic sequence. Use in pairwise mode only.", CArgDescriptions::eString); argdescr->AddDefaultKey ("log", "log", "Splign log file", CArgDescriptions::eString, "splign.log"); argdescr->AddOptionalKey ("asn", "asn", "ASN.1 output file name", CArgDescriptions::eString); argdescr->AddDefaultKey ("strand", "strand", "Spliced sequence's strand.", CArgDescriptions::eString, "plus"); argdescr->AddFlag ("noendgaps", "Skip detection of unaligning regions at the ends.", true); argdescr->AddFlag ("nopolya", "Assume no Poly(A) tail.", true); argdescr->AddDefaultKey ("compartment_penalty", "compartment_penalty", "Penalty to open a new compartment.", CArgDescriptions::eDouble, "0.75"); argdescr->AddDefaultKey ("min_query_cov", "min_query_cov", "Min query coverage by initial Blast hits. " "Queries with less coverage will not be aligned.", CArgDescriptions::eDouble, "0.25"); argdescr->AddDefaultKey ("min_idty", "identity", "Minimal exon identity. Lower identity segments " "will be marked as gaps.", CArgDescriptions::eDouble, "0.75");#ifdef GENOME_PIPELINE argdescr->AddDefaultKey ("quality", "quality", "Genomic sequence quality.", CArgDescriptions::eString, kQuality_high); argdescr->AddDefaultKey ("Wm", "match", "match score", CArgDescriptions::eInteger, NStr::IntToString(CNWAligner::GetDefaultWm()).c_str()); argdescr->AddDefaultKey ("Wms", "mismatch", "mismatch score", CArgDescriptions::eInteger, NStr::IntToString(CNWAligner::GetDefaultWms()).c_str()); argdescr->AddDefaultKey ("Wg", "gap", "gap opening score", CArgDescriptions::eInteger, NStr::IntToString(CNWAligner::GetDefaultWg()).c_str()); argdescr->AddDefaultKey ("Ws", "space", "gap extension (space) score", CArgDescriptions::eInteger, NStr::IntToString(CNWAligner::GetDefaultWs()).c_str()); argdescr->AddDefaultKey ("Wi0", "Wi0", "Conventional intron (GT/AG) score", CArgDescriptions::eInteger, NStr::IntToString(CSplicedAligner16::GetDefaultWi(0)).c_str()); argdescr->AddDefaultKey ("Wi1", "Wi1", "Conventional intron (GC/AG) score", CArgDescriptions::eInteger, NStr::IntToString(CSplicedAligner16::GetDefaultWi(1)).c_str()); argdescr->AddDefaultKey ("Wi2", "Wi2", "Conventional intron (AT/AC) score", CArgDescriptions::eInteger, NStr::IntToString(CSplicedAligner16::GetDefaultWi(2)).c_str()); argdescr->AddDefaultKey ("Wi3", "Wi3", "Arbitrary intron score", CArgDescriptions::eInteger, NStr::IntToString(CSplicedAligner16::GetDefaultWi(3)).c_str()); // restrictions CArgAllow_Strings* constrain_errlevel = new CArgAllow_Strings; constrain_errlevel->Allow(kQuality_low)->Allow(kQuality_high); argdescr->SetConstraint("quality", constrain_errlevel); #endif CArgAllow_Strings* constrain_strand = new CArgAllow_Strings; constrain_strand->Allow("plus")->Allow("minus"); argdescr->SetConstraint("strand", constrain_strand); CArgAllow* constrain01 = new CArgAllow_Doubles(0,1); argdescr->SetConstraint("min_idty", constrain01); argdescr->SetConstraint("compartment_penalty", constrain01); argdescr->SetConstraint("min_query_cov", constrain01); SetupArgDescriptions(argdescr.release());}bool CSplignApp::x_GetNextPair(istream* ifs, vector<CHit>* hits){ hits->clear(); if(!m_pending.size() && (!ifs || !*ifs) ) { return false; } if(!m_pending.size()) { string query, subj; if(m_firstline.size()) { CHit hit (m_firstline.c_str()); query = hit.m_Query; subj = hit.m_Subj; m_pending.push_back(hit); } char buf [1024]; while(ifs) { buf[0] = 0; CT_POS_TYPE pos0 = ifs->tellg(); ifs->getline(buf, sizeof buf, '\n'); CT_POS_TYPE pos1 = ifs->tellg(); if(pos1 == pos0) break; // GCC hack if(buf[0] == '#') continue; // skip comments const char* p = buf; // skip leading spaces while(*p == ' ' || *p == '\t') ++p; if(*p == 0) continue; // skip empty lines CHit hit (p); if(query.size() == 0) { query = hit.m_Query; } if(subj.size() == 0) { subj = hit.m_Subj; } if(hit.m_ai[0] > hit.m_ai[1]) { NCBI_THROW(CSplignAppException, eBadData, "Hit with reversed query coordinates detected." ); } if(hit.m_ai[2] == hit.m_ai[3]) { // skip single bases continue; } if(hit.m_Query != query || hit.m_Subj != subj) { m_firstline = p; break; } m_pending.push_back(hit); } } const size_t pending_size = m_pending.size(); if(pending_size) { const string& query = m_pending[0].m_Query; const string& subj = m_pending[0].m_Subj; size_t i = 1; for(; i < pending_size; ++i) { const CHit& h = m_pending[i]; if(h.m_Query != query || h.m_Subj != subj) { break; } } hits->resize(i); copy(m_pending.begin(), m_pending.begin() + i, hits->begin()); m_pending.erase(m_pending.begin(), m_pending.begin() + i); } return hits->size() > 0;}void CSplignApp::x_LogStatus(size_t model_id, const string& query, const string& subj, bool error, const string& msg){ string error_tag (error? "Error: ": ""); if(model_id == 0) { m_logstream << '-'; } else { m_logstream << model_id; } m_logstream << '\t' << query << '\t' << subj << '\t' << error_tag << msg << endl;}istream* CSplignApp::x_GetPairwiseHitStream( CSeqLoaderPairwise& seq_loader) const{ vector<char> query, subj; seq_loader.Load(seq_loader.GetQueryStringId(), &query, 0, kMax_UInt); seq_loader.Load(seq_loader.GetSubjStringId(), &subj, 0, kMax_UInt); CRef<CObjectManager> objmgr(new CObjectManager); CRef<CSeq_loc> seqloc_query; CRef<CScope> scope_query; {{ scope_query.Reset(new CScope(*objmgr)); scope_query->AddDefaults(); CRef<CSeq_entry> se (seq_loader.GetQuerySeqEntry()); scope_query->AddTopLevelSeqEntry(*se); seqloc_query.Reset(new CSeq_loc); seqloc_query->SetWhole().Assign(*(se->GetSeq().GetId().front())); }} CRef<CSeq_loc> seqloc_subj; CRef<CScope> scope_subj; {{ scope_subj.Reset(new CScope(*objmgr)); scope_subj->AddDefaults(); CRef<CSeq_entry> se (seq_loader.GetSubjSeqEntry()); scope_subj->AddTopLevelSeqEntry(*se); seqloc_subj.Reset(new CSeq_loc); seqloc_subj->SetWhole().Assign(*(se->GetSeq().GetId().front())); }} blast::CBl2Seq Blast(blast::SSeqLoc(seqloc_query.GetNonNullPointer(), scope_query.GetNonNullPointer()), blast::SSeqLoc(seqloc_subj.GetNonNullPointer(), scope_subj.GetNonNullPointer()), blast::eMegablast); blast::TSeqAlignVector blast_output (Blast.Run()); if (!blast_output.empty() && blast_output.front().NotEmpty() &&
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -