📄 dust_app.cpp
字号:
/* * =========================================================================== * PRODUCTION $Log: dust_app.cpp,v $ * PRODUCTION Revision 1000.2 2004/06/01 18:06:43 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.4 * PRODUCTION * =========================================================================== *//* $Id: dust_app.cpp,v 1000.2 2004/06/01 18:06:43 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Authors: Richa Agarwala * * File Description: * Program to read nucleotide FASTA and produce DUST'd version. */#include <ncbi_pch.hpp>#include <corelib/ncbiapp.hpp>#include <corelib/ncbistr.hpp>#include <serial/serial.hpp>#include <serial/objistr.hpp>#include <serial/objostr.hpp>#include <serial/iterator.hpp>// Objects includes#include <corelib/ncbidbg.hpp>#include <objects/seq/Bioseq.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/Seq_data.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seqdesc.hpp>#include <objects/seq/seqport_util.hpp>#include <objects/seqloc/Seq_id.hpp>#include <objects/seqloc/Seq_interval.hpp>#include <objects/seqloc/Seq_loc.hpp>#include <objects/seqset/Seq_entry.hpp>#include <algo/blast/core/blast_dust.h>#include <algo/blast/core/blast_filter.h>#include <objtools/readers/fasta.hpp> #define FASTA_LEN 60USING_NCBI_SCOPE;using namespace objects;/////////////////////////////////////////////////////////////////////////////// CDustApplication::class CDustApplication : public CNcbiApplication{public: static const char * const USAGE_LINE;private: virtual void Init(void); virtual int Run(void); virtual void Exit(void);};/////////////////////////////////////////////////////////////////////////////// Usage descriptionconst char * constCDustApplication::USAGE_LINE = "Dust nucleotide sequence";/////////////////////////////////////////////////////////////////////////////// Get argumentsvoid CDustApplication::Init(void){ // Create command-line argument descriptions class auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions); // Specify USAGE context arg_desc->SetUsageContext(GetArguments().GetProgramBasename(), USAGE_LINE); // Describe the expected command-line arguments arg_desc->AddKey("input", "Inputfile", "The input file", CArgDescriptions::eInputFile); arg_desc->AddKey("output", "Outputfile", "The output file", CArgDescriptions::eOutputFile); arg_desc->AddDefaultKey( "window", "window_size", "window size", CArgDescriptions::eInteger, "64" ); arg_desc->AddDefaultKey( "level", "level", "minimum level", CArgDescriptions::eInteger, "20" ); arg_desc->AddDefaultKey( "minwin", "minwin", "minimum window size reported", CArgDescriptions::eInteger, "4" ); arg_desc->AddDefaultKey( "linker", "linker", "link windows apart by <linker> bp", CArgDescriptions::eInteger, "1" ); arg_desc->AddDefaultKey("segments", "segments", "Print segments instead of FASTA", CArgDescriptions::eBoolean, "F"); // Setup arg.descriptions for this application SetupArgDescriptions(arg_desc.release());}static string GetFastaDefline(const CBioseq & bioseq){ string seq_id_description = CSeq_id::GetStringDescr(bioseq, CSeq_id::eFormat_FastA); if ((seq_id_description.size() < 5) || (NStr::CompareCase(seq_id_description, 0, 4, "lcl|") != 0)) { NcbiCerr << "Input Bioseq was not generated as a local entry" << endl; exit(0); } // Take "lcl|" out of id and prefix it with ">" string defline = ">"+ seq_id_description.substr(4,seq_id_description.size()-4); // Add title if present if (bioseq.CanGetDescr()) { const CSeq_descr & title_seq = bioseq.GetDescr(); if (title_seq.CanGet()) { const CSeq_descr_Base::Tdata & title = title_seq.Get(); if (!title.empty()) { CSeq_descr_Base::Tdata::const_iterator it = title.begin(); CRef<CSeqdesc> ref_desc = *it; if (it != title.end()) { defline.append(1, ' '); // space between id and title ref_desc->GetLabel(&defline, CSeqdesc::eContent); } } } } return defline;}int CDustApplication::Run(void){ CRef<CSeq_entry> seq_entry; TSeqPos i, start, end; Uint1 *sequence; // Sequnce in BLASTNA format BlastSeqLoc* dust_loc = NULL; // Regions that get DUST'd BlastSeqLoc* temp_loc = NULL; // Get arguments CArgs args = GetArgs(); Int2 window = args["window"].AsInteger(); Int2 level = args["level"].AsInteger(); Int2 minwin = args["minwin"].AsInteger(); Int2 linker = args["linker"].AsInteger(); Boolean PrintSegments = args["segments"].AsBoolean(); // Set flags for reading data TReadFastaFlags flags = 0; flags = fReadFasta_ForceType; flags |= fReadFasta_AssumeNuc; flags |= fReadFasta_OneSeq; flags |= fReadFasta_NoParseID; // Open files CNcbiIfstream input_stream(args["input"].AsString().c_str(), IOS_BASE::in); CNcbiOfstream output_stream(args["output"].AsString().c_str(), IOS_BASE::out); // Read data int counter = 0; while (!input_stream.eof()) { // Get one sequence from Fasta file vector<CConstRef<CSeq_loc> > lcase_mask; seq_entry = ReadFasta(input_stream, flags, &counter, &lcase_mask); if (seq_entry->IsSeq() && seq_entry->GetSeq().IsNa()) { const CBioseq & bioseq = seq_entry->GetSeq(); if (bioseq.CanGetInst() && bioseq.GetInst().CanGetLength() && bioseq.GetInst().CanGetSeq_data() ) { // Get sequence id string seq_id_description = CSeq_id::GetStringDescr(bioseq, CSeq_id::eFormat_FastA); // Get defline string defline = GetFastaDefline(bioseq); // Get sequence length TSeqPos seq_length = bioseq.GetInst().GetLength(); // Get nucleotide data for sequence const CSeq_data & seqdata = bioseq.GetInst().GetSeq_data(); // Convert to Iupacna so don't have to worry about packing auto_ptr< CSeq_data > dest( new CSeq_data ); CSeqportUtil::Convert(seqdata, dest.get(), CSeq_data::e_Iupacna, 0, seq_length); const string & data = dest->GetIupacna().Get(); // Convert nucleotides to BLASTNA format sequence = (Uint1*) malloc(sizeof(Uint1)*seq_length); if (sequence == NULL) { NcbiCerr << "No space for storing sequence" << endl; exit(0); } for (i = 0; i < seq_length; i++) switch (data[i]) { case 'a': case 'A': sequence[i] = 0; break; case 'c': case 'C': sequence[i] = 1; break; case 'g': case 'G': sequence[i] = 2; break; case 't': case 'T': sequence[i] = 3; break; case 'r': case 'R': sequence[i] = 4; break; case 'y': case 'Y': sequence[i] = 5; break; case 'm': case 'M': sequence[i] = 6; break; case 'k': case 'K': sequence[i] = 7; break; case 'w': case 'W': sequence[i] = 8; break; case 's': case 'S': sequence[i] = 9; break; case 'b': case 'B': sequence[i] = 10; break; case 'd': case 'D': sequence[i] = 11; break; case 'h': case 'H': sequence[i] = 12; break; case 'v': case 'V': sequence[i] = 13; break; case 'n': case 'N': sequence[i] = 14; break; default: sequence[i] = 14; NcbiCerr << "Warning: Position " << i+1 ; NcbiCerr << " of sequence " << seq_id_description; NcbiCerr << " is and invalid residue; treating it as N" << endl; } // Do dusting dust_loc = NULL; SeqBufferDust(sequence, seq_length, 0, level, window, minwin, linker, &dust_loc); if (PrintSegments) { for (temp_loc = dust_loc; temp_loc; temp_loc = temp_loc->next) { start = (TSeqPos) ((SSeqRange *) temp_loc->ptr)->left; end = (TSeqPos) ((SSeqRange *) temp_loc->ptr)->right; output_stream << seq_id_description << "\t"; output_stream << start+1 << "\t" << end+1 << endl; } } else { output_stream << defline << endl; // Get copy of (const) data and mask DUST locations found string copy_data = data; for (temp_loc = dust_loc; temp_loc; temp_loc = temp_loc->next) { start = (TSeqPos) ((SSeqRange *) temp_loc->ptr)->left; end = (TSeqPos) ((SSeqRange *) temp_loc->ptr)->right; for (; start <= end; start++) copy_data[start] = tolower(copy_data[start]); } // Get lowercase positions if (lcase_mask[0].NotEmpty()) { CConstRef<CSeq_loc> lowercase = lcase_mask[0]; if (lowercase->IsPacked_int()) ITERATE(list< CRef<CSeq_interval> >, itr, lowercase->GetPacked_int().Get()) { start = (TSeqPos) (*itr)->GetFrom(); end = (TSeqPos) (*itr)->GetTo(); for (; start < end; start++) copy_data[start] = tolower(copy_data[start]); } } // Print result start = 0; TSeqPos copy_size = copy_data.size(); while (start < copy_size) { end = ((start+FASTA_LEN) >= copy_size) ? copy_size : start+FASTA_LEN; output_stream << copy_data.substr(start, end-start) << endl; start = end; } } free(sequence); BlastSeqLocFree(dust_loc); } } } return 0;}/////////////////////////////////////////////////////////////////////////////// Cleanupvoid CDustApplication::Exit(void){ SetDiagStream(0);}/////////////////////////////////////////////////////////////////////////////// MAINint main(int argc, const char* argv[]){ // Execute main application function return CDustApplication().AppMain(argc, argv, 0, eDS_Default, 0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -