⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dust_app.cpp

📁 ncbi源码
💻 CPP
字号:
/* * =========================================================================== * PRODUCTION $Log: dust_app.cpp,v $ * PRODUCTION Revision 1000.2  2004/06/01 18:06:43  gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.4 * PRODUCTION * =========================================================================== *//*  $Id: dust_app.cpp,v 1000.2 2004/06/01 18:06:43 gouriano Exp $ * =========================================================================== * *                            PUBLIC DOMAIN NOTICE *               National Center for Biotechnology Information * *  This software/database is a "United States Government Work" under the *  terms of the United States Copyright Act.  It was written as part of *  the author's official duties as a United States Government employee and *  thus cannot be copyrighted.  This software/database is freely available *  to the public for use. The National Library of Medicine and the U.S. *  Government have not placed any restriction on its use or reproduction. * *  Although all reasonable efforts have been taken to ensure the accuracy *  and reliability of the software and data, the NLM and the U.S. *  Government do not and cannot warrant the performance or results that *  may be obtained by using this software or data. The NLM and the U.S. *  Government disclaim all warranties, express or implied, including *  warranties of performance, merchantability or fitness for any particular *  purpose. * *  Please cite the author in any work or product based on this material. * * =========================================================================== * * Authors:  Richa Agarwala * * File Description: *   Program to read nucleotide FASTA and produce DUST'd version. */#include <ncbi_pch.hpp>#include <corelib/ncbiapp.hpp>#include <corelib/ncbistr.hpp>#include <serial/serial.hpp>#include <serial/objistr.hpp>#include <serial/objostr.hpp>#include <serial/iterator.hpp>// Objects includes#include <corelib/ncbidbg.hpp>#include <objects/seq/Bioseq.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/Seq_data.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seqdesc.hpp>#include <objects/seq/seqport_util.hpp>#include <objects/seqloc/Seq_id.hpp>#include <objects/seqloc/Seq_interval.hpp>#include <objects/seqloc/Seq_loc.hpp>#include <objects/seqset/Seq_entry.hpp>#include <algo/blast/core/blast_dust.h>#include <algo/blast/core/blast_filter.h>#include <objtools/readers/fasta.hpp> #define FASTA_LEN 60USING_NCBI_SCOPE;using namespace objects;///////////////////////////////////////////////////////////////////////////////  CDustApplication::class CDustApplication : public CNcbiApplication{public:    static const char * const USAGE_LINE;private:    virtual void Init(void);    virtual int  Run(void);    virtual void Exit(void);};///////////////////////////////////////////////////////////////////////////////  Usage descriptionconst char * constCDustApplication::USAGE_LINE = "Dust nucleotide sequence";///////////////////////////////////////////////////////////////////////////////  Get argumentsvoid CDustApplication::Init(void){    // Create command-line argument descriptions class    auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);    // Specify USAGE context    arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),                              USAGE_LINE);    // Describe the expected command-line arguments    arg_desc->AddKey("input", "Inputfile",                        "The input file", CArgDescriptions::eInputFile);    arg_desc->AddKey("output", "Outputfile",                        "The output file", CArgDescriptions::eOutputFile);    arg_desc->AddDefaultKey( "window", "window_size", "window size",                             CArgDescriptions::eInteger, "64" );    arg_desc->AddDefaultKey( "level", "level", "minimum level",                             CArgDescriptions::eInteger, "20" );    arg_desc->AddDefaultKey( "minwin", "minwin", "minimum window size reported",                             CArgDescriptions::eInteger, "4" );    arg_desc->AddDefaultKey( "linker", "linker", "link windows apart by <linker> bp",                             CArgDescriptions::eInteger, "1" );    arg_desc->AddDefaultKey("segments", "segments", "Print segments instead of FASTA", CArgDescriptions::eBoolean, "F");    // Setup arg.descriptions for this application    SetupArgDescriptions(arg_desc.release());}static string GetFastaDefline(const CBioseq & bioseq){    string seq_id_description =            CSeq_id::GetStringDescr(bioseq, CSeq_id::eFormat_FastA);    if ((seq_id_description.size() < 5) ||        (NStr::CompareCase(seq_id_description, 0, 4, "lcl|") != 0))    {        NcbiCerr << "Input Bioseq was not generated as a local entry" << endl;        exit(0);    }    // Take "lcl|" out of id and prefix it with ">"    string defline = ">"+ seq_id_description.substr(4,seq_id_description.size()-4);    // Add title if present    if (bioseq.CanGetDescr())    {        const CSeq_descr & title_seq = bioseq.GetDescr();        if (title_seq.CanGet())        {            const CSeq_descr_Base::Tdata & title = title_seq.Get();            if (!title.empty())            {                CSeq_descr_Base::Tdata::const_iterator it = title.begin();                CRef<CSeqdesc> ref_desc = *it;                if (it != title.end())                {                    defline.append(1, ' '); // space between id and title                    ref_desc->GetLabel(&defline, CSeqdesc::eContent);                }            }        }    }    return defline;}int CDustApplication::Run(void){    CRef<CSeq_entry> seq_entry;    TSeqPos i, start, end;    Uint1 *sequence;                // Sequnce in BLASTNA format    BlastSeqLoc* dust_loc = NULL;   // Regions that get DUST'd    BlastSeqLoc* temp_loc = NULL;       // Get arguments    CArgs args = GetArgs();    Int2 window = args["window"].AsInteger();    Int2 level = args["level"].AsInteger();    Int2 minwin = args["minwin"].AsInteger();    Int2 linker = args["linker"].AsInteger();    Boolean PrintSegments = args["segments"].AsBoolean();    // Set flags for reading data    TReadFastaFlags flags = 0;    flags = fReadFasta_ForceType;    flags |= fReadFasta_AssumeNuc;    flags |= fReadFasta_OneSeq;    flags |= fReadFasta_NoParseID;    // Open files    CNcbiIfstream input_stream(args["input"].AsString().c_str(), IOS_BASE::in);    CNcbiOfstream output_stream(args["output"].AsString().c_str(), IOS_BASE::out);    // Read data    int counter = 0;    while (!input_stream.eof())    {        // Get one sequence from Fasta file        vector<CConstRef<CSeq_loc> > lcase_mask;        seq_entry = ReadFasta(input_stream, flags, &counter, &lcase_mask);        if (seq_entry->IsSeq() && seq_entry->GetSeq().IsNa())        {            const CBioseq & bioseq = seq_entry->GetSeq();            if (bioseq.CanGetInst() &&                bioseq.GetInst().CanGetLength() &&                bioseq.GetInst().CanGetSeq_data() )            {                // Get sequence id                string seq_id_description =                        CSeq_id::GetStringDescr(bioseq, CSeq_id::eFormat_FastA);                // Get defline                string defline = GetFastaDefline(bioseq);                    // Get sequence length                TSeqPos seq_length = bioseq.GetInst().GetLength();                    // Get nucleotide data for sequence                const CSeq_data & seqdata = bioseq.GetInst().GetSeq_data();                // Convert to Iupacna so don't have to worry about packing                auto_ptr< CSeq_data > dest( new CSeq_data );                CSeqportUtil::Convert(seqdata, dest.get(), CSeq_data::e_Iupacna,                                      0, seq_length);                const string & data = dest->GetIupacna().Get();                // Convert nucleotides to BLASTNA format                sequence = (Uint1*) malloc(sizeof(Uint1)*seq_length);                if (sequence == NULL)                {                    NcbiCerr << "No space for storing sequence" << endl;                    exit(0);                }                for (i = 0; i < seq_length; i++)                    switch (data[i]) {                        case 'a': case 'A': sequence[i] = 0; break;                        case 'c': case 'C': sequence[i] = 1; break;                        case 'g': case 'G': sequence[i] = 2; break;                        case 't': case 'T': sequence[i] = 3; break;                        case 'r': case 'R': sequence[i] = 4; break;                        case 'y': case 'Y': sequence[i] = 5; break;                        case 'm': case 'M': sequence[i] = 6; break;                        case 'k': case 'K': sequence[i] = 7; break;                        case 'w': case 'W': sequence[i] = 8; break;                        case 's': case 'S': sequence[i] = 9; break;                        case 'b': case 'B': sequence[i] = 10; break;                        case 'd': case 'D': sequence[i] = 11; break;                        case 'h': case 'H': sequence[i] = 12; break;                        case 'v': case 'V': sequence[i] = 13; break;                        case 'n': case 'N': sequence[i] = 14; break;                        default: sequence[i] = 14;                                 NcbiCerr << "Warning: Position " << i+1 ;                                 NcbiCerr << " of sequence " << seq_id_description;                                 NcbiCerr << " is and invalid residue; treating it as N" << endl;                    }                // Do dusting                dust_loc = NULL;                SeqBufferDust(sequence, seq_length, 0, level, window,                               minwin, linker, &dust_loc);                if (PrintSegments)                {                    for (temp_loc = dust_loc; temp_loc; temp_loc = temp_loc->next)                    {                        start = (TSeqPos) ((SSeqRange *) temp_loc->ptr)->left;                        end = (TSeqPos) ((SSeqRange *) temp_loc->ptr)->right;                        output_stream << seq_id_description << "\t";                        output_stream << start+1 << "\t" << end+1 << endl;                    }                }                else                {                    output_stream << defline << endl;                    // Get copy of (const) data and mask DUST locations found                    string copy_data = data;                    for (temp_loc = dust_loc; temp_loc; temp_loc = temp_loc->next)                    {                        start = (TSeqPos) ((SSeqRange *) temp_loc->ptr)->left;                        end = (TSeqPos) ((SSeqRange *) temp_loc->ptr)->right;                        for (; start <= end; start++)                            copy_data[start] = tolower(copy_data[start]);                    }                    // Get lowercase positions                    if (lcase_mask[0].NotEmpty())                    {                        CConstRef<CSeq_loc> lowercase = lcase_mask[0];                        if (lowercase->IsPacked_int())                            ITERATE(list< CRef<CSeq_interval> >, itr,                                     lowercase->GetPacked_int().Get())                            {                                start = (TSeqPos) (*itr)->GetFrom();                                end = (TSeqPos) (*itr)->GetTo();                                for (; start < end; start++)                                    copy_data[start] = tolower(copy_data[start]);                            }                    }                    // Print result                    start = 0;                    TSeqPos copy_size = copy_data.size();                    while (start < copy_size)                    {                        end = ((start+FASTA_LEN) >= copy_size) ? copy_size : start+FASTA_LEN;                        output_stream << copy_data.substr(start, end-start) << endl;                        start = end;                    }                }                    free(sequence);                BlastSeqLocFree(dust_loc);            }        }    }    return 0;}///////////////////////////////////////////////////////////////////////////////  Cleanupvoid CDustApplication::Exit(void){    SetDiagStream(0);}///////////////////////////////////////////////////////////////////////////////  MAINint main(int argc, const char* argv[]){    // Execute main application function    return CDustApplication().AppMain(argc, argv, 0, eDS_Default, 0);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -