gff_formatter.cpp

来自「ncbi源码」· C++ 代码 · 共 628 行 · 第 1/2 页

CPP
628
字号
/*
 * ===========================================================================
 * PRODUCTION $Log: gff_formatter.cpp,v $
 * PRODUCTION Revision 1000.1  2004/06/01 19:44:46  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.7
 * PRODUCTION
 * ===========================================================================
 */
/*  $Id: gff_formatter.cpp,v 1000.1 2004/06/01 19:44:46 gouriano Exp $
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Author:  Aaron Ucko, NCBI
*          Mati Shomrat
*
* File Description:
*   Implementation of CGFFFormatter: emits "##gff-version 2" output, with
*   GTF-compatible attributes when the fGTFCompat flag is set.
*
*/
#include <ncbi_pch.hpp>
#include <corelib/ncbistd.hpp>

#include <objects/seqfeat/Genetic_code_table.hpp>
#include <objects/general/Date.hpp>
#include <objects/seq/Bioseq.hpp>
#include <objmgr/util/sequence.hpp>
#include <objmgr/util/feature.hpp>
#include <objmgr/seq_vector.hpp>

#include <objtools/format/gff_formatter.hpp>
#include <objtools/format/items/locus_item.hpp>
#include <objtools/format/items/date_item.hpp>
#include <objtools/format/items/feature_item.hpp>
#include <objtools/format/items/basecount_item.hpp>
#include <objtools/format/items/sequence_item.hpp>
#include <objtools/format/items/ctrl_items.hpp>
#include <objtools/format/context.hpp>

#include <algorithm>


BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)


// Construct a formatter whose flags start out as fGTFCompat.
CGFFFormatter::CGFFFormatter(void) :
    m_GFFFlags(CGFFFormatter::fGTFCompat)
{
}


// Emit the file-level header pragmas: format version, producer version,
// and the current date (formatted with "Y-M-D").
void CGFFFormatter::Start(IFlatTextOStream& text_os)
{
    list<string> l;
    l.push_back("##gff-version 2");
    l.push_back("##source-version NCBI C++ formatter 0.1");
    l.push_back("##date " + CurrentTime().AsString("Y-M-D"));
    text_os.AddParagraph(l);
}


// Begin a per-sequence section.
//
// Caches the molecule type name ("DNA", "RNA", "Protein", or empty for any
// other molecule) in m_SeqType, and, when the type is known, emits a
// "##Type <type> <accession>" pragma.
void CGFFFormatter::StartSection
(const CStartSectionItem& ssec,
 IFlatTextOStream& text_os)
{
    list<string> l;
    CBioseqContext& bctx = *ssec.GetContext();

    switch (bctx.GetMol()) {
    case CSeq_inst::eMol_dna:  m_SeqType = "DNA";      break;
    case CSeq_inst::eMol_rna:  m_SeqType = "RNA";      break;
    case CSeq_inst::eMol_aa:   m_SeqType = "Protein";  break;
    default:                   m_SeqType.erase();      break;
    }
    if ( !m_SeqType.empty() ) {
        l.push_back("##Type " + m_SeqType + ' '
                    + bctx.GetAccession());
    }
    // NOTE: AddParagraph is called even when l is empty.
    text_os.AddParagraph(l);
}


// End a per-sequence section: emit the pending end-of-sequence marker
// (set by FormatBasecount), if there is one.
void CGFFFormatter::EndSection
(const CEndSectionItem&,
 IFlatTextOStream& text_os)
{
    if ( !m_EndSequence.empty() ) {
        list<string> l;
        l.push_back(m_EndSequence);
        text_os.AddParagraph(l);
    }
}


// Produces no output; only records the locus strandedness in
// m_Strandedness.
void CGFFFormatter::FormatLocus
(const CLocusItem& locus, 
 IFlatTextOStream& text_os)
{
    m_Strandedness = locus.GetStrand();
}


// Produces no output; only caches the update date (if any) as text in
// m_Date.  m_Date is left empty when the item has no update date.
void CGFFFormatter::FormatDate
(const CDateItem& date,
 IFlatTextOStream& text_os)
{
    m_Date.erase();

    const CDate* d = date.GetUpdateDate();
    if ( d != 0 ) {
        // Missing month/day components are rendered as "??".
        d->GetDate(&m_Date, "%4Y-%{%2M%|??%}-%{%2D%|??%}");
    }
}


///////////////////////////////////////////////////////////////////////////
//
// FEATURES

// Format one feature as one or more GFF lines and hand them to text_os.
//
// In GTF-compatible mode (fGTFCompat):
//   - CDS and exon features on non-protein sequences are flagged as GTF;
//   - RNA features on DNA are re-keyed as "exon", with the original key
//     preserved in a "gbkey" attribute;
//   - "gene" qualifiers on non-protein sequences become gene_id (and, for
//     non-gene features, transcript_id) attributes placed first.
// If neither of the first two cases applies and all fGTFOnly bits are set,
// the feature is skipped.
//
// For GTF coding regions, separate "start_codon" / "stop_codon" features
// are appended when the corresponding codon can be verified.
void CGFFFormatter::FormatFeature
(const CFeatureItemBase& f,
 IFlatTextOStream& text_os)
{
    const CSeq_feat& seqfeat = f.GetFeat();
    string           key(f.GetKey()), oldkey;
    bool             gtf     = false;
    CBioseqContext& ctx = *f.GetContext();
    CScope* scope = &ctx.GetScope();

    if ((m_GFFFlags & fGTFCompat)  &&  !ctx.IsProt()
        &&  (key == "CDS"  ||  key == "exon")) {
        gtf = true;
    } else if ((m_GFFFlags & fGTFCompat)
               &&  ctx.GetMol() == CSeq_inst::eMol_dna
               &&  seqfeat.GetData().IsRna()) {
        oldkey = key;
        key    = "exon";
        gtf    = true;
    } else if ((m_GFFFlags & fGTFOnly) == fGTFOnly) {
        return;
    }

    CConstRef<CFlatFeature> feat = f.Format();
    list<string>  l;
    list<string>  attr_list;

    if ( !oldkey.empty() ) {
        attr_list.push_back("gbkey \"" + oldkey + "\";");
    }

    ITERATE (CFlatFeature::TQuals, it, feat->GetQuals()) {
        string name = (*it)->GetName();
        if (name == "codon_start"  ||  name == "translation"
            ||  name == "transcription") {
            continue; // suppressed to reduce verbosity
        } else if (name == "number"  &&  key == "exon") {
            name = "exon_number";
        } else if ((m_GFFFlags & fGTFCompat)  &&  !ctx.IsProt()
                   &&  name == "gene") {
            string gene_id = x_GetGeneID(*feat, (*it)->GetValue(), ctx);
            if (key != "gene") {
                string transcript_id = x_GetTranscriptID(*feat, gene_id, ctx);
                attr_list.push_front
                    ("transcript_id \"" + transcript_id + "\";");
            }
            // pushed last so that gene_id ends up ahead of transcript_id
            attr_list.push_front("gene_id \"" + gene_id + "\";");
            continue;
        }
        string value;
        NStr::Replace((*it)->GetValue(), " \b", kEmptyStr, value);
        string value2(NStr::PrintableString(value));
        // some parsers may be dumb, so quote further
        value.erase();
        ITERATE (string, c, value2) {
            switch (*c) {
            case ' ':  value += "\\x20"; break;
            case '\"': value += "x22";   break; // already backslashed
            case '#':  value += "\\x23"; break;
            default:   value += *c;
            }
        }
        attr_list.push_back(name + " \"" + value + "\";");
    }
    string attrs(NStr::Join(attr_list, " "));

    string source = x_GetSourceName(ctx);

    // frame stays -1 unless this is a coding region on a non-protein
    // sequence, in which case it is the zero-based CDS frame.
    int frame = -1;
    if (seqfeat.GetData().IsCdregion()  &&  !ctx.IsProt() ) {
        const CCdregion& cds = seqfeat.GetData().GetCdregion();
        frame = max(cds.GetFrame() - 1, 0);
    }

    // Locate a candidate stop codon: the three bases immediately following
    // the translated product, kept only if they translate to "*".
    CRef<CSeq_loc> tentative_stop;
    if (gtf  &&  seqfeat.GetData().IsCdregion()) {
        const CCdregion& cds = seqfeat.GetData().GetCdregion();
        if ( !f.GetLoc().IsPartialRight()  &&  seqfeat.IsSetProduct() ) {
            TSeqPos loc_len = sequence::GetLength(f.GetLoc(), scope);
            TSeqPos prod_len = sequence::GetLength(seqfeat.GetProduct(),
                                                   scope);
            if (loc_len >= frame + 3 * prod_len + 3) {
                // Heap-allocate the range (as the start-codon code below
                // does) rather than wrapping a stack object in a CRef.
                CRef<SRelLoc::TRange> range(new SRelLoc::TRange);
                range->SetFrom(frame + 3 * prod_len);
                range->SetTo  (frame + 3 * prod_len + 2);
                // needs to be partial for TranslateCdregion to DTRT
                range->SetFuzz_from().SetLim(CInt_fuzz::eLim_lt);
                SRelLoc::TRanges ranges;
                ranges.push_back(range);
                tentative_stop = SRelLoc(f.GetLoc(), ranges).Resolve(scope);
            }
            if (tentative_stop.NotEmpty()  &&  !tentative_stop->IsNull()) {
                string s;
                CCdregion_translate::TranslateCdregion
                    (s, ctx.GetHandle(), *tentative_stop, cds);
                if (s != "*") {
                    tentative_stop.Reset();
                }
            } else {
                tentative_stop.Reset();
            }
        }
    }

    x_AddFeature(l, f.GetLoc(), source, key, "." /*score*/, frame, attrs,
                 gtf, ctx, tentative_stop);

    if (gtf  &&  seqfeat.GetData().IsCdregion()) {
        const CCdregion& cds = seqfeat.GetData().GetCdregion();
        if ( !f.GetLoc().IsPartialLeft() ) {
            // Candidate start codon: the first three in-frame bases.
            CRef<CSeq_loc> tentative_start;
            {{
                CRef<SRelLoc::TRange> range(new SRelLoc::TRange);
                SRelLoc::TRanges      ranges;
                range->SetFrom(frame);
                range->SetTo(frame + 2);
                ranges.push_back(range);
                tentative_start = SRelLoc(f.GetLoc(), ranges).Resolve(scope);
            }}

            // Same guard as for the stop codon: do not dereference an
            // empty reference or fetch sequence for a Null location.
            if (tentative_start.NotEmpty()  &&  !tentative_start->IsNull()) {
                string s;
                ctx.GetHandle().GetSequenceView
                    (*tentative_start, CBioseq_Handle::eViewConstructed)
                    .GetSeqData(0, 3, s);

                // Use the CDS's own genetic code if set, else table 1.
                const CTrans_table* tt;
                if (cds.IsSetCode()) {
                    tt = &CGen_code_table::GetTransTable(cds.GetCode());
                } else {
                    tt = &CGen_code_table::GetTransTable(1);
                }

                if (s.size() == 3
                    &&  tt->IsAnyStart(tt->SetCodonState(s[0], s[1], s[2]))) {
                    x_AddFeature(l, *tentative_start, source, "start_codon",
                                 "." /* score */, 0, attrs, gtf, ctx);
                }
            }
        }
        if ( tentative_stop ) {
            x_AddFeature(l, *tentative_stop, source, "stop_codon",
                         "." /* score */, 0, attrs, gtf, ctx);
        }
    }

    text_os.AddParagraph(l, &seqfeat);
}


///////////////////////////////////////////////////////////////////////////
//
// BASE COUNT

// used as a trigger for the sequence header
//
// Does nothing unless fShowSeq is set.  Otherwise emits
// "##<type> <accession>" and arms m_EndSequence with the matching
// "##end-<type>" marker, which EndSection later writes out.
void CGFFFormatter::FormatBasecount
(const CBaseCountItem& bc,
 IFlatTextOStream& text_os)
{
    if ( !(m_GFFFlags & fShowSeq) )
        return;

    CBioseqContext& ctx = *bc.GetContext();

    list<string> l;
    l.push_back("##" + m_SeqType + ' ' + ctx.GetAccession());
    text_os.AddParagraph(l);
    m_EndSequence = "##end-" + m_SeqType;
}


///////////////////////////////////////////////////////////////////////////
//
// SEQUENCE

// NOTE: the remainder of this function lies beyond the visible portion of
// the source; only its opening lines are reproduced here, unchanged.
void CGFFFormatter::FormatSequence
(const CSequenceItem& seq,
 IFlatTextOStream& text_os)
{
    if ( !(m_GFFFlags & fShowSeq) )

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?