gff_reader.cpp

来自「ncbi源码」· C++ 代码 · 共 769 行 · 第 1/2 页

CPP
769
字号
/* * =========================================================================== * PRODUCTION $Log: gff_reader.cpp,v $ * PRODUCTION Revision 1000.1  2004/06/01 19:46:20  gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6 * PRODUCTION * =========================================================================== *//*  $Id: gff_reader.cpp,v 1000.1 2004/06/01 19:46:20 gouriano Exp $* ===========================================================================**                            PUBLIC DOMAIN NOTICE*               National Center for Biotechnology Information**  This software/database is a "United States Government Work" under the*  terms of the United States Copyright Act.  It was written as part of*  the author's official duties as a United States Government employee and*  thus cannot be copyrighted.  This software/database is freely available*  to the public for use. The National Library of Medicine and the U.S.*  Government have not placed any restriction on its use or reproduction.**  Although all reasonable efforts have been taken to ensure the accuracy*  and reliability of the software and data, the NLM and the U.S.*  Government do not and cannot warrant the performance or results that*  may be obtained by using this software or data. The NLM and the U.S.*  Government disclaim all warranties, express or implied, including*  warranties of performance, merchantability or fitness for any particular*  purpose.**  Please cite the author in any work or product based on this material.** ===========================================================================** Authors:  Aaron Ucko, Wratko Hlavina** File Description:*   Reader for GFF (including GTF) files.** ===========================================================================*/#include <ncbi_pch.hpp>#include <objtools/readers/gff_reader.hpp>#include <corelib/ncbitime.hpp>#include <corelib/ncbiutil.hpp>#include <serial/iterator.hpp>#include <objects/general/Date.hpp>#include <objects/seq/Seq_annot.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seqdesc.hpp>#include <objects/seqfeat/Cdregion.hpp>#include <objects/seqfeat/Gb_qual.hpp>#include <objects/seqloc/Seq_interval.hpp>#include <objects/seqloc/Seq_point.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objtools/readers/readfeat.hpp>#include <algorithm>#include <ctype.h>BEGIN_NCBI_SCOPEBEGIN_SCOPE(objects)CRef<CSeq_entry> CGFFReader::Read(CNcbiIstream& in, TFlags flags){    x_Reset();    m_Flags  = flags;    m_Stream = &in;    string line;    while (x_GetNextLine(line)) {        if (NStr::StartsWith(line, "##")) {            x_ParseStructuredComment(line);        } else if (NStr::StartsWith(line, "#")) {            // regular comment; ignore        } else {            CRef<SRecord> record = x_ParseFeatureInterval(line);            if (record) {                string id = x_FeatureID(*record);                if (id.empty()) {                    x_PlaceFeature(*x_ParseRecord(*record), *record);                } else {                    CRef<SRecord>& match = m_DelayedFeats[id];                    if (match) {                        x_MergeRecords(*match, *record);                    } else {                        match.Reset(record);                    }                }            }        }    }    ITERATE (TDelayedFeats, it, m_DelayedFeats) {        x_PlaceFeature(*x_ParseRecord(*it->second), *it->second);    }    CRef<CSeq_entry> tse(m_TSE); // need to save before resetting.    x_Reset();    return tse;}void CGFFReader::x_Warn(const string& message, unsigned int line){    if (line) {        ERR_POST(Warning << message << " [GFF input, line " << line << ']');    } else {        ERR_POST(Warning << message << " [GFF input]");    }}void CGFFReader::x_Reset(void){    m_TSE.Reset(new CSeq_entry);    m_SeqNameCache.clear();    m_SeqCache.clear();    m_DelayedFeats.clear();    m_DefMol.erase();    m_LineNumber = 0;}bool CGFFReader::x_GetNextLine(string& line){    ++m_LineNumber;    return NcbiGetlineEOL(*m_Stream, line)  ||  !line.empty();}void CGFFReader::x_ParseStructuredComment(const string& line){    vector<string> v;    NStr::Tokenize(line, "# \t", v, NStr::eMergeDelims);    if (v[0] == "date"  &&  v.size() > 1) {        x_ParseDateComment(v[1]);    } else if (v[0] == "type"  &&  v.size() > 1) {        x_ParseTypeComment(v[1], v.size() > 2 ? v[2] : kEmptyStr);    }    // etc.}void CGFFReader::x_ParseDateComment(const string& date){    try {        CRef<CSeqdesc> desc(new CSeqdesc);        desc->SetUpdate_date().SetToTime(CTime(date, "Y-M-D"),                                         CDate::ePrecision_day);        m_TSE->SetSet().SetDescr().Set().push_back(desc);    } catch (CTimeException& e) {        x_Warn(string("Bad ISO date: ") + e.what(), x_GetLineNumber());    }}void CGFFReader::x_ParseTypeComment(const string& moltype,                                    const string& seqname){    if (seqname.empty()) {        m_DefMol = moltype;    } else {        // automatically adds to m_TSE if new        x_ResolveID(*x_ResolveSeqName(seqname), moltype);    }}CRef<CGFFReader::SRecord>CGFFReader::x_ParseFeatureInterval(const string& line){    vector<string> v;    bool           misdelimited = false;    NStr::Tokenize(line, "\t", v, NStr::eNoMergeDelims);    if (v.size() < 8) {        NStr::Tokenize(line, " \t", v, NStr::eMergeDelims);        if (v.size() < 8) {            x_Warn("Skipping line due to insufficient fields",                   x_GetLineNumber());            return null;        } else {            x_Warn("Bad delimiters (should use tabs)", x_GetLineNumber());            misdelimited = true;        }    } else {        // XXX - warn about extra fields (if any), but only if they're        // not comments        v.resize(9);    }    CRef<SRecord> record(x_NewRecord());    string        accession;    TSeqPos       from = 0, to = numeric_limits<TSeqPos>::max();    ENa_strand    strand = eNa_strand_unknown;    accession      = v[0];    record->source = v[1];    record->key    = v[2];    try {        from = NStr::StringToUInt(v[3]) - 1;    } catch (std::exception& e) {        x_Warn(string("Bad FROM position: ") + e.what(), x_GetLineNumber());    }    try {        to = NStr::StringToUInt(v[4]) - 1;    } catch (std::exception& e) {        x_Warn(string("Bad TO position: ") + e.what(), x_GetLineNumber());    }    record->score = v[5];    if (v[6] == "+") {        strand = eNa_strand_plus;    } else if (v[6] == "-") {        strand = eNa_strand_minus;    } else if (v[6] != ".") {        x_Warn("Bad strand " + v[6] + " (should be [+-.])", x_GetLineNumber());    }    if (v[7] == "0"  ||  v[7] == "1"  ||  v[7] == "2") {        record->frame = v[7][0] - '0';    } else if (v[7] == ".") {        record->frame = -1;    } else {        x_Warn("Bad frame " + v[7] + " (should be [012.])", x_GetLineNumber());        record->frame = -1;    }    {{        SRecord::SSubLoc subloc;        subloc.accession = accession;        subloc.strand    = strand;        subloc.ranges.insert(TSeqRange(from, to));        record->loc.push_back(subloc);    }}       string         attr_last_value;    vector<string> attr_values;    char           quote_char = 0;    for (SIZE_TYPE i = 8;  i < v.size();  ++i) {        string s = v[i] + ' ';        if ( !misdelimited  &&  i > 8) {            SIZE_TYPE pos = s.find_first_not_of(" ");            if (pos != NPOS) {                if (s[pos] == '#') {                    break;                } else {                    x_Warn("Extra non-comment fields", x_GetLineNumber());                }            } else {                continue; // just spaces anyway...            }        }        SIZE_TYPE pos = 0;        while (pos < s.size()) {            SIZE_TYPE pos2;            if (quote_char) { // must be inside a value                pos2 = s.find_first_of(" \'\"\\", pos);                _ASSERT(pos2 != NPOS); // due to trailing space                if (s[pos2] == quote_char) {                    if (attr_values.empty()) {                        x_Warn("quoted attribute tag " + attr_last_value,                               x_GetLineNumber());                    }                    quote_char = 0;                    attr_last_value += s.substr(pos, pos2 - pos);                    try {                        attr_values.push_back(NStr::ParseEscapes                                              (attr_last_value));                    } catch (CStringException& e) {                        attr_values.push_back(attr_last_value);                        x_Warn(e.what() + (" in value of " + attr_values[0]),                               x_GetLineNumber());                    }                    attr_last_value.erase();                } else if (s[pos2] == '\\') {                    _VERIFY(++pos2 != s.size());                    attr_last_value += s.substr(pos, pos2 + 1 - pos);                } else {                    attr_last_value += s.substr(pos, pos2 + 1 - pos);                }            } else {                pos2 = s.find_first_of(" #;\"", pos); // also look for \'?                _ASSERT(pos2 != NPOS); // due to trailing space                if (pos != pos2) {                    // grab and place the preceding token                    attr_last_value += s.substr(pos, pos2 - pos);                    attr_values.push_back(attr_last_value);                    attr_last_value.erase();                }                switch (s[pos2]) {                case ' ':                    break;                case '#':                    i = v.size();                    break;                case ';':                    if (attr_values.empty()) {                        x_Warn("null attribute", x_GetLineNumber());                    } else {                        x_AddAttribute(*record, attr_values);                        attr_values.clear();                    }                    break;                // NB: we don't currently search for single quotes.                case '\"': case '\'':                    quote_char = s[pos2];                    break;                default:                    _TROUBLE;                }            }            pos = pos2 + 1;        }    }    if ( !attr_values.empty() ) {        x_Warn("unterminated attribute " + attr_values[0], x_GetLineNumber());        x_AddAttribute(*record, attr_values);    }    return record;}CRef<CSeq_feat> CGFFReader::x_ParseRecord(const SRecord& record){    CRef<CSeq_feat> feat(CFeature_table_reader::CreateSeqFeat                         (record.key, *x_ResolveLoc(record.loc),                          CFeature_table_reader::fKeepBadKey));    if (record.frame >= 0  &&  feat->GetData().IsCdregion()) {        feat->SetData().SetCdregion().SetFrame            (static_cast<CCdregion::EFrame>(record.frame + 1));    }    ITERATE (SRecord::TAttrs, it, record.attrs) {        string tag = it->front();        string value;        switch (it->size()) {        case 1:            break;        case 2:            value = (*it)[1];            break;        default:            x_Warn("Ignoring extra fields in value of " + tag, record.line_no);            value = (*it)[1];            break;        }        if (x_GetFlags() & fGBQuals) {            if ( !(x_GetFlags() & fNoGTF) ) { // translate                if (tag == "transcript_id") {                    //continue;                } else if (tag == "gene_id") {                    tag = "gene";                    SIZE_TYPE colon = value.find(':');                    if (colon != NPOS) {                        value.erase(0, colon + 1);                    }                } else if (tag == "exon_number") {                    tag = "number";                }            }            CFeature_table_reader::AddFeatQual                (feat, tag, value, CFeature_table_reader::fKeepBadKey);        } else { // don't attempt to parse, just treat as imported            CRef<CGb_qual> qual(new CGb_qual);            qual->SetQual(tag);            qual->SetVal(value);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?