gff_reader.cpp
来自「ncbi源码」· C++ 代码 · 共 769 行 · 第 1/2 页
CPP
769 行
/* * =========================================================================== * PRODUCTION $Log: gff_reader.cpp,v $ * PRODUCTION Revision 1000.1 2004/06/01 19:46:20 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6 * PRODUCTION * =========================================================================== *//* $Id: gff_reader.cpp,v 1000.1 2004/06/01 19:46:20 gouriano Exp $* ===========================================================================** PUBLIC DOMAIN NOTICE* National Center for Biotechnology Information** This software/database is a "United States Government Work" under the* terms of the United States Copyright Act. It was written as part of* the author's official duties as a United States Government employee and* thus cannot be copyrighted. This software/database is freely available* to the public for use. The National Library of Medicine and the U.S.* Government have not placed any restriction on its use or reproduction.** Although all reasonable efforts have been taken to ensure the accuracy* and reliability of the software and data, the NLM and the U.S.* Government do not and cannot warrant the performance or results that* may be obtained by using this software or data. The NLM and the U.S.* Government disclaim all warranties, express or implied, including* warranties of performance, merchantability or fitness for any particular* purpose.** Please cite the author in any work or product based on this material.** ===========================================================================** Authors: Aaron Ucko, Wratko Hlavina** File Description:* Reader for GFF (including GTF) files.** ===========================================================================*/#include <ncbi_pch.hpp>#include <objtools/readers/gff_reader.hpp>#include <corelib/ncbitime.hpp>#include <corelib/ncbiutil.hpp>#include <serial/iterator.hpp>#include <objects/general/Date.hpp>#include <objects/seq/Seq_annot.hpp>#include <objects/seq/Seq_descr.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seq/Seqdesc.hpp>#include <objects/seqfeat/Cdregion.hpp>#include <objects/seqfeat/Gb_qual.hpp>#include <objects/seqloc/Seq_interval.hpp>#include <objects/seqloc/Seq_point.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objtools/readers/readfeat.hpp>#include <algorithm>#include <ctype.h>BEGIN_NCBI_SCOPEBEGIN_SCOPE(objects)CRef<CSeq_entry> CGFFReader::Read(CNcbiIstream& in, TFlags flags){ x_Reset(); m_Flags = flags; m_Stream = ∈ string line; while (x_GetNextLine(line)) { if (NStr::StartsWith(line, "##")) { x_ParseStructuredComment(line); } else if (NStr::StartsWith(line, "#")) { // regular comment; ignore } else { CRef<SRecord> record = x_ParseFeatureInterval(line); if (record) { string id = x_FeatureID(*record); if (id.empty()) { x_PlaceFeature(*x_ParseRecord(*record), *record); } else { CRef<SRecord>& match = m_DelayedFeats[id]; if (match) { x_MergeRecords(*match, *record); } else { match.Reset(record); } } } } } ITERATE (TDelayedFeats, it, m_DelayedFeats) { x_PlaceFeature(*x_ParseRecord(*it->second), *it->second); } CRef<CSeq_entry> tse(m_TSE); // need to save before resetting. x_Reset(); return tse;}void CGFFReader::x_Warn(const string& message, unsigned int line){ if (line) { ERR_POST(Warning << message << " [GFF input, line " << line << ']'); } else { ERR_POST(Warning << message << " [GFF input]"); }}void CGFFReader::x_Reset(void){ m_TSE.Reset(new CSeq_entry); m_SeqNameCache.clear(); m_SeqCache.clear(); m_DelayedFeats.clear(); m_DefMol.erase(); m_LineNumber = 0;}bool CGFFReader::x_GetNextLine(string& line){ ++m_LineNumber; return NcbiGetlineEOL(*m_Stream, line) || !line.empty();}void CGFFReader::x_ParseStructuredComment(const string& line){ vector<string> v; NStr::Tokenize(line, "# \t", v, NStr::eMergeDelims); if (v[0] == "date" && v.size() > 1) { x_ParseDateComment(v[1]); } else if (v[0] == "type" && v.size() > 1) { x_ParseTypeComment(v[1], v.size() > 2 ? v[2] : kEmptyStr); } // etc.}void CGFFReader::x_ParseDateComment(const string& date){ try { CRef<CSeqdesc> desc(new CSeqdesc); desc->SetUpdate_date().SetToTime(CTime(date, "Y-M-D"), CDate::ePrecision_day); m_TSE->SetSet().SetDescr().Set().push_back(desc); } catch (CTimeException& e) { x_Warn(string("Bad ISO date: ") + e.what(), x_GetLineNumber()); }}void CGFFReader::x_ParseTypeComment(const string& moltype, const string& seqname){ if (seqname.empty()) { m_DefMol = moltype; } else { // automatically adds to m_TSE if new x_ResolveID(*x_ResolveSeqName(seqname), moltype); }}CRef<CGFFReader::SRecord>CGFFReader::x_ParseFeatureInterval(const string& line){ vector<string> v; bool misdelimited = false; NStr::Tokenize(line, "\t", v, NStr::eNoMergeDelims); if (v.size() < 8) { NStr::Tokenize(line, " \t", v, NStr::eMergeDelims); if (v.size() < 8) { x_Warn("Skipping line due to insufficient fields", x_GetLineNumber()); return null; } else { x_Warn("Bad delimiters (should use tabs)", x_GetLineNumber()); misdelimited = true; } } else { // XXX - warn about extra fields (if any), but only if they're // not comments v.resize(9); } CRef<SRecord> record(x_NewRecord()); string accession; TSeqPos from = 0, to = numeric_limits<TSeqPos>::max(); ENa_strand strand = eNa_strand_unknown; accession = v[0]; record->source = v[1]; record->key = v[2]; try { from = NStr::StringToUInt(v[3]) - 1; } catch (std::exception& e) { x_Warn(string("Bad FROM position: ") + e.what(), x_GetLineNumber()); } try { to = NStr::StringToUInt(v[4]) - 1; } catch (std::exception& e) { x_Warn(string("Bad TO position: ") + e.what(), x_GetLineNumber()); } record->score = v[5]; if (v[6] == "+") { strand = eNa_strand_plus; } else if (v[6] == "-") { strand = eNa_strand_minus; } else if (v[6] != ".") { x_Warn("Bad strand " + v[6] + " (should be [+-.])", x_GetLineNumber()); } if (v[7] == "0" || v[7] == "1" || v[7] == "2") { record->frame = v[7][0] - '0'; } else if (v[7] == ".") { record->frame = -1; } else { x_Warn("Bad frame " + v[7] + " (should be [012.])", x_GetLineNumber()); record->frame = -1; } {{ SRecord::SSubLoc subloc; subloc.accession = accession; subloc.strand = strand; subloc.ranges.insert(TSeqRange(from, to)); record->loc.push_back(subloc); }} string attr_last_value; vector<string> attr_values; char quote_char = 0; for (SIZE_TYPE i = 8; i < v.size(); ++i) { string s = v[i] + ' '; if ( !misdelimited && i > 8) { SIZE_TYPE pos = s.find_first_not_of(" "); if (pos != NPOS) { if (s[pos] == '#') { break; } else { x_Warn("Extra non-comment fields", x_GetLineNumber()); } } else { continue; // just spaces anyway... } } SIZE_TYPE pos = 0; while (pos < s.size()) { SIZE_TYPE pos2; if (quote_char) { // must be inside a value pos2 = s.find_first_of(" \'\"\\", pos); _ASSERT(pos2 != NPOS); // due to trailing space if (s[pos2] == quote_char) { if (attr_values.empty()) { x_Warn("quoted attribute tag " + attr_last_value, x_GetLineNumber()); } quote_char = 0; attr_last_value += s.substr(pos, pos2 - pos); try { attr_values.push_back(NStr::ParseEscapes (attr_last_value)); } catch (CStringException& e) { attr_values.push_back(attr_last_value); x_Warn(e.what() + (" in value of " + attr_values[0]), x_GetLineNumber()); } attr_last_value.erase(); } else if (s[pos2] == '\\') { _VERIFY(++pos2 != s.size()); attr_last_value += s.substr(pos, pos2 + 1 - pos); } else { attr_last_value += s.substr(pos, pos2 + 1 - pos); } } else { pos2 = s.find_first_of(" #;\"", pos); // also look for \'? _ASSERT(pos2 != NPOS); // due to trailing space if (pos != pos2) { // grab and place the preceding token attr_last_value += s.substr(pos, pos2 - pos); attr_values.push_back(attr_last_value); attr_last_value.erase(); } switch (s[pos2]) { case ' ': break; case '#': i = v.size(); break; case ';': if (attr_values.empty()) { x_Warn("null attribute", x_GetLineNumber()); } else { x_AddAttribute(*record, attr_values); attr_values.clear(); } break; // NB: we don't currently search for single quotes. case '\"': case '\'': quote_char = s[pos2]; break; default: _TROUBLE; } } pos = pos2 + 1; } } if ( !attr_values.empty() ) { x_Warn("unterminated attribute " + attr_values[0], x_GetLineNumber()); x_AddAttribute(*record, attr_values); } return record;}CRef<CSeq_feat> CGFFReader::x_ParseRecord(const SRecord& record){ CRef<CSeq_feat> feat(CFeature_table_reader::CreateSeqFeat (record.key, *x_ResolveLoc(record.loc), CFeature_table_reader::fKeepBadKey)); if (record.frame >= 0 && feat->GetData().IsCdregion()) { feat->SetData().SetCdregion().SetFrame (static_cast<CCdregion::EFrame>(record.frame + 1)); } ITERATE (SRecord::TAttrs, it, record.attrs) { string tag = it->front(); string value; switch (it->size()) { case 1: break; case 2: value = (*it)[1]; break; default: x_Warn("Ignoring extra fields in value of " + tag, record.line_no); value = (*it)[1]; break; } if (x_GetFlags() & fGBQuals) { if ( !(x_GetFlags() & fNoGTF) ) { // translate if (tag == "transcript_id") { //continue; } else if (tag == "gene_id") { tag = "gene"; SIZE_TYPE colon = value.find(':'); if (colon != NPOS) { value.erase(0, colon + 1); } } else if (tag == "exon_number") { tag = "number"; } } CFeature_table_reader::AddFeatQual (feat, tag, value, CFeature_table_reader::fKeepBadKey); } else { // don't attempt to parse, just treat as imported CRef<CGb_qual> qual(new CGb_qual); qual->SetQual(tag); qual->SetVal(value);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?