gff_reader.cpp
来自「ncbi源码」· C++ 代码 · 共 769 行 · 第 1/2 页
CPP
769 行
feat->SetQual().push_back(qual); } } return feat;}CRef<CSeq_loc> CGFFReader::x_ResolveLoc(const SRecord::TLoc& loc){ CRef<CSeq_loc> seqloc(new CSeq_loc); ITERATE (SRecord::TLoc, it, loc) { CRef<CSeq_id> id = x_ResolveSeqName(it->accession); ITERATE (set<TSeqRange>, range, it->ranges) { CRef<CSeq_loc> segment(new CSeq_loc); if (range->GetLength() == 1) { CSeq_point& pnt = segment->SetPnt(); pnt.SetId (*id); pnt.SetPoint(range->GetFrom()); if (it->strand != eNa_strand_unknown) { pnt.SetStrand(it->strand); } } else { CSeq_interval& si = segment->SetInt(); si.SetId (*id); si.SetFrom(range->GetFrom()); si.SetTo (range->GetTo()); if (it->strand != eNa_strand_unknown) { si.SetStrand(it->strand); } } if (IsReverse(it->strand)) { seqloc->SetMix().Set().push_front(segment); } else { seqloc->SetMix().Set().push_back(segment); } } } if (seqloc->GetMix().Get().size() == 1) { return seqloc->SetMix().Set().front(); } else { return seqloc; }}void CGFFReader::x_AddAttribute(SRecord& record, vector<string>& attr){ if (x_GetFlags() & fGBQuals) { if (attr[0] == "gbkey" && attr.size() == 2) { record.key = attr[1]; return; } } record.attrs.insert(attr);}string CGFFReader::x_FeatureID(const SRecord& record){ if (x_GetFlags() & fNoGTF) { return kEmptyStr; } static const vector<string> sc_GeneId(1, "gene_id"); SRecord::TAttrs::const_iterator gene_it = record.attrs.lower_bound(sc_GeneId); static const vector<string> sc_TranscriptId(1, "transcript_id"); SRecord::TAttrs::const_iterator transcript_it = record.attrs.lower_bound(sc_TranscriptId); static const vector<string> sc_ProteinId(1, "protein_id"); SRecord::TAttrs::const_iterator protein_it = record.attrs.lower_bound(sc_ProteinId); // concatenate our IDs from above, if found string id; if (gene_it != record.attrs.end() && gene_it->front() == "gene_id") { id += (*gene_it)[1]; } if (transcript_it != record.attrs.end() && transcript_it->front() == "transcript_id") { if ( !id.empty() ) { id += ' '; } id += (*transcript_it)[1]; } if (protein_it != record.attrs.end() && protein_it->front() == "protein_id") { if ( !id.empty() ) { id += ' '; } id += (*protein_it)[1]; } // look for db xrefs static const vector<string> sc_Dbxref(1, "db_xref"); SRecord::TAttrs::const_iterator dbxref_it = record.attrs.lower_bound(sc_Dbxref); for ( ; dbxref_it != record.attrs.end() && dbxref_it->front() == "db_xref"; ++dbxref_it) { if ( !id.empty() ) { id += ' '; } id += (*dbxref_it)[1]; } if ( id.empty() ) { return id; } if (record.key == "start_codon" || record.key == "stop_codon") { //id += " " + record.key; id += "CDS"; } else if (record.key == "CDS" || NStr::FindNoCase(record.key, "rna") != NPOS) { //id += " " + record.key; id += record.key; } else { // probably an intron, exon, or single site return kEmptyStr; } _TRACE("id = " << id); return id;}void CGFFReader::x_MergeRecords(SRecord& dest, const SRecord& src){ // XXX - perform sanity checks and warn on mismatch bool merge_overlaps = false; if ((src.key == "start_codon" || src.key == "stop_codon") && dest.key == "CDS") { // start_codon and stop_codon features should be contained inside of // existing CDS locations merge_overlaps = true; } ITERATE (SRecord::TLoc, slit, src.loc) { bool merged = false; NON_CONST_ITERATE (SRecord::TLoc, dlit, dest.loc) { if (slit->accession != dlit->accession) { if (dest.loc.size() == 1) { x_Warn("Multi-accession feature", src.line_no); } continue; } else if (slit->strand != dlit->strand) { if (dest.loc.size() == 1) { x_Warn("Multi-orientation feature", src.line_no); } continue; } else { if (merge_overlaps) { ITERATE (set<TSeqRange>, src_iter, slit->ranges) { TSeqRange range(*src_iter); set<TSeqRange>::iterator dst_iter = dlit->ranges.begin(); for ( ; dst_iter != dlit->ranges.end(); ) { if (dst_iter->IntersectingWith(range)) { range += *dst_iter; _TRACE("merging overlapping ranges: " << range.GetFrom() << " - " << range.GetTo() << " <-> " << dst_iter->GetFrom() << " - " << dst_iter->GetTo()); dlit->ranges.erase(dst_iter++); } else { ++dst_iter; } } dlit->ranges.insert(range); } } else { ITERATE (set<TSeqRange>, set_iter, slit->ranges) { dlit->ranges.insert(*set_iter); } } merged = true; break; } } if ( !merged ) { dest.loc.push_back(*slit); } } if (src.key != dest.key) { if (dest.key == "CDS" && NStr::EndsWith(src.key, "_codon") && !(x_GetFlags() & fNoGTF) ) { // ok } else if (src.key == "CDS" && NStr::EndsWith(dest.key, "_codon") && !(x_GetFlags() & fNoGTF) ) { dest.key == "CDS"; } else { x_Warn("Merging features with different keys: " + dest.key + " != " + src.key, src.line_no); } } x_MergeAttributes(dest, src);}void CGFFReader::x_MergeAttributes(SRecord& dest, const SRecord& src){ SRecord::TAttrs::iterator dait = dest.attrs.begin(); SRecord::TAttrs::iterator dait_end = dest.attrs.end(); SRecord::TAttrs::iterator dait_tag = dait_end; ITERATE (SRecord::TAttrs, sait, src.attrs) { const string& tag = sait->front(); while (dait != dait_end && dait->front() < tag) { ++dait; } if (dait->front() == tag) { if (dait_tag == dait_end || dait_tag->front() != tag) { dait_tag = dait; } while (dait != dait_end && *dait < *sait) { ++dait; } } if (dait != dait_end && *dait == *sait) { continue; // identical } else if ( !(x_GetFlags() & fNoGTF) && tag == "exon_number") { if (dait_tag != dait_end) { while (dait != dait_end && dait->front() == tag) { ++dait; } dest.attrs.erase(dait_tag, dait); } } else { dest.attrs.insert(dait, *sait); } }}void CGFFReader::x_PlaceFeature(CSeq_feat& feat, const SRecord&){ CRef<CBioseq> seq; if ( !feat.IsSetProduct() ) { for (CTypeConstIterator<CSeq_id> it(feat.GetLocation()); it; ++it) { CRef<CBioseq> seq2 = x_ResolveID(*it, kEmptyStr); if ( !seq ) { seq.Reset(seq2); } else if ( seq2.NotEmpty() && seq != seq2) { seq.Reset(); BREAK(it); } } } CBioseq::TAnnot& annots = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot(); NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) { if ((*it)->GetData().IsFtable()) { (*it)->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat)); return; } } CRef<CSeq_annot> annot(new CSeq_annot); annot->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat)); annots.push_back(annot);}CRef<CSeq_id> CGFFReader::x_ResolveSeqName(const string& name){ CRef<CSeq_id>& id = m_SeqNameCache[name]; if ( !id ) { id.Reset(x_ResolveNewSeqName(name)); } if ( !id ) { x_Warn("x_ResolveNewSeqName returned null for " + name); id.Reset(new CSeq_id(CSeq_id::e_Local, name, name)); } return id;}CRef<CSeq_id> CGFFReader::x_ResolveNewSeqName(const string& name){ return CRef<CSeq_id>(new CSeq_id(name));}CRef<CBioseq> CGFFReader::x_ResolveID(const CSeq_id& id, const string& mol){ CRef<CBioseq>& seq = m_SeqCache[CConstRef<CSeq_id>(&id)]; if ( !seq ) { seq.Reset(x_ResolveNewID(id, mol)); // Derived versions of x_ResolveNewID may legimately return null // results.... if (seq) { x_PlaceSeq(*seq); ITERATE (CBioseq::TId, it, seq->GetId()) { m_SeqCache.insert(make_pair(CConstRef<CSeq_id>(*it), seq)); } } } return seq;}CRef<CBioseq> CGFFReader::x_ResolveNewID(const CSeq_id& id, const string& mol0){ CRef<CBioseq> seq(new CBioseq); CRef<CSeq_id> id_copy(new CSeq_id); id_copy->Assign(id); seq->SetId().push_back(id_copy); seq->SetInst().SetRepr(CSeq_inst::eRepr_virtual); const string& mol = mol0.empty() ? m_DefMol : mol0; if (mol.empty() || mol == "dna") { seq->SetInst().SetMol(CSeq_inst::eMol_dna); } else if (mol == "rna") { seq->SetInst().SetMol(CSeq_inst::eMol_rna); } else if (mol == "protein") { seq->SetInst().SetMol(CSeq_inst::eMol_aa); } else { x_Warn("unrecognized sequence type " + mol + "; assuming DNA"); seq->SetInst().SetMol(CSeq_inst::eMol_dna); } return seq;}void CGFFReader::x_PlaceSeq(CBioseq& seq){ bool found = false; for (CTypeConstIterator<CBioseq> it(*m_TSE); it; ++it) { if (&*it == &seq) { found = true; BREAK(it); } } if ( !found ) { CRef<CSeq_entry> se(new CSeq_entry); se->SetSeq(seq); m_TSE->SetSet().SetSeq_set().push_back(se); }}END_SCOPE(objects)END_NCBI_SCOPE/** ===========================================================================** $Log: gff_reader.cpp,v $* Revision 1000.1 2004/06/01 19:46:20 gouriano* PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6** Revision 1.6 2004/05/21 21:42:55 gorelenk* Added PCH ncbi_pch.hpp** Revision 1.5 2004/05/08 12:13:24 dicuccio* Switched away from using CRangeCollection<> for locations, as this lead to* inadvertent merging of correctly overlapping exons** Revision 1.4 2004/01/06 17:02:40 dicuccio* (From Aaron Ucko): Fixed ordering of intervals in a seq-loc-mix on the negative* strand** Revision 1.3 2003/12/31 12:48:29 dicuccio* Fixed interpretation of positions - GFF/GTF is 1-based, ASN.1 is 0-based** Revision 1.2 2003/12/04 00:58:24 ucko* Fix for WorkShop's context-insensitive make_pair.** Revision 1.1 2003/12/03 20:56:36 ucko* Initial commit.*** ===========================================================================*/
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?