gff_reader.cpp

来自「ncbi源码」· C++ 代码 · 共 769 行 · 第 1/2 页

CPP
769
字号
            feat->SetQual().push_back(qual);        }    }    return feat;}CRef<CSeq_loc> CGFFReader::x_ResolveLoc(const SRecord::TLoc& loc){    CRef<CSeq_loc> seqloc(new CSeq_loc);    ITERATE (SRecord::TLoc, it, loc) {        CRef<CSeq_id> id = x_ResolveSeqName(it->accession);        ITERATE (set<TSeqRange>, range, it->ranges) {            CRef<CSeq_loc> segment(new CSeq_loc);            if (range->GetLength() == 1) {                CSeq_point& pnt = segment->SetPnt();                pnt.SetId   (*id);                pnt.SetPoint(range->GetFrom());                if (it->strand != eNa_strand_unknown) {                    pnt.SetStrand(it->strand);                }            } else {                CSeq_interval& si = segment->SetInt();                si.SetId  (*id);                si.SetFrom(range->GetFrom());                si.SetTo  (range->GetTo());                if (it->strand != eNa_strand_unknown) {                    si.SetStrand(it->strand);                }            }            if (IsReverse(it->strand)) {                seqloc->SetMix().Set().push_front(segment);            } else {                seqloc->SetMix().Set().push_back(segment);            }        }    }    if (seqloc->GetMix().Get().size() == 1) {        return seqloc->SetMix().Set().front();    } else {        return seqloc;    }}void CGFFReader::x_AddAttribute(SRecord& record, vector<string>& attr){    if (x_GetFlags() & fGBQuals) {        if (attr[0] == "gbkey"  &&  attr.size() == 2) {            record.key = attr[1];            return;        }    }    record.attrs.insert(attr);}string CGFFReader::x_FeatureID(const SRecord& record){    if (x_GetFlags() & fNoGTF) {        return kEmptyStr;    }    static const vector<string> sc_GeneId(1, "gene_id");    SRecord::TAttrs::const_iterator gene_it        = record.attrs.lower_bound(sc_GeneId);    static const vector<string> sc_TranscriptId(1, "transcript_id");    SRecord::TAttrs::const_iterator transcript_it        = record.attrs.lower_bound(sc_TranscriptId);    static const vector<string> sc_ProteinId(1, "protein_id");    SRecord::TAttrs::const_iterator protein_it        = record.attrs.lower_bound(sc_ProteinId);    // concatenate our IDs from above, if found    string id;    if (gene_it != record.attrs.end()  &&        gene_it->front() == "gene_id") {        id += (*gene_it)[1];    }    if (transcript_it != record.attrs.end()  &&        transcript_it->front() == "transcript_id") {        if ( !id.empty() ) {            id += ' ';        }        id += (*transcript_it)[1];    }    if (protein_it != record.attrs.end()  &&        protein_it->front() == "protein_id") {        if ( !id.empty() ) {            id += ' ';        }        id += (*protein_it)[1];    }    // look for db xrefs    static const vector<string> sc_Dbxref(1, "db_xref");    SRecord::TAttrs::const_iterator dbxref_it        = record.attrs.lower_bound(sc_Dbxref);    for ( ; dbxref_it != record.attrs.end()  &&            dbxref_it->front() == "db_xref";  ++dbxref_it) {        if ( !id.empty() ) {            id += ' ';        }        id += (*dbxref_it)[1];    }    if ( id.empty() ) {        return id;    }    if (record.key == "start_codon" ||  record.key == "stop_codon") {        //id += " " + record.key;        id += "CDS";    } else if (record.key == "CDS"               ||  NStr::FindNoCase(record.key, "rna") != NPOS) {        //id += " " + record.key;        id += record.key;    } else { // probably an intron, exon, or single site        return kEmptyStr;    }    _TRACE("id = " << id);    return id;}void CGFFReader::x_MergeRecords(SRecord& dest, const SRecord& src){    // XXX - perform sanity checks and warn on mismatch    bool merge_overlaps = false;    if ((src.key == "start_codon"  ||  src.key == "stop_codon")  &&        dest.key == "CDS") {        // start_codon and stop_codon features should be contained inside of        // existing CDS locations        merge_overlaps = true;    }    ITERATE (SRecord::TLoc, slit, src.loc) {        bool merged = false;        NON_CONST_ITERATE (SRecord::TLoc, dlit, dest.loc) {            if (slit->accession != dlit->accession) {                if (dest.loc.size() == 1) {                    x_Warn("Multi-accession feature", src.line_no);                }                continue;            } else if (slit->strand != dlit->strand) {                if (dest.loc.size() == 1) {                    x_Warn("Multi-orientation feature", src.line_no);                }                continue;            } else {                if (merge_overlaps) {                    ITERATE (set<TSeqRange>, src_iter, slit->ranges) {                        TSeqRange range(*src_iter);                        set<TSeqRange>::iterator dst_iter =                            dlit->ranges.begin();                        for ( ;  dst_iter != dlit->ranges.end();  ) {                            if (dst_iter->IntersectingWith(range)) {                                range += *dst_iter;                                _TRACE("merging overlapping ranges: "                                       << range.GetFrom() << " - "                                       << range.GetTo() << " <-> "                                       << dst_iter->GetFrom() << " - "                                       << dst_iter->GetTo());                                dlit->ranges.erase(dst_iter++);                            } else {                                ++dst_iter;                            }                        }                        dlit->ranges.insert(range);                    }                } else {                    ITERATE (set<TSeqRange>, set_iter, slit->ranges) {                        dlit->ranges.insert(*set_iter);                    }                }                merged = true;                break;            }        }        if ( !merged ) {            dest.loc.push_back(*slit);        }    }    if (src.key != dest.key) {        if (dest.key == "CDS"  &&  NStr::EndsWith(src.key, "_codon")            &&  !(x_GetFlags() & fNoGTF) ) {            // ok        } else if (src.key == "CDS" &&  NStr::EndsWith(dest.key, "_codon")            &&  !(x_GetFlags() & fNoGTF) ) {            dest.key == "CDS";        } else {            x_Warn("Merging features with different keys: " + dest.key                   + " != " + src.key, src.line_no);        }    }    x_MergeAttributes(dest, src);}void CGFFReader::x_MergeAttributes(SRecord& dest, const SRecord& src){    SRecord::TAttrs::iterator dait     = dest.attrs.begin();    SRecord::TAttrs::iterator dait_end = dest.attrs.end();    SRecord::TAttrs::iterator dait_tag = dait_end;    ITERATE (SRecord::TAttrs, sait, src.attrs) {        const string& tag = sait->front();        while (dait != dait_end  &&  dait->front() < tag) {            ++dait;        }        if (dait->front() == tag) {            if (dait_tag == dait_end  ||  dait_tag->front() != tag) {                dait_tag = dait;            }            while (dait != dait_end  &&  *dait < *sait) {                ++dait;            }        }        if (dait != dait_end  &&  *dait == *sait) {            continue; // identical        } else if ( !(x_GetFlags() & fNoGTF)  &&  tag == "exon_number") {            if (dait_tag != dait_end) {                while (dait != dait_end  &&  dait->front() == tag) {                    ++dait;                }                dest.attrs.erase(dait_tag, dait);            }        } else {            dest.attrs.insert(dait, *sait);        }    }}void CGFFReader::x_PlaceFeature(CSeq_feat& feat, const SRecord&){    CRef<CBioseq> seq;    if ( !feat.IsSetProduct() ) {        for (CTypeConstIterator<CSeq_id> it(feat.GetLocation());  it;  ++it) {            CRef<CBioseq> seq2 = x_ResolveID(*it, kEmptyStr);            if ( !seq ) {                seq.Reset(seq2);            } else if ( seq2.NotEmpty()  &&  seq != seq2) {                seq.Reset();                BREAK(it);            }        }    }    CBioseq::TAnnot& annots        = seq ? seq->SetAnnot() : m_TSE->SetSet().SetAnnot();    NON_CONST_ITERATE (CBioseq::TAnnot, it, annots) {        if ((*it)->GetData().IsFtable()) {            (*it)->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));            return;        }    }    CRef<CSeq_annot> annot(new CSeq_annot);    annot->SetData().SetFtable().push_back(CRef<CSeq_feat>(&feat));    annots.push_back(annot);}CRef<CSeq_id> CGFFReader::x_ResolveSeqName(const string& name){    CRef<CSeq_id>& id = m_SeqNameCache[name];    if ( !id ) {        id.Reset(x_ResolveNewSeqName(name));    }    if ( !id ) {        x_Warn("x_ResolveNewSeqName returned null for " + name);        id.Reset(new CSeq_id(CSeq_id::e_Local, name, name));    }    return id;}CRef<CSeq_id> CGFFReader::x_ResolveNewSeqName(const string& name){    return CRef<CSeq_id>(new CSeq_id(name));}CRef<CBioseq> CGFFReader::x_ResolveID(const CSeq_id& id, const string& mol){    CRef<CBioseq>& seq = m_SeqCache[CConstRef<CSeq_id>(&id)];    if ( !seq ) {        seq.Reset(x_ResolveNewID(id, mol));        // Derived versions of x_ResolveNewID may legimately return null        // results....        if (seq) {            x_PlaceSeq(*seq);            ITERATE (CBioseq::TId, it, seq->GetId()) {                m_SeqCache.insert(make_pair(CConstRef<CSeq_id>(*it), seq));            }        }    }    return seq;}CRef<CBioseq> CGFFReader::x_ResolveNewID(const CSeq_id& id, const string& mol0){    CRef<CBioseq> seq(new CBioseq);    CRef<CSeq_id> id_copy(new CSeq_id);    id_copy->Assign(id);    seq->SetId().push_back(id_copy);    seq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);    const string& mol = mol0.empty() ? m_DefMol : mol0;    if (mol.empty()  ||  mol == "dna") {        seq->SetInst().SetMol(CSeq_inst::eMol_dna);    } else if (mol == "rna")  {        seq->SetInst().SetMol(CSeq_inst::eMol_rna);    } else if (mol == "protein")  {        seq->SetInst().SetMol(CSeq_inst::eMol_aa);    } else {        x_Warn("unrecognized sequence type " + mol + "; assuming DNA");        seq->SetInst().SetMol(CSeq_inst::eMol_dna);    }    return seq;}void CGFFReader::x_PlaceSeq(CBioseq& seq){    bool found = false;    for (CTypeConstIterator<CBioseq> it(*m_TSE);  it;  ++it) {        if (&*it == &seq) {            found = true;            BREAK(it);        }    }    if ( !found ) {        CRef<CSeq_entry> se(new CSeq_entry);        se->SetSeq(seq);        m_TSE->SetSet().SetSeq_set().push_back(se);    }}END_SCOPE(objects)END_NCBI_SCOPE/** ===========================================================================** $Log: gff_reader.cpp,v $* Revision 1000.1  2004/06/01 19:46:20  gouriano* PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6** Revision 1.6  2004/05/21 21:42:55  gorelenk* Added PCH ncbi_pch.hpp** Revision 1.5  2004/05/08 12:13:24  dicuccio* Switched away from using CRangeCollection<> for locations, as this lead to* inadvertent merging of correctly overlapping exons** Revision 1.4  2004/01/06 17:02:40  dicuccio* (From Aaron Ucko): Fixed ordering of intervals in a seq-loc-mix on the negative* strand** Revision 1.3  2003/12/31 12:48:29  dicuccio* Fixed interpretation of positions - GFF/GTF is 1-based, ASN.1 is 0-based** Revision 1.2  2003/12/04 00:58:24  ucko* Fix for WorkShop's context-insensitive make_pair.** Revision 1.1  2003/12/03 20:56:36  ucko* Initial commit.*** ===========================================================================*/

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?