fasta.cpp

来自「ncbi源码」· C++ 代码 · 共 604 行 · 第 1/2 页

CPP
604
字号
{    if (mol != CSeq_inst::eMol_not_set) {        return; // already known; no need to guess    }    if (mol == CSeq_inst::eMol_not_set  &&  !(flags & fReadFasta_ForceType)) {        switch (CFormatGuess::SequenceType(data.data(), data.size())) {        case CFormatGuess::eNucleotide:  mol = CSeq_inst::eMol_na;  return;        case CFormatGuess::eProtein:     mol = CSeq_inst::eMol_aa;  return;        default:                         break;        }    }    // ForceType was set, or CFormatGuess failed, so we have to rely on    // explicit assumptions    if (flags & fReadFasta_AssumeNuc) {        _ASSERT(!(flags & fReadFasta_AssumeProt));        mol = CSeq_inst::eMol_na;    } else if (flags & fReadFasta_AssumeProt) {        mol = CSeq_inst::eMol_aa;    } else {         NCBI_THROW2(CObjReaderParseException, eFormat,                    "ReadFasta: unable to deduce molecule type"                    " from IDs, flags, or sequence",                    in.tellg() - CT_POS_TYPE(0));    }}CRef<CSeq_entry> ReadFasta(CNcbiIstream& in, TReadFastaFlags flags,                           int* counter, vector<CConstRef<CSeq_loc> >* lcv){    if ( !in ) {        NCBI_THROW2(CObjReaderParseException, eFormat,                    "ReadFasta: Input stream no longer valid",                    in.tellg() - CT_POS_TYPE(0));    } else {        CT_INT_TYPE c = in.peek();        if ( !strchr(">#!\n\r", CT_TO_CHAR_TYPE(c)) ) {            NCBI_THROW2                (CObjReaderParseException, eFormat,                 "ReadFasta: Input doesn't start with a defline or comment",                 in.tellg() - CT_POS_TYPE(0));        }    }    CRef<CSeq_entry>       entry(new CSeq_entry);    CBioseq_set::TSeq_set& sset  = entry->SetSet().SetSeq_set();    CRef<CBioseq>          seq(0); // current Bioseq    string                 line;    TSeqPos                pos = 0, lc_start = 0;    bool                   was_lc = false;    CRef<CSeq_id>          best_id;    CRef<CSeq_loc>         lowercase(0);    int                    defcounter = 1;    if ( !counter ) {        counter = &defcounter;    }    while ( !in.eof() ) {        if ((flags & fReadFasta_OneSeq)  &&  seq.NotEmpty()            &&  (in.peek() == '>')) {            break;        }        NcbiGetlineEOL(in, line);        if (in.eof()  &&  line.empty()) {            break;        } else if (line.empty()) {            continue;        }        if (line[0] == '>') {            // new sequence            if (seq) {                s_FixSeqData(seq);                if (was_lc) {                    lowercase->SetPacked_int().AddInterval                        (*best_id, lc_start, pos);                }            }            seq = new CBioseq;            if (flags & fReadFasta_NoSeqData) {                seq->SetInst().SetRepr(CSeq_inst::eRepr_not_set);            } else {                seq->SetInst().SetRepr(CSeq_inst::eRepr_raw);            }            {{                CRef<CSeq_entry> entry2(new CSeq_entry);                entry2->SetSeq(*seq);                sset.push_back(entry2);            }}            string          title;            CSeq_inst::EMol mol = s_ParseFastaDefline(seq->SetId(), title,                                                      line, flags, counter);            if (mol == CSeq_inst::eMol_not_set                &&  (flags & fReadFasta_NoSeqData)) {                if (flags & fReadFasta_AssumeNuc) {                    _ASSERT(!(flags & fReadFasta_AssumeProt));                    mol = CSeq_inst::eMol_na;                } else if (flags & fReadFasta_AssumeProt) {                    mol = CSeq_inst::eMol_aa;                }            }            seq->SetInst().SetMol(mol);            if ( !title.empty() ) {                CRef<CSeqdesc> desc(new CSeqdesc);                desc->SetTitle(title);                seq->SetDescr().Set().push_back(desc);            }            if (lcv) {                pos       = 0;                was_lc    = false;                best_id   = FindBestChoice(seq->GetId(), CSeq_id::Score);                lowercase = new CSeq_loc;                lowercase->SetNull();                lcv->push_back(lowercase);            }        } else if (line[0] == '#'  ||  line[0] == '!') {            continue; // comment        } else if ( !seq ) {            NCBI_THROW2                (CObjReaderParseException, eFormat,                 "ReadFasta: No defline preceding data",                 in.tellg() - CT_POS_TYPE(0));        } else if ( !(flags & fReadFasta_NoSeqData) ) {            // These don't change, but the calls may be relatively expensive,            // esp. with ref-counted implementations.            SIZE_TYPE   line_size = line.size();            const char* line_data = line.data();            // actual data; may contain embedded junk            CSeq_inst&  inst      = seq->SetInst();            string      residues(line_size + 1, '\0');            char*       res_data  = const_cast<char*>(residues.data());            SIZE_TYPE   res_count = 0;            for (SIZE_TYPE i = 0;  i < line_size;  ++i) {                char c = line_data[i];                if (isalpha(c)) {                    if (lowercase) {                        bool is_lc = islower(c) ? true : false;                        if (is_lc && !was_lc) {                            lc_start = pos;                        } else if (was_lc && !is_lc) {                            lowercase->SetPacked_int().AddInterval                                (*best_id, lc_start, pos);                        }                        was_lc = is_lc;                        ++pos;                    }                    res_data[res_count++] = toupper(c);                } else if (c == '-'  &&  (flags & fReadFasta_ParseGaps)) {                    CDelta_ext::Tdata& d = inst.SetExt().SetDelta().Set();                    if (inst.GetRepr() == CSeq_inst::eRepr_raw) {                        CRef<CDelta_seq> ds(new CDelta_seq);                        inst.SetRepr(CSeq_inst::eRepr_delta);                        if (inst.IsSetSeq_data()) {                            ds->SetLiteral().SetSeq_data(inst.SetSeq_data());                            d.push_back(ds);                            inst.ResetSeq_data();                        }                    }                    if ( res_count ) {                        residues.resize(res_count);                        if (inst.GetMol() == CSeq_inst::eMol_not_set) {                            s_GuessMol(inst.SetMol(), residues, flags, in);                        }                        s_AddData(inst, residues);                    }                    {{                        CRef<CDelta_seq> gap(new CDelta_seq);                        gap->SetLoc().SetNull();                        d.push_back(gap);                    }}                    res_count = 0;                } else if (c == ';') {                    continue; // skip rest of line                }            }            if (inst.GetMol() == CSeq_inst::eMol_not_set) {                s_GuessMol(inst.SetMol(), residues, flags, in);            }                        // Add the accumulated data...            residues.resize(res_count);            s_AddData(inst, residues);        }    }    if (seq) {        s_FixSeqData(seq);        if (was_lc) {            lowercase->SetPacked_int().AddInterval(*best_id, lc_start, pos);        }    }    // simplify if possible    if (sset.size() == 1) {        entry->SetSeq(*seq);    }    return entry;}void ReadFastaFileMap(SFastaFileMap* fasta_map, CNcbiIfstream& input){    _ASSERT(fasta_map);    fasta_map->file_map.resize(0);    if (!input.is_open())         return;    while (!input.eof()) {        SFastaFileMap::SFastaEntry  fasta_entry;        fasta_entry.stream_offset = input.tellg() - CT_POS_TYPE(0);        CRef<CSeq_entry> se;        se = ReadFasta(input, fReadFasta_AssumeNuc | fReadFasta_OneSeq);        if (!se->IsSeq())             continue;        const CSeq_entry::TSeq& bioseq = se->GetSeq();        const CSeq_id* sid = bioseq.GetFirstId();        fasta_entry.seq_id = sid->AsFastaString();        if (bioseq.CanGetDescr()) {            const CSeq_descr& d = bioseq.GetDescr();            if (d.CanGet()) {                const CSeq_descr_Base::Tdata& data = d.Get();                if (!data.empty()) {                    CSeq_descr_Base::Tdata::const_iterator it =                                                       data.begin();                    if (it != data.end()) {                        CRef<CSeqdesc> ref_desc = *it;                        ref_desc->GetLabel(&fasta_entry.description,                                            CSeqdesc::eContent);                    }                                                }            }        }        fasta_map->file_map.push_back(fasta_entry);            } // while}END_SCOPE(objects)END_NCBI_SCOPE/** ===========================================================================** $Log: fasta.cpp,v $* Revision 1000.2  2004/06/01 19:46:18  gouriano* PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.12** Revision 1.12  2004/05/21 21:42:55  gorelenk* Added PCH ncbi_pch.hpp** Revision 1.11  2004/02/19 22:57:52  ucko* Accommodate stricter implementations of CT_POS_TYPE.** Revision 1.10  2003/12/05 03:00:36  ucko* Validate input better.** Revision 1.9  2003/10/03 15:09:18  ucko* Tweak sequence string allocation to avoid O(n^2) performance from* linear resizing.** Revision 1.8  2003/08/25 21:30:09  ucko* ReadFasta: tweak a bit to improve performance, and fix some bugs in* parsing gaps.** Revision 1.7  2003/08/11 14:39:54  ucko* Populate "lowercase" with Packed_seqints rather than general Seq_loc_mixes.** Revision 1.6  2003/08/08 21:29:12  dondosha* Changed type of lcase_mask in ReadFasta to vector of CConstRefs** Revision 1.5  2003/08/07 21:13:21  ucko* Support a counter for assigning local IDs to sequences with no ID given.* Fix some minor bugs in lowercase-character support.** Revision 1.4  2003/08/06 19:08:33  ucko* Slight interface tweak to ReadFasta: report lowercase locations in a* vector with one entry per Bioseq rather than a consolidated Seq_loc_mix.** Revision 1.3  2003/06/23 20:49:11  kuznets* Changed to use Seq_id::AsFastaString() when reading fasta file map.** Revision 1.2  2003/06/08 16:17:00  lavr* Heed MSVC int->bool performance warning** Revision 1.1  2003/06/04 17:26:22  ucko* Split out from Seq_entry.cpp.*** ===========================================================================*/

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?