fasta.cpp
来自「ncbi源码」· C++ 代码 · 共 604 行 · 第 1/2 页
CPP
604 行
{ if (mol != CSeq_inst::eMol_not_set) { return; // already known; no need to guess } if (mol == CSeq_inst::eMol_not_set && !(flags & fReadFasta_ForceType)) { switch (CFormatGuess::SequenceType(data.data(), data.size())) { case CFormatGuess::eNucleotide: mol = CSeq_inst::eMol_na; return; case CFormatGuess::eProtein: mol = CSeq_inst::eMol_aa; return; default: break; } } // ForceType was set, or CFormatGuess failed, so we have to rely on // explicit assumptions if (flags & fReadFasta_AssumeNuc) { _ASSERT(!(flags & fReadFasta_AssumeProt)); mol = CSeq_inst::eMol_na; } else if (flags & fReadFasta_AssumeProt) { mol = CSeq_inst::eMol_aa; } else { NCBI_THROW2(CObjReaderParseException, eFormat, "ReadFasta: unable to deduce molecule type" " from IDs, flags, or sequence", in.tellg() - CT_POS_TYPE(0)); }}CRef<CSeq_entry> ReadFasta(CNcbiIstream& in, TReadFastaFlags flags, int* counter, vector<CConstRef<CSeq_loc> >* lcv){ if ( !in ) { NCBI_THROW2(CObjReaderParseException, eFormat, "ReadFasta: Input stream no longer valid", in.tellg() - CT_POS_TYPE(0)); } else { CT_INT_TYPE c = in.peek(); if ( !strchr(">#!\n\r", CT_TO_CHAR_TYPE(c)) ) { NCBI_THROW2 (CObjReaderParseException, eFormat, "ReadFasta: Input doesn't start with a defline or comment", in.tellg() - CT_POS_TYPE(0)); } } CRef<CSeq_entry> entry(new CSeq_entry); CBioseq_set::TSeq_set& sset = entry->SetSet().SetSeq_set(); CRef<CBioseq> seq(0); // current Bioseq string line; TSeqPos pos = 0, lc_start = 0; bool was_lc = false; CRef<CSeq_id> best_id; CRef<CSeq_loc> lowercase(0); int defcounter = 1; if ( !counter ) { counter = &defcounter; } while ( !in.eof() ) { if ((flags & fReadFasta_OneSeq) && seq.NotEmpty() && (in.peek() == '>')) { break; } NcbiGetlineEOL(in, line); if (in.eof() && line.empty()) { break; } else if (line.empty()) { continue; } if (line[0] == '>') { // new sequence if (seq) { s_FixSeqData(seq); if (was_lc) { lowercase->SetPacked_int().AddInterval (*best_id, lc_start, pos); } } seq = new CBioseq; if (flags & fReadFasta_NoSeqData) { seq->SetInst().SetRepr(CSeq_inst::eRepr_not_set); } else { seq->SetInst().SetRepr(CSeq_inst::eRepr_raw); } {{ CRef<CSeq_entry> entry2(new CSeq_entry); entry2->SetSeq(*seq); sset.push_back(entry2); }} string title; CSeq_inst::EMol mol = s_ParseFastaDefline(seq->SetId(), title, line, flags, counter); if (mol == CSeq_inst::eMol_not_set && (flags & fReadFasta_NoSeqData)) { if (flags & fReadFasta_AssumeNuc) { _ASSERT(!(flags & fReadFasta_AssumeProt)); mol = CSeq_inst::eMol_na; } else if (flags & fReadFasta_AssumeProt) { mol = CSeq_inst::eMol_aa; } } seq->SetInst().SetMol(mol); if ( !title.empty() ) { CRef<CSeqdesc> desc(new CSeqdesc); desc->SetTitle(title); seq->SetDescr().Set().push_back(desc); } if (lcv) { pos = 0; was_lc = false; best_id = FindBestChoice(seq->GetId(), CSeq_id::Score); lowercase = new CSeq_loc; lowercase->SetNull(); lcv->push_back(lowercase); } } else if (line[0] == '#' || line[0] == '!') { continue; // comment } else if ( !seq ) { NCBI_THROW2 (CObjReaderParseException, eFormat, "ReadFasta: No defline preceding data", in.tellg() - CT_POS_TYPE(0)); } else if ( !(flags & fReadFasta_NoSeqData) ) { // These don't change, but the calls may be relatively expensive, // esp. with ref-counted implementations. SIZE_TYPE line_size = line.size(); const char* line_data = line.data(); // actual data; may contain embedded junk CSeq_inst& inst = seq->SetInst(); string residues(line_size + 1, '\0'); char* res_data = const_cast<char*>(residues.data()); SIZE_TYPE res_count = 0; for (SIZE_TYPE i = 0; i < line_size; ++i) { char c = line_data[i]; if (isalpha(c)) { if (lowercase) { bool is_lc = islower(c) ? true : false; if (is_lc && !was_lc) { lc_start = pos; } else if (was_lc && !is_lc) { lowercase->SetPacked_int().AddInterval (*best_id, lc_start, pos); } was_lc = is_lc; ++pos; } res_data[res_count++] = toupper(c); } else if (c == '-' && (flags & fReadFasta_ParseGaps)) { CDelta_ext::Tdata& d = inst.SetExt().SetDelta().Set(); if (inst.GetRepr() == CSeq_inst::eRepr_raw) { CRef<CDelta_seq> ds(new CDelta_seq); inst.SetRepr(CSeq_inst::eRepr_delta); if (inst.IsSetSeq_data()) { ds->SetLiteral().SetSeq_data(inst.SetSeq_data()); d.push_back(ds); inst.ResetSeq_data(); } } if ( res_count ) { residues.resize(res_count); if (inst.GetMol() == CSeq_inst::eMol_not_set) { s_GuessMol(inst.SetMol(), residues, flags, in); } s_AddData(inst, residues); } {{ CRef<CDelta_seq> gap(new CDelta_seq); gap->SetLoc().SetNull(); d.push_back(gap); }} res_count = 0; } else if (c == ';') { continue; // skip rest of line } } if (inst.GetMol() == CSeq_inst::eMol_not_set) { s_GuessMol(inst.SetMol(), residues, flags, in); } // Add the accumulated data... residues.resize(res_count); s_AddData(inst, residues); } } if (seq) { s_FixSeqData(seq); if (was_lc) { lowercase->SetPacked_int().AddInterval(*best_id, lc_start, pos); } } // simplify if possible if (sset.size() == 1) { entry->SetSeq(*seq); } return entry;}void ReadFastaFileMap(SFastaFileMap* fasta_map, CNcbiIfstream& input){ _ASSERT(fasta_map); fasta_map->file_map.resize(0); if (!input.is_open()) return; while (!input.eof()) { SFastaFileMap::SFastaEntry fasta_entry; fasta_entry.stream_offset = input.tellg() - CT_POS_TYPE(0); CRef<CSeq_entry> se; se = ReadFasta(input, fReadFasta_AssumeNuc | fReadFasta_OneSeq); if (!se->IsSeq()) continue; const CSeq_entry::TSeq& bioseq = se->GetSeq(); const CSeq_id* sid = bioseq.GetFirstId(); fasta_entry.seq_id = sid->AsFastaString(); if (bioseq.CanGetDescr()) { const CSeq_descr& d = bioseq.GetDescr(); if (d.CanGet()) { const CSeq_descr_Base::Tdata& data = d.Get(); if (!data.empty()) { CSeq_descr_Base::Tdata::const_iterator it = data.begin(); if (it != data.end()) { CRef<CSeqdesc> ref_desc = *it; ref_desc->GetLabel(&fasta_entry.description, CSeqdesc::eContent); } } } } fasta_map->file_map.push_back(fasta_entry); } // while}END_SCOPE(objects)END_NCBI_SCOPE/** ===========================================================================** $Log: fasta.cpp,v $* Revision 1000.2 2004/06/01 19:46:18 gouriano* PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.12** Revision 1.12 2004/05/21 21:42:55 gorelenk* Added PCH ncbi_pch.hpp** Revision 1.11 2004/02/19 22:57:52 ucko* Accommodate stricter implementations of CT_POS_TYPE.** Revision 1.10 2003/12/05 03:00:36 ucko* Validate input better.** Revision 1.9 2003/10/03 15:09:18 ucko* Tweak sequence string allocation to avoid O(n^2) performance from* linear resizing.** Revision 1.8 2003/08/25 21:30:09 ucko* ReadFasta: tweak a bit to improve performance, and fix some bugs in* parsing gaps.** Revision 1.7 2003/08/11 14:39:54 ucko* Populate "lowercase" with Packed_seqints rather than general Seq_loc_mixes.** Revision 1.6 2003/08/08 21:29:12 dondosha* Changed type of lcase_mask in ReadFasta to vector of CConstRefs** Revision 1.5 2003/08/07 21:13:21 ucko* Support a counter for assigning local IDs to sequences with no ID given.* Fix some minor bugs in lowercase-character support.** Revision 1.4 2003/08/06 19:08:33 ucko* Slight interface tweak to ReadFasta: report lowercase locations in a* vector with one entry per Bioseq rather than a consolidated Seq_loc_mix.** Revision 1.3 2003/06/23 20:49:11 kuznets* Changed to use Seq_id::AsFastaString() when reading fasta file map.** Revision 1.2 2003/06/08 16:17:00 lavr* Heed MSVC int->bool performance warning** Revision 1.1 2003/06/04 17:26:22 ucko* Split out from Seq_entry.cpp.*** ===========================================================================*/
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?