fasta.cpp
来自「ncbi源码」· C++ 代码 · 共 604 行 · 第 1/2 页
CPP
604 行
/*
 * ===========================================================================
 * PRODUCTION $Log: fasta.cpp,v $
 * PRODUCTION Revision 1000.2  2004/06/01 19:46:18  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.12
 * PRODUCTION
 * ===========================================================================
 */
/*  $Id: fasta.cpp,v 1000.2 2004/06/01 19:46:18 gouriano Exp $
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Authors:  Aaron Ucko, NCBI;  Anatoliy Kuznetsov, NCBI.
*
* File Description:
*   Reader for FASTA-format sequences.  (The writer is CFastaOStream, in
*   src/objmgr/util/sequence.cpp.)
*
* ===========================================================================
*/

#include <ncbi_pch.hpp>
#include <objtools/readers/fasta.hpp>
#include <objtools/readers/reader_exception.hpp>
#include <corelib/ncbiutil.hpp>
#include <util/format_guess.hpp>

#include <objects/general/Object_id.hpp>

#include <objects/seq/Bioseq.hpp>
#include <objects/seq/Delta_ext.hpp>
#include <objects/seq/Delta_seq.hpp>
#include <objects/seq/IUPACaa.hpp>
#include <objects/seq/IUPACna.hpp>
#include <objects/seq/Seq_descr.hpp>
#include <objects/seq/Seq_ext.hpp>
#include <objects/seq/Seq_inst.hpp>
#include <objects/seq/Seq_literal.hpp>
#include <objects/seq/Seqdesc.hpp>
#include <objects/seq/seqport_util.hpp>

#include <objects/seqloc/Seq_id.hpp>
#include <objects/seqloc/Seq_interval.hpp>
#include <objects/seqloc/Seq_loc.hpp>
#include <objects/seqloc/Seq_loc_mix.hpp>
#include <objects/seqloc/Seq_point.hpp>

#include <objects/seqset/Bioseq_set.hpp>

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)


/// Find where the '|'-delimited FASTA ID starting at @a pos ends.
///
/// The text between @a pos and the first '|' is passed to
/// CSeq_id::WhichInverseSeqId().  If there is no '|' at all, or that
/// leading field is not recognized (e_not_set), the ID is rejected.
/// Otherwise the following '|'-separated fields are examined in turn until
/// one of them is itself recognized by WhichInverseSeqId(); the current ID
/// is taken to end at the '|' immediately preceding that field.
///
/// @param str  Text containing one or more '|'-delimited IDs.
/// @param pos  Offset in @a str at which the ID to measure begins.
/// @return     Offset of the '|' that terminates the ID, str.size() when
///             the ID extends to the end of @a str, or NPOS when no ID
///             could be recognized at @a pos.
static SIZE_TYPE s_EndOfFastaID(const string& str, SIZE_TYPE pos)
{
    SIZE_TYPE vbar = str.find('|', pos);
    if (vbar == NPOS) {
        return NPOS; // bad
    }

    CSeq_id::E_Choice choice =
        CSeq_id::WhichInverseSeqId(str.substr(pos, vbar - pos).c_str());

#if 1
    if (choice != CSeq_id::e_not_set) {
        SIZE_TYPE vbar_prev = vbar;
        for ( ;  ;  vbar_prev = vbar) {
            vbar = str.find('|', vbar_prev + 1);
            if (vbar == NPOS) {
                // no further separators: the ID runs to the end of str
                break;
            }
            choice = CSeq_id::WhichInverseSeqId(
                str.substr(vbar_prev + 1, vbar - vbar_prev - 1).c_str());
            if (choice != CSeq_id::e_not_set) {
                // this field starts the next ID, so the current one
                // ended at the previous separator
                vbar = vbar_prev;
                break;
            }
        }
    } else {
        return NPOS; // bad
    }
#else
    // Disabled alternative: derive the field count from the ID type.
    switch (choice) {
    case CSeq_id::e_Patent: case CSeq_id::e_Other: // 3 args
        vbar = str.find('|', vbar + 1);
        // intentional fall-through - this allows us to correctly
        // calculate the number of '|' separations for FastA IDs

    case CSeq_id::e_Genbank:   case CSeq_id::e_Embl:    case CSeq_id::e_Pir:
    case CSeq_id::e_Swissprot: case CSeq_id::e_General: case CSeq_id::e_Ddbj:
    case CSeq_id::e_Prf:       case CSeq_id::e_Pdb:     case CSeq_id::e_Tpg:
    case CSeq_id::e_Tpe:       case CSeq_id::e_Tpd:
        // 2 args
        if (vbar == NPOS) {
            return NPOS; // bad
        }
        vbar = str.find('|', vbar + 1);
        // intentional fall-through - this allows us to correctly
        // calculate the number of '|' separations for FastA IDs

    case CSeq_id::e_Local: case CSeq_id::e_Gibbsq: case CSeq_id::e_Gibbmt:
    case CSeq_id::e_Giim:  case CSeq_id::e_Gi:
        // 1 arg
        if (vbar == NPOS) {
            if (choice == CSeq_id::e_Other) {
                // this is acceptable - member is optional
                break;
            }
            return NPOS; // bad
        }
        vbar = str.find('|', vbar + 1);
        break;

    default: // unrecognized or not set
        return NPOS; // bad
    }
#endif

    return (vbar == NPOS) ? str.size() : vbar;
}


/// Finalize the sequence data held by @a seq after reading.
///
/// For delta and raw representations, each IUPACna payload has its length
/// recorded and is then handed to CSeqportUtil::Pack(); any other payload
/// is accessed as IUPACaa, has its length recorded, and is asked to give
/// back surplus string capacity.  Other representations (including
/// "not set") are left untouched.
///
/// @param seq  Bioseq to fix up; must be non-null (checked by _ASSERT).
static void s_FixSeqData(CBioseq* seq)
{
    _ASSERT(seq);
    CSeq_inst& inst = seq->SetInst();
    switch (inst.GetRepr()) {
    case CSeq_inst::eRepr_delta:
    {
        // NOTE(review): `length` is accumulated over all literals but is
        // never stored anywhere in this function -- confirm whether
        // inst.SetLength(length) was intended or is handled by the caller.
        TSeqPos length = 0;
        NON_CONST_ITERATE (CDelta_ext::Tdata, it,
                           inst.SetExt().SetDelta().Set()) {
            if ((*it)->IsLiteral()) {
                CSeq_literal& lit  = (*it)->SetLiteral();
                CSeq_data&    data = lit.SetSeq_data();
                if (data.IsIupacna()) {
                    lit.SetLength(data.GetIupacna().Get().size());
                    CSeqportUtil::Pack(&data);
                } else {
                    string& s = data.SetIupacaa().Set();
                    lit.SetLength(s.size());
                    // request release of extra capacity (the standard
                    // library is not obliged to honor a shrinking reserve)
                    s.reserve(s.size());
                }
                length += lit.GetLength();
            }
        }
        break;
    }
    case CSeq_inst::eRepr_raw:
    {
        CSeq_data& data = inst.SetSeq_data();
        if (data.IsIupacna()) {
            inst.SetLength(data.GetIupacna().Get().size());
            CSeqportUtil::Pack(&data);
        } else {
            string& s = data.SetIupacaa().Set();
            inst.SetLength(s.size());
            // request release of extra capacity (the standard library is
            // not obliged to honor a shrinking reserve)
            s.reserve(s.size());
        }
        break;
    }
    default: // especially not_set!
        break;
    }
}


/// Append @a residues to the sequence data of @a inst.
///
/// The destination is chosen as follows: if @a inst has a delta extension,
/// the last delta element's literal is used (a fresh CDelta_seq is appended
/// first when the list is empty or its last element is not a literal);
/// otherwise the instance's own Seq-data is used.  The data is stored as
/// IUPACaa when the molecule type is eMol_aa and as IUPACna otherwise.
/// If the destination already holds that encoding the residues are
/// appended; if not, the destination is set to @a residues.
///
/// @param inst      Sequence instance receiving the data.
/// @param residues  Residue characters to add.
void s_AddData(CSeq_inst& inst, const string& residues)
{
    CRef<CSeq_data> data;
    if (inst.IsSetExt()  &&  inst.GetExt().IsDelta()) {
        CDelta_ext::Tdata& delta_data = inst.SetExt().SetDelta().Set();
        if (delta_data.empty()  ||  !delta_data.back()->IsLiteral()) {
            CRef<CDelta_seq> delta_seq(new CDelta_seq);
            delta_data.push_back(delta_seq);
            data = &delta_seq->SetLiteral().SetSeq_data();
        } else {
            data = &delta_data.back()->SetLiteral().SetSeq_data();
        }
    } else {
        data = &inst.SetSeq_data();
    }

    // Non-null only when there is existing text of the right encoding to
    // append to.
    string* s = 0;
    if (inst.GetMol() == CSeq_inst::eMol_aa) {
        if (data->IsIupacaa()) {
            s = &data->SetIupacaa().Set();
        } else {
            data->SetIupacaa().Set(residues);
        }
    } else {
        if (data->IsIupacna()) {
            s = &data->SetIupacna().Set();
        } else {
            data->SetIupacna().Set(residues);
        }
    }

    if (s) {
        // grow exponentially to avoid O(n^2) behavior
        if (s->capacity() < s->size() + residues.size()) {
            s->reserve(s->capacity()
                       + max(residues.size(), s->capacity() / 2));
        }
        *s += residues;
    }
}


/// Parse a FASTA definition line into sequence IDs and a title.
///
/// The character at the current start position is skipped (index 0 on the
/// first pass -- presumably the '>' marker -- and the '\1' separator on
/// later passes).  The text up to the next space or tab is the "name":
///  - with fReadFasta_NoParseID it becomes a single local ID verbatim;
///  - otherwise it is split into '|'-delimited IDs via s_EndOfFastaID().
///    If the very first ID is not recognized the whole name becomes a local
///    ID; if a later one is not recognized an exception is thrown.
/// Text following the name, up to the next '\1' (or end of line), is stored
/// in @a title if @a title is still empty.  Additional '\1'-separated
/// entries are processed only when fReadFasta_AllSeqIds is set.  If no IDs
/// at all were produced, a local integer ID is generated from @a counter.
///
/// @param ids      Receives the parsed IDs (appended).
/// @param title    Receives the title text if empty on entry.
/// @param line     The definition line.
/// @param flags    Reader flags (see above).
/// @param counter  Incremented and used for a generated local ID when no
///                 IDs were found; dereferenced in that case, so it must
///                 then be non-null.
/// @return         eMol_na / eMol_aa if an ID's IdentifyAccession() result
///                 indicates nucleotide / protein (and fReadFasta_ForceType
///                 is not set); otherwise eMol_not_set.
/// @throws CObjReaderParseException (eFormat) for an unparsable ID that is
///         not the first in its name.
static CSeq_inst::EMol s_ParseFastaDefline(CBioseq::TId& ids, string& title,
                                           const string& line,
                                           TReadFastaFlags flags, int* counter)
{
    SIZE_TYPE       start = 0;
    CSeq_inst::EMol mol   = CSeq_inst::eMol_not_set;
    do {
        ++start;
        SIZE_TYPE space = line.find_first_of(" \t", start);
        string    name  = line.substr(start, space - start), local;

        if (flags & fReadFasta_NoParseID) {
            local = name;
        } else {
            // try to parse out IDs
            SIZE_TYPE pos = 0;
            while (pos < name.size()) {
                SIZE_TYPE end = s_EndOfFastaID(name, pos);
                if (end == NPOS) {
                    if (pos > 0) {
                        NCBI_THROW2(CObjReaderParseException, eFormat,
                                    "s_ParseFastaDefline: Bad ID "
                                    + name.substr(pos),
                                    pos);
                    } else {
                        local = name;
                        break;
                    }
                }

                CRef<CSeq_id> id(new CSeq_id(name.substr(pos, end - pos)));
                ids.push_back(id);
                if (mol == CSeq_inst::eMol_not_set
                    &&  !(flags & fReadFasta_ForceType)) {
                    CSeq_id::EAccessionInfo ai = id->IdentifyAccession();
                    if (ai & CSeq_id::fAcc_nuc) {
                        mol = CSeq_inst::eMol_na;
                    } else if (ai & CSeq_id::fAcc_prot) {
                        mol = CSeq_inst::eMol_aa;
                    }
                }
                pos = end + 1;
            }
        }

        if ( !local.empty() ) {
            ids.push_back(CRef<CSeq_id>
                          (new CSeq_id(CSeq_id::e_Local, local, kEmptyStr)));
        }

        // Advance to the next '\1'-separated entry, if any.
        start = line.find('\1', start);
        if (space != NPOS  &&  title.empty()) {
            title.assign(line, space + 1,
                         (start == NPOS) ? NPOS : (start - space - 1));
        }
    } while (start != NPOS  &&  (flags & fReadFasta_AllSeqIds));

    if (ids.empty()) {
        CRef<CSeq_id> id(new CSeq_id);
        id->SetLocal().SetId(++*counter);
        ids.push_back(id);
    }

    return mol;
}


static void s_GuessMol(CSeq_inst::EMol& mol, const string& data, TReadFastaFlags flags, istream& in)
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?