fasta.cpp

来自「ncbi源码」· C++ 代码 · 共 604 行 · 第 1/2 页

CPP
604
字号
/*
 * ===========================================================================
 * PRODUCTION $Log: fasta.cpp,v $
 * PRODUCTION Revision 1000.2  2004/06/01 19:46:18  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.12
 * PRODUCTION
 * ===========================================================================
 */

/*  $Id: fasta.cpp,v 1000.2 2004/06/01 19:46:18 gouriano Exp $
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Authors:  Aaron Ucko, NCBI;  Anatoliy Kuznetsov, NCBI.
*
* File Description:
*   Reader for FASTA-format sequences.  (The writer is CFastaOStream, in
*   src/objmgr/util/sequence.cpp.)
*
* ===========================================================================
*/

#include <ncbi_pch.hpp>
#include <objtools/readers/fasta.hpp>
#include <objtools/readers/reader_exception.hpp>

#include <corelib/ncbiutil.hpp>
#include <util/format_guess.hpp>

#include <objects/general/Object_id.hpp>

#include <objects/seq/Bioseq.hpp>
#include <objects/seq/Delta_ext.hpp>
#include <objects/seq/Delta_seq.hpp>
#include <objects/seq/IUPACaa.hpp>
#include <objects/seq/IUPACna.hpp>
#include <objects/seq/Seq_descr.hpp>
#include <objects/seq/Seq_ext.hpp>
#include <objects/seq/Seq_inst.hpp>
#include <objects/seq/Seq_literal.hpp>
#include <objects/seq/Seqdesc.hpp>
#include <objects/seq/seqport_util.hpp>

#include <objects/seqloc/Seq_id.hpp>
#include <objects/seqloc/Seq_interval.hpp>
#include <objects/seqloc/Seq_loc.hpp>
#include <objects/seqloc/Seq_loc_mix.hpp>
#include <objects/seqloc/Seq_point.hpp>

#include <objects/seqset/Bioseq_set.hpp>

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)


/// Locate the end of the '|'-delimited FASTA-style ID that begins at
/// position pos of str.
///
/// The text between pos and the first '|' must be a tag that
/// CSeq_id::WhichInverseSeqId() recognizes.  The ID then extends up to
/// (but not including) the '|' that precedes the next recognized tag, or
/// to the end of the string if no further tag is found.
///
/// @param str  String holding one or more '|'-joined IDs.
/// @param pos  Offset in str at which the ID of interest starts.
/// @return     Offset one past the last character of the ID (either the
///             index of a separating '|' or str.size()); NPOS if there is
///             no '|' after pos or the leading tag is not recognized.
static SIZE_TYPE s_EndOfFastaID(const string& str, SIZE_TYPE pos)
{
    SIZE_TYPE vbar = str.find('|', pos);
    if (vbar == NPOS) {
        return NPOS; // bad: no tag separator at all
    }
    if (CSeq_id::WhichInverseSeqId(str.substr(pos, vbar - pos).c_str())
        == CSeq_id::e_not_set) {
        return NPOS; // bad: unrecognized tag
    }

    // Walk the remaining '|'-separated fields.  A field that is itself a
    // recognized tag starts the next ID, so this ID stops at the bar just
    // before it.
    for (;;) {
        const SIZE_TYPE next = str.find('|', vbar + 1);
        if (next == NPOS) {
            return str.size(); // ID runs to the end of the string
        }
        if (CSeq_id::WhichInverseSeqId(
                str.substr(vbar + 1, next - vbar - 1).c_str())
            != CSeq_id::e_not_set) {
            return vbar;
        }
        vbar = next;
    }
}


/// Finalize one CSeq_data object and report the residue count to store.
///
/// IUPACna data is measured and then handed to CSeqportUtil::Pack(); the
/// length is taken from the IUPAC string before Pack() is applied.  For
/// anything else the data is accessed as IUPACaa, measured, and asked to
/// give back unused string capacity.
///
/// @param data  Sequence data to finalize (modified in place).
/// @return      Number of residues, as measured from the IUPAC string.
static TSeqPos s_FinishSeqData(CSeq_data& data)
{
    if (data.IsIupacna()) {
        const TSeqPos length
            = static_cast<TSeqPos>(data.GetIupacna().Get().size());
        CSeqportUtil::Pack(&data);
        return length;
    }
    string& s = data.SetIupacaa().Set();
    s.reserve(s.size()); // request release of extra allocation
    return static_cast<TSeqPos>(s.size());
}


/// Set the length fields of a freshly read sequence and compact its data.
///
/// For a delta representation every literal segment gets its own length;
/// for a raw representation the instance length is set.  Other
/// representations (including "not set") are left untouched.
///
/// NOTE(review): the previous revision summed the literal lengths of a
/// delta sequence into a local variable that was never read; no total
/// length is stored on the instance for delta sequences here.  Confirm
/// whether the caller is expected to set it.
///
/// @param seq  Sequence to fix up; must not be null.
static void s_FixSeqData(CBioseq* seq)
{
    _ASSERT(seq);
    CSeq_inst& inst = seq->SetInst();
    switch (inst.GetRepr()) {
    case CSeq_inst::eRepr_delta:
        NON_CONST_ITERATE (CDelta_ext::Tdata, it,
                           inst.SetExt().SetDelta().Set()) {
            if ((*it)->IsLiteral()) {
                CSeq_literal& lit = (*it)->SetLiteral();
                lit.SetLength(s_FinishSeqData(lit.SetSeq_data()));
            }
        }
        break;

    case CSeq_inst::eRepr_raw:
        inst.SetLength(s_FinishSeqData(inst.SetSeq_data()));
        break;

    default: // especially not_set!
        break;
    }
}


/// Append residues to the sequence data of inst.
///
/// If inst carries a delta extension, the residues go into its last
/// segment when that segment is a literal; otherwise a new literal segment
/// is appended first.  Without a delta extension the residues go into the
/// instance's own sequence data.  The data is stored as IUPACaa when the
/// molecule type is eMol_aa and as IUPACna otherwise.
///
/// @param inst      Sequence instance to extend.
/// @param residues  Residues to append.
void s_AddData(CSeq_inst& inst, const string& residues)
{
    CRef<CSeq_data> data;
    if (inst.IsSetExt()  &&  inst.GetExt().IsDelta()) {
        CDelta_ext::Tdata& delta_data = inst.SetExt().SetDelta().Set();
        if (delta_data.empty()  ||  !delta_data.back()->IsLiteral()) {
            // No literal to extend: start a fresh one at the end
            CRef<CDelta_seq> delta_seq(new CDelta_seq);
            delta_data.push_back(delta_seq);
        }
        data = &delta_data.back()->SetLiteral().SetSeq_data();
    } else {
        data = &inst.SetSeq_data();
    }

    // s stays null when the data is not yet of the wanted kind; in that
    // case the residues are simply assigned rather than appended.
    string* s = 0;
    if (inst.GetMol() == CSeq_inst::eMol_aa) {
        if (data->IsIupacaa()) {
            s = &data->SetIupacaa().Set();
        } else {
            data->SetIupacaa().Set(residues);
        }
    } else {
        if (data->IsIupacna()) {
            s = &data->SetIupacna().Set();
        } else {
            data->SetIupacna().Set(residues);
        }
    }

    if ( !s ) {
        return;
    }
    // grow exponentially to avoid O(n^2) behavior
    if (s->capacity() < s->size() + residues.size()) {
        s->reserve(s->capacity()
                   + max(residues.size(), s->capacity() / 2));
    }
    *s += residues;
}


/// Parse a FASTA defline into sequence IDs and a title.
///
/// The line may hold several entries separated by Ctrl-A ('\1'); the first
/// character of the line and each separator are skipped.  Within an entry
/// the ID token ends at the first space or tab, or at the next Ctrl-A,
/// whichever comes first; text after the whitespace (up to the next
/// Ctrl-A) is the title.  Only the first entry is examined unless
/// fReadFasta_AllSeqIds is set.
///
/// Unless fReadFasta_NoParseID is set, the token is split into
/// '|'-delimited IDs with s_EndOfFastaID(); a token that does not start
/// with a recognized ID is kept whole as a local ID.  If no ID at all was
/// found, a numeric local ID is generated from *counter.
///
/// @param ids      Receives the parsed IDs (appended).
/// @param title    Receives the title of the first entry that has one;
///                 left alone if already non-empty.
/// @param line     The defline, including its first (marker) character.
/// @param flags    Reader flags (fReadFasta_NoParseID, fReadFasta_ForceType
///                 and fReadFasta_AllSeqIds are consulted here).
/// @param counter  Counter for generated local IDs; pre-incremented when
///                 used.  Must not be null.
/// @return         Molecule type deduced from the first ID whose accession
///                 identifies it as nucleotide or protein; eMol_not_set if
///                 none does or fReadFasta_ForceType is set.
/// @throws CObjReaderParseException (eFormat) if a token starts with valid
///         IDs but continues with something unparsable.
static CSeq_inst::EMol s_ParseFastaDefline(CBioseq::TId& ids, string& title,
                                           const string& line,
                                           TReadFastaFlags flags, int* counter)
{
    SIZE_TYPE       start = 0;
    CSeq_inst::EMol mol   = CSeq_inst::eMol_not_set;
    do {
        ++start; // step over the marker character / Ctrl-A separator

        const SIZE_TYPE next  = line.find('\1', start);
        SIZE_TYPE       space = line.find_first_of(" \t", start);
        if (space != NPOS  &&  next != NPOS  &&  next < space) {
            // The whitespace lies beyond the next Ctrl-A, so it belongs to
            // a later entry: this entry has no title, and its ID token
            // must stop at the separator instead of swallowing it.
            space = NPOS;
        }
        const SIZE_TYPE name_end = (space != NPOS) ? space : next;
        string name = line.substr(start, (name_end == NPOS)
                                         ? NPOS : (name_end - start));
        string local;

        if (flags & fReadFasta_NoParseID) {
            local = name;
        } else {
            // try to parse out IDs
            SIZE_TYPE pos = 0;
            while (pos < name.size()) {
                SIZE_TYPE end = s_EndOfFastaID(name, pos);
                if (end == NPOS) {
                    if (pos > 0) {
                        NCBI_THROW2(CObjReaderParseException, eFormat,
                                    "s_ParseFastaDefline: Bad ID "
                                    + name.substr(pos),
                                    pos);
                    } else {
                        // Nothing recognizable: treat the whole token as
                        // a local ID
                        local = name;
                        break;
                    }
                }

                CRef<CSeq_id> id(new CSeq_id(name.substr(pos, end - pos)));
                ids.push_back(id);
                if (mol == CSeq_inst::eMol_not_set
                    &&  !(flags & fReadFasta_ForceType)) {
                    CSeq_id::EAccessionInfo ai = id->IdentifyAccession();
                    if (ai & CSeq_id::fAcc_nuc) {
                        mol = CSeq_inst::eMol_na;
                    } else if (ai & CSeq_id::fAcc_prot) {
                        mol = CSeq_inst::eMol_aa;
                    }
                }
                pos = end + 1;
            }
        }

        if ( !local.empty() ) {
            ids.push_back(CRef<CSeq_id>
                          (new CSeq_id(CSeq_id::e_Local, local, kEmptyStr)));
        }

        if (space != NPOS  &&  title.empty()) {
            // space < next is guaranteed here, so the count cannot wrap
            title.assign(line, space + 1,
                         (next == NPOS) ? NPOS : (next - space - 1));
        }
        start = next;
    } while (start != NPOS  &&  (flags & fReadFasta_AllSeqIds));

    if (ids.empty()) {
        _ASSERT(counter);
        CRef<CSeq_id> id(new CSeq_id);
        id->SetLocal().SetId(++*counter);
        ids.push_back(id);
    }

    return mol;
}


static void s_GuessMol(CSeq_inst::EMol& mol, const string& data,
                       TReadFastaFlags flags, istream& in)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?