aln_reader.cpp

来自「ncbi源码」· C++ 代码 · 共 437 行

CPP
437
字号
/* * =========================================================================== * PRODUCTION $Log: aln_reader.cpp,v $ * PRODUCTION Revision 1000.1  2004/06/01 19:46:14  gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.10 * PRODUCTION * =========================================================================== *//*  $Id: aln_reader.cpp,v 1000.1 2004/06/01 19:46:14 gouriano Exp $ * =========================================================================== * *                            PUBLIC DOMAIN NOTICE *               National Center for Biotechnology Information * *  This software/database is a "United States Government Work" under the *  terms of the United States Copyright Act.  It was written as part of *  the author's official duties as a United States Government employee and *  thus cannot be copyrighted.  This software/database is freely available *  to the public for use. The National Library of Medicine and the U.S. *  Government have not placed any restriction on its use or reproduction. * *  Although all reasonable efforts have been taken to ensure the accuracy *  and reliability of the software and data, the NLM and the U.S. *  Government do not and cannot warrant the performance or results that *  may be obtained by using this software or data. The NLM and the U.S. *  Government disclaim all warranties, express or implied, including *  warranties of performance, merchantability or fitness for any particular *  purpose. * *  Please cite the author in any work or product based on this material. * * =========================================================================== * * Authors:  Josh Cherry * * File Description:  C++ wrappers for alignment file reading * */#include <ncbi_pch.hpp>#include <objtools/readers/aln_reader.hpp>#include <objtools/readers/reader_exception.hpp>#include <util/creaders/alnread.h>#include <util/format_guess.hpp>#include <objects/seqalign/Dense_seg.hpp>#include <objects/seqloc/Seq_id.hpp>#include <objects/general/Object_id.hpp>#include <objects/seq/Seq_annot.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objects/seq/Seq_data.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/IUPACaa.hpp>#include <objects/seq/Bioseq.hpp>#include <objects/seq/seqport_util.hpp>BEGIN_NCBI_SCOPEUSING_SCOPE(objects);static char * ALIGNMENT_CALLBACK s_ReadLine(void *user_data){    CNcbiIstream *is = static_cast<CNcbiIstream *>(user_data);    if (!*is) {        return 0;    }    string s;    NcbiGetlineEOL(*is, s);    return strdup(s.c_str());}static void ALIGNMENT_CALLBACK s_ReportError(TErrorInfoPtr err_ptr,                                             void *user_data){    if (err_ptr->category != eAlnErr_Fatal) {        // ignore non-fatal errors        return;    }    string msg = "Error reading alignment file";    if (err_ptr->line_num > -1) {        msg += " at line " + NStr::IntToString(err_ptr->line_num);    }    if (err_ptr->message) {        msg += ":  ";        msg += err_ptr->message;    }    LOG_POST(Error << msg);}CAlnReader::~CAlnReader(){}void CAlnReader::Read(){    if (m_ReadDone) {        return;    }    // make a SSequenceInfo corresponding to our CSequenceInfo argument    SSequenceInfo info;    info.alphabet      = const_cast<char *>(m_Alphabet.c_str());    info.beginning_gap = const_cast<char *>(m_BeginningGap.c_str());    info.end_gap       = const_cast<char *>(m_EndGap.c_str());;    info.middle_gap    = const_cast<char *>(m_MiddleGap.c_str());    info.missing       = const_cast<char *>(m_Missing.c_str());    info.match         = const_cast<char *>(m_Match.c_str());    // read the alignment stream    TAlignmentFilePtr afp;    afp = ReadAlignmentFile(s_ReadLine, (void *) &m_IS,                            s_ReportError, 0, &info);    if (!afp) {        NCBI_THROW2(CObjReaderParseException, eFormat,                   "Error reading alignment", 0);    }    // build the CAlignment    m_Seqs.resize(afp->num_sequences);    m_Ids.resize(afp->num_sequences);    for (int i = 0;  i < afp->num_sequences;  ++i) {        m_Seqs[i] = NStr::ToUpper(afp->sequences[i]);        m_Ids[i] = afp->ids[i];    }    m_Organisms.resize(afp->num_organisms);    for (int i = 0;  i < afp->num_organisms;  ++i) {        if (afp->organisms[i]) {            m_Organisms[i] = afp->organisms[i];        } else {            m_Organisms[i].erase();        }    }    m_Deflines.resize(afp->num_deflines);    for (int i = 0;  i < afp->num_deflines;  ++i) {        if (afp->deflines[i]) {            m_Deflines[i] = afp->deflines[i];        } else {            m_Deflines[i].erase();        }    }    AlignmentFileFree(afp);    {{        m_Dim = m_Ids.size();    }}    m_ReadDone = true;    return;}void CAlnReader::SetClustal(EAlphabet alpha){    SetAlphabet(alpha);    SetAllGap("-");}void CAlnReader::SetPaup(EAlphabet alpha){    SetAlphabet(alpha);    SetAllGap("-");}void CAlnReader::SetPhylip(EAlphabet alpha){    SetAlphabet(alpha);    SetAllGap("-");}CRef<CSeq_align> CAlnReader::GetSeqAlign(){    if (m_Aln) {        return m_Aln;    } else if ( !m_ReadDone ) {        NCBI_THROW2(CObjReaderParseException, eFormat,                   "CAlnReader::GetSeqAlign(): "                   "Seq_align is not available until after Read()", 0);    }    typedef CDense_seg::TNumseg TNumseg;    typedef CDense_seg::TDim TNumrow;    m_Aln = new CSeq_align();    m_Aln->SetType(CSeq_align::eType_not_set);    m_Aln->SetDim(m_Dim);    CDense_seg& ds = m_Aln->SetSegs().SetDenseg();    ds.SetDim(m_Dim);        CDense_seg::TIds&     ids     = ds.SetIds();    CDense_seg::TStarts&  starts  = ds.SetStarts();    //CDense_seg::TStrands& strands = ds.SetStrands();    CDense_seg::TLens&    lens    = ds.SetLens();    ids.resize(m_Dim);    // get the length of the aln row, asuming all rows are the same    TSeqPos aln_stop = m_Seqs[0].size();     for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {        CRef<CSeq_id> seq_id(new CSeq_id(m_Ids[row_i]));        if (seq_id->Which() == CSeq_id::e_not_set) {            seq_id->SetLocal().SetStr(m_Ids[row_i]);        }        ids[row_i] = seq_id;    }    m_SeqVec.resize(m_Dim);    for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {        m_SeqVec[row_i].resize(aln_stop, 0); // size unknown, resize to max    }    m_SeqLen.resize(m_Dim, 0);    vector<bool> is_gap;  is_gap.resize(m_Dim, true);    vector<bool> prev_is_gap;  prev_is_gap.resize(m_Dim, true);    vector<TSignedSeqPos> next_start; next_start.resize(m_Dim, 0);    int starts_i = 0;    TSeqPos prev_aln_pos = 0, prev_len = 0;    bool new_seg = false;    TNumseg numseg = 0;        for (TSeqPos aln_pos = 0; aln_pos < aln_stop; aln_pos++) {        for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {            const char& residue = m_Seqs[row_i][aln_pos];            if (residue != m_MiddleGap[0]  &&                residue != m_EndGap[0]  &&                residue != m_Missing[0]) {                if (is_gap[row_i]) {                    is_gap[row_i] = false;                    new_seg = true;                }                // add to the sequence vector                m_SeqVec[row_i][m_SeqLen[row_i]++] = residue;            } else {                if ( !is_gap[row_i] ) {                    is_gap[row_i] = true;                    new_seg = true;                }            }        }        if (new_seg) {            numseg++;            if (aln_pos) { // if not the first seg                lens.push_back(prev_len = aln_pos - prev_aln_pos);                for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {                    if ( !prev_is_gap[row_i] ) {                        next_start[row_i] += prev_len;                    }                }            }            starts.resize(starts_i + m_Dim);            for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {                if (is_gap[row_i]) {                    starts[starts_i++] = -1;                } else {                    starts[starts_i++] = next_start[row_i];;                }                prev_is_gap[row_i] = is_gap[row_i];            }            prev_aln_pos = aln_pos;            new_seg = false;        }    }    for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {        m_SeqVec[row_i].resize(m_SeqLen[row_i]); // resize down to actual size    }    lens.push_back(aln_stop - prev_aln_pos);    //strands.resize(numseg * m_Dim, eNa_strand_plus);    _ASSERT(lens.size() == numseg);    ds.SetNumseg(numseg);#if _DEBUG    m_Aln->Validate(true);#endif        return m_Aln;}CRef<CSeq_entry> CAlnReader::GetSeqEntry(){    if (m_Entry) {        return m_Entry;    } else if ( !m_ReadDone ) {        NCBI_THROW2(CObjReaderParseException, eFormat,                   "CAlnReader::GetSeqEntry(): "                   "Seq_entry is not available until after Read()", 0);    }    m_Entry = new CSeq_entry();    CRef<CSeq_annot> seq_annot (new CSeq_annot);    seq_annot->SetData().SetAlign().push_back(GetSeqAlign());    m_Entry->SetSet().SetClass(CBioseq_set::eClass_pop_set);    m_Entry->SetSet().SetAnnot().push_back(seq_annot);    CBioseq_set::TSeq_set& seq_set = m_Entry->SetSet().SetSeq_set();    typedef CDense_seg::TDim TNumrow;    for (TNumrow row_i = 0; row_i < m_Dim; row_i++) {        const string& seq_str     = m_SeqVec[row_i];        const size_t& seq_str_len = seq_str.size();        CRef<CSeq_entry> seq_entry (new CSeq_entry);        // seq-id        CRef<CSeq_id> seq_id(new CSeq_id(m_Ids[row_i]));        seq_entry->SetSeq().SetId().push_back(seq_id);        if (seq_id->Which() == CSeq_id::e_not_set) {            seq_id->SetLocal().SetStr(m_Ids[row_i]);        }        // mol        CSeq_inst::EMol mol   = CSeq_inst::eMol_not_set;        CSeq_id::EAccessionInfo ai = seq_id->IdentifyAccession();        if (ai & CSeq_id::fAcc_nuc) {            mol = CSeq_inst::eMol_na;        } else if (ai & CSeq_id::fAcc_prot) {            mol = CSeq_inst::eMol_aa;        } else {            switch (CFormatGuess::SequenceType(seq_str.data(), seq_str_len)) {            case CFormatGuess::eNucleotide:  mol = CSeq_inst::eMol_na;  break;            case CFormatGuess::eProtein:     mol = CSeq_inst::eMol_aa;  break;            default:                         break;            }        }        // seq-inst        CRef<CSeq_inst> seq_inst (new CSeq_inst);        seq_entry->SetSeq().SetInst(*seq_inst);        seq_set.push_back(seq_entry);        // repr        seq_inst->SetRepr(CSeq_inst::eRepr_raw);        // mol        seq_inst->SetMol(mol);        // len        _ASSERT(seq_str_len == m_SeqLen[row_i]);        seq_inst->SetLength(seq_str_len);        // data        CSeq_data& data = seq_inst->SetSeq_data();        if (mol == CSeq_inst::eMol_aa) {            data.SetIupacaa().Set(seq_str);        } else {            data.SetIupacna().Set(seq_str);            CSeqportUtil::Pack(&data);        }    }            return m_Entry;}END_NCBI_SCOPE/* * =========================================================================== * $Log: aln_reader.cpp,v $ * Revision 1000.1  2004/06/01 19:46:14  gouriano * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.10 * * Revision 1.10  2004/05/21 21:42:55  gorelenk * Added PCH ncbi_pch.hpp * * Revision 1.9  2004/03/23 19:35:21  jcherry * Set class to pop-set rather than nuc-prot * * Revision 1.8  2004/03/02 20:18:50  jcherry * Don't throw in callback called by C code * * Revision 1.7  2004/03/01 15:26:33  dicuccio * Code clean-up.  Added enum for standard alphabets.  Added new APIs to set * standard parameters for other alignment types (implemented with unclear details * currently).  Added better exception handling. * * Revision 1.6  2004/02/24 17:52:12  jcherry * iupacaa for proteins * * Revision 1.5  2004/02/20 16:40:57  ucko * Fix path to aln_reader.hpp again * * Revision 1.4  2004/02/20 16:21:09  todorov * Better seq id; Try to id the mol type; Packed seq data; * Fixed a couple of bugs in GetSeqAlign * * Revision 1.3  2004/02/20 01:21:35  ucko * Again, erase() strings rather than clear()ing them for compatibility * with G++ 2.95.  (The fix seems to have gotten lost in the recent * move.) * * Revision 1.2  2004/02/19 18:38:13  ucko * Update path to aln_reader.hpp. * * Revision 1.1  2004/02/19 16:54:59  todorov * File moved from util/creaders and renamed to aln_reader * * Revision 1.4  2004/02/18 22:29:31  todorov * Converted to single class. Added methods for creating Seq-align and Seq-entry. A working version, but still need to polish: seq-ids, na/aa recognition, etc. * * Revision 1.3  2004/02/11 17:58:12  gorelenk * Added missed modificator ALIGNMENT_CALLBACK to definitions of s_ReadLine * and ALIGNMENT_CALLBACK. * * Revision 1.2  2004/02/10 02:58:09  ucko * erase() strings rather than clear()ing them for compatibility with G++ 2.95. * * Revision 1.1  2004/02/09 16:02:34  jcherry * Initial versionnnn * * =========================================================================== */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?