aln_reader.cpp
来自「ncbi源码」· C++ 代码 · 共 437 行
CPP
437 行
/* * =========================================================================== * PRODUCTION $Log: aln_reader.cpp,v $ * PRODUCTION Revision 1000.1 2004/06/01 19:46:14 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.10 * PRODUCTION * =========================================================================== *//* $Id: aln_reader.cpp,v 1000.1 2004/06/01 19:46:14 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Authors: Josh Cherry * * File Description: C++ wrappers for alignment file reading * */#include <ncbi_pch.hpp>#include <objtools/readers/aln_reader.hpp>#include <objtools/readers/reader_exception.hpp>#include <util/creaders/alnread.h>#include <util/format_guess.hpp>#include <objects/seqalign/Dense_seg.hpp>#include <objects/seqloc/Seq_id.hpp>#include <objects/general/Object_id.hpp>#include <objects/seq/Seq_annot.hpp>#include <objects/seq/Seq_inst.hpp>#include <objects/seqset/Bioseq_set.hpp>#include <objects/seq/Seq_data.hpp>#include <objects/seq/IUPACna.hpp>#include <objects/seq/IUPACaa.hpp>#include <objects/seq/Bioseq.hpp>#include <objects/seq/seqport_util.hpp>BEGIN_NCBI_SCOPEUSING_SCOPE(objects);static char * ALIGNMENT_CALLBACK s_ReadLine(void *user_data){ CNcbiIstream *is = static_cast<CNcbiIstream *>(user_data); if (!*is) { return 0; } string s; NcbiGetlineEOL(*is, s); return strdup(s.c_str());}static void ALIGNMENT_CALLBACK s_ReportError(TErrorInfoPtr err_ptr, void *user_data){ if (err_ptr->category != eAlnErr_Fatal) { // ignore non-fatal errors return; } string msg = "Error reading alignment file"; if (err_ptr->line_num > -1) { msg += " at line " + NStr::IntToString(err_ptr->line_num); } if (err_ptr->message) { msg += ": "; msg += err_ptr->message; } LOG_POST(Error << msg);}CAlnReader::~CAlnReader(){}void CAlnReader::Read(){ if (m_ReadDone) { return; } // make a SSequenceInfo corresponding to our CSequenceInfo argument SSequenceInfo info; info.alphabet = const_cast<char *>(m_Alphabet.c_str()); info.beginning_gap = const_cast<char *>(m_BeginningGap.c_str()); info.end_gap = const_cast<char *>(m_EndGap.c_str());; info.middle_gap = const_cast<char *>(m_MiddleGap.c_str()); info.missing = const_cast<char *>(m_Missing.c_str()); info.match = const_cast<char *>(m_Match.c_str()); // read the alignment stream TAlignmentFilePtr afp; afp = ReadAlignmentFile(s_ReadLine, (void *) &m_IS, s_ReportError, 0, &info); if (!afp) { NCBI_THROW2(CObjReaderParseException, eFormat, "Error reading alignment", 0); } // build the CAlignment m_Seqs.resize(afp->num_sequences); m_Ids.resize(afp->num_sequences); for (int i = 0; i < afp->num_sequences; ++i) { m_Seqs[i] = NStr::ToUpper(afp->sequences[i]); m_Ids[i] = afp->ids[i]; } m_Organisms.resize(afp->num_organisms); for (int i = 0; i < afp->num_organisms; ++i) { if (afp->organisms[i]) { m_Organisms[i] = afp->organisms[i]; } else { m_Organisms[i].erase(); } } m_Deflines.resize(afp->num_deflines); for (int i = 0; i < afp->num_deflines; ++i) { if (afp->deflines[i]) { m_Deflines[i] = afp->deflines[i]; } else { m_Deflines[i].erase(); } } AlignmentFileFree(afp); {{ m_Dim = m_Ids.size(); }} m_ReadDone = true; return;}void CAlnReader::SetClustal(EAlphabet alpha){ SetAlphabet(alpha); SetAllGap("-");}void CAlnReader::SetPaup(EAlphabet alpha){ SetAlphabet(alpha); SetAllGap("-");}void CAlnReader::SetPhylip(EAlphabet alpha){ SetAlphabet(alpha); SetAllGap("-");}CRef<CSeq_align> CAlnReader::GetSeqAlign(){ if (m_Aln) { return m_Aln; } else if ( !m_ReadDone ) { NCBI_THROW2(CObjReaderParseException, eFormat, "CAlnReader::GetSeqAlign(): " "Seq_align is not available until after Read()", 0); } typedef CDense_seg::TNumseg TNumseg; typedef CDense_seg::TDim TNumrow; m_Aln = new CSeq_align(); m_Aln->SetType(CSeq_align::eType_not_set); m_Aln->SetDim(m_Dim); CDense_seg& ds = m_Aln->SetSegs().SetDenseg(); ds.SetDim(m_Dim); CDense_seg::TIds& ids = ds.SetIds(); CDense_seg::TStarts& starts = ds.SetStarts(); //CDense_seg::TStrands& strands = ds.SetStrands(); CDense_seg::TLens& lens = ds.SetLens(); ids.resize(m_Dim); // get the length of the aln row, asuming all rows are the same TSeqPos aln_stop = m_Seqs[0].size(); for (TNumrow row_i = 0; row_i < m_Dim; row_i++) { CRef<CSeq_id> seq_id(new CSeq_id(m_Ids[row_i])); if (seq_id->Which() == CSeq_id::e_not_set) { seq_id->SetLocal().SetStr(m_Ids[row_i]); } ids[row_i] = seq_id; } m_SeqVec.resize(m_Dim); for (TNumrow row_i = 0; row_i < m_Dim; row_i++) { m_SeqVec[row_i].resize(aln_stop, 0); // size unknown, resize to max } m_SeqLen.resize(m_Dim, 0); vector<bool> is_gap; is_gap.resize(m_Dim, true); vector<bool> prev_is_gap; prev_is_gap.resize(m_Dim, true); vector<TSignedSeqPos> next_start; next_start.resize(m_Dim, 0); int starts_i = 0; TSeqPos prev_aln_pos = 0, prev_len = 0; bool new_seg = false; TNumseg numseg = 0; for (TSeqPos aln_pos = 0; aln_pos < aln_stop; aln_pos++) { for (TNumrow row_i = 0; row_i < m_Dim; row_i++) { const char& residue = m_Seqs[row_i][aln_pos]; if (residue != m_MiddleGap[0] && residue != m_EndGap[0] && residue != m_Missing[0]) { if (is_gap[row_i]) { is_gap[row_i] = false; new_seg = true; } // add to the sequence vector m_SeqVec[row_i][m_SeqLen[row_i]++] = residue; } else { if ( !is_gap[row_i] ) { is_gap[row_i] = true; new_seg = true; } } } if (new_seg) { numseg++; if (aln_pos) { // if not the first seg lens.push_back(prev_len = aln_pos - prev_aln_pos); for (TNumrow row_i = 0; row_i < m_Dim; row_i++) { if ( !prev_is_gap[row_i] ) { next_start[row_i] += prev_len; } } } starts.resize(starts_i + m_Dim); for (TNumrow row_i = 0; row_i < m_Dim; row_i++) { if (is_gap[row_i]) { starts[starts_i++] = -1; } else { starts[starts_i++] = next_start[row_i];; } prev_is_gap[row_i] = is_gap[row_i]; } prev_aln_pos = aln_pos; new_seg = false; } } for (TNumrow row_i = 0; row_i < m_Dim; row_i++) { m_SeqVec[row_i].resize(m_SeqLen[row_i]); // resize down to actual size } lens.push_back(aln_stop - prev_aln_pos); //strands.resize(numseg * m_Dim, eNa_strand_plus); _ASSERT(lens.size() == numseg); ds.SetNumseg(numseg);#if _DEBUG m_Aln->Validate(true);#endif return m_Aln;}CRef<CSeq_entry> CAlnReader::GetSeqEntry(){ if (m_Entry) { return m_Entry; } else if ( !m_ReadDone ) { NCBI_THROW2(CObjReaderParseException, eFormat, "CAlnReader::GetSeqEntry(): " "Seq_entry is not available until after Read()", 0); } m_Entry = new CSeq_entry(); CRef<CSeq_annot> seq_annot (new CSeq_annot); seq_annot->SetData().SetAlign().push_back(GetSeqAlign()); m_Entry->SetSet().SetClass(CBioseq_set::eClass_pop_set); m_Entry->SetSet().SetAnnot().push_back(seq_annot); CBioseq_set::TSeq_set& seq_set = m_Entry->SetSet().SetSeq_set(); typedef CDense_seg::TDim TNumrow; for (TNumrow row_i = 0; row_i < m_Dim; row_i++) { const string& seq_str = m_SeqVec[row_i]; const size_t& seq_str_len = seq_str.size(); CRef<CSeq_entry> seq_entry (new CSeq_entry); // seq-id CRef<CSeq_id> seq_id(new CSeq_id(m_Ids[row_i])); seq_entry->SetSeq().SetId().push_back(seq_id); if (seq_id->Which() == CSeq_id::e_not_set) { seq_id->SetLocal().SetStr(m_Ids[row_i]); } // mol CSeq_inst::EMol mol = CSeq_inst::eMol_not_set; CSeq_id::EAccessionInfo ai = seq_id->IdentifyAccession(); if (ai & CSeq_id::fAcc_nuc) { mol = CSeq_inst::eMol_na; } else if (ai & CSeq_id::fAcc_prot) { mol = CSeq_inst::eMol_aa; } else { switch (CFormatGuess::SequenceType(seq_str.data(), seq_str_len)) { case CFormatGuess::eNucleotide: mol = CSeq_inst::eMol_na; break; case CFormatGuess::eProtein: mol = CSeq_inst::eMol_aa; break; default: break; } } // seq-inst CRef<CSeq_inst> seq_inst (new CSeq_inst); seq_entry->SetSeq().SetInst(*seq_inst); seq_set.push_back(seq_entry); // repr seq_inst->SetRepr(CSeq_inst::eRepr_raw); // mol seq_inst->SetMol(mol); // len _ASSERT(seq_str_len == m_SeqLen[row_i]); seq_inst->SetLength(seq_str_len); // data CSeq_data& data = seq_inst->SetSeq_data(); if (mol == CSeq_inst::eMol_aa) { data.SetIupacaa().Set(seq_str); } else { data.SetIupacna().Set(seq_str); CSeqportUtil::Pack(&data); } } return m_Entry;}END_NCBI_SCOPE/* * =========================================================================== * $Log: aln_reader.cpp,v $ * Revision 1000.1 2004/06/01 19:46:14 gouriano * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.10 * * Revision 1.10 2004/05/21 21:42:55 gorelenk * Added PCH ncbi_pch.hpp * * Revision 1.9 2004/03/23 19:35:21 jcherry * Set class to pop-set rather than nuc-prot * * Revision 1.8 2004/03/02 20:18:50 jcherry * Don't throw in callback called by C code * * Revision 1.7 2004/03/01 15:26:33 dicuccio * Code clean-up. Added enum for standard alphabets. Added new APIs to set * standard parameters for other alignment types (implemented with unclear details * currently). Added better exception handling. * * Revision 1.6 2004/02/24 17:52:12 jcherry * iupacaa for proteins * * Revision 1.5 2004/02/20 16:40:57 ucko * Fix path to aln_reader.hpp again * * Revision 1.4 2004/02/20 16:21:09 todorov * Better seq id; Try to id the mol type; Packed seq data; * Fixed a couple of bugs in GetSeqAlign * * Revision 1.3 2004/02/20 01:21:35 ucko * Again, erase() strings rather than clear()ing them for compatibility * with G++ 2.95. (The fix seems to have gotten lost in the recent * move.) * * Revision 1.2 2004/02/19 18:38:13 ucko * Update path to aln_reader.hpp. * * Revision 1.1 2004/02/19 16:54:59 todorov * File moved from util/creaders and renamed to aln_reader * * Revision 1.4 2004/02/18 22:29:31 todorov * Converted to single class. Added methods for creating Seq-align and Seq-entry. A working version, but still need to polish: seq-ids, na/aa recognition, etc. * * Revision 1.3 2004/02/11 17:58:12 gorelenk * Added missed modificator ALIGNMENT_CALLBACK to definitions of s_ReadLine * and ALIGNMENT_CALLBACK. * * Revision 1.2 2004/02/10 02:58:09 ucko * erase() strings rather than clear()ing them for compatibility with G++ 2.95. * * Revision 1.1 2004/02/09 16:02:34 jcherry * Initial versionnnn * * =========================================================================== */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?