📄 seqport_util.hpp
字号:
/*
 * ===========================================================================
 * PRODUCTION $Log: seqport_util.hpp,v $
 * PRODUCTION Revision 1000.2 2003/11/21 18:20:24 gouriano
 * PRODUCTION PRODUCTION: UPGRADED [ORIGINAL] Dev-tree R1.10
 * PRODUCTION
 * ===========================================================================
 */

// Include guard: each directive must sit on its own line to be recognized
// by the preprocessor.
#ifndef OBJECTS_SEQ___SEQPORT_UTIL__HPP
#define OBJECTS_SEQ___SEQPORT_UTIL__HPP

/* $Id: seqport_util.hpp,v 1000.2 2003/11/21 18:20:24 gouriano Exp $
 * ===========================================================================
 *
 * PUBLIC DOMAIN NOTICE
 * National Center for Biotechnology Information
 *
 * This software/database is a "United States Government Work" under the
 * terms of the United States Copyright Act. It was written as part of
 * the author's official duties as a United States Government employee and
 * thus cannot be copyrighted. This software/database is freely available
 * to the public for use. The National Library of Medicine and the U.S.
 * Government have not placed any restriction on its use or reproduction.
 *
 * Although all reasonable efforts have been taken to ensure the accuracy
 * and reliability of the software and data, the NLM and the U.S.
 * Government do not and cannot warrant the performance or results that
 * may be obtained by using this software or data. The NLM and the U.S.
 * Government disclaim all warranties, express or implied, including
 * warranties of performance, merchantability or fitness for any particular
 * purpose.
 *
 * Please cite the author in any work or product based on this material.
* * =========================================================================== * * Author: Clifford Clausen * (also reviewed/fixed/groomed by Denis Vakatov and Aaron Ucko) * * File Description: */ #include <corelib/ncbi_limits.hpp>#include <objects/seq/Seq_data.hpp>#include <objects/seqcode/Seq_code_type.hpp>#include <util/random_gen.hpp>#include <memory>#include <vector>BEGIN_NCBI_SCOPEBEGIN_objects_SCOPE// CSeqportUtil is a wrapper for a hidden object of class// CSeqportUtil_implementation.class CSeqportUtil_implementation;class NCBI_SEQ_EXPORT CSeqportUtil{public: // TypeDefs typedef unsigned int TIndex; typedef pair<TIndex, TIndex> TPair; // Classes thrown as errors struct NCBI_SEQ_EXPORT CBadIndex : public runtime_error { CBadIndex(TIndex idx, string method) : runtime_error("CSeqportUtil::" + method + " -- bad index specified: " + NStr::UIntToString(idx)) {} }; struct NCBI_SEQ_EXPORT CBadSymbol : public runtime_error { CBadSymbol(string code, string method) : runtime_error("CSeqportUtil::" + method + " -- bad symbol specified: " + code) {} }; struct NCBI_SEQ_EXPORT CBadType : public runtime_error { CBadType(string method) : runtime_error("CSeqportUtil::" + method + " -- specified code or code combination not supported") {} }; // Alphabet conversion function. Function returns the // number of converted codes. static TSeqPos Convert(const CSeq_data& in_seq, CSeq_data* out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx = 0, TSeqPos uLength = 0, bool bAmbig = false, CRandom::TValue seed = 17734276); // Function to provide maximum in-place packing of na // sequences without loss of information. Iupacna // can always be packed to ncbi4na without loss. Iupacna // can sometimes be packed to ncbi2na. Ncbi4na can // sometimes be packed to ncbi2na. Returns number of // residues packed. 
If in_seq cannot be packed, the // original in_seq is returned unchanged and the return value // from Pack is 0 static TSeqPos Pack(CSeq_data* in_seq, TSeqPos uLength = ncbi::numeric_limits<TSeqPos>::max()); // Performs fast validation of CSeq_data. If all data in the // sequence represent valid elements of a biological sequence, then // FastValidate returns true. Otherwise it returns false static bool FastValidate(const CSeq_data& in_seq, TSeqPos uBeginIdx = 0, TSeqPos uLength = 0); // Performs validation of CSeq_data. Returns a list of indices // corresponding to data that does not represent a valid element // of a biological sequence. static void Validate(const CSeq_data& in_seq, vector<TSeqPos>* badIdx, TSeqPos uBeginIdx = 0, TSeqPos uLength = 0); // Get ambiguous bases. out_indices returns // the indices relative to in_seq of ambiguous bases. // out_seq returns the ambiguous bases. Note, there are // only ambiguous bases for iupacna->ncib2na and // ncib4na->ncbi2na coversions. static TSeqPos GetAmbigs(const CSeq_data& in_seq, CSeq_data* out_seq, vector<TSeqPos>* out_indices, CSeq_data::E_Choice to_code = CSeq_data::e_Ncbi2na, TSeqPos uBeginIdx = 0, TSeqPos uLength = 0); // Get a copy of CSeq_data. No conversion is done. uBeginIdx of the // biological sequence in in_seq will be in position // 0 of out_seq. Usually, uLength bases will be copied // from in_seq to out_seq. If uLength goes beyond the end of // in_seq, it will be shortened to go to the end of in_seq. // For packed sequence formats (ncbi2na and ncbi4na), // only uLength bases are valid copies. For example, // in an ncbi4na encoded sequence, if uLength is odd, the last // sequence returned will be uLength+1 because 2 bases are encoded // per byte in ncbi4na. However, in this case, uLength will be returned // unchanged (it will remain odd unless it goes beyond the end // of in_seq). If uLength=0, then a copy from uBeginIdx to the end // of in_seq is returned. 
static TSeqPos GetCopy(const CSeq_data& in_seq,
                       CSeq_data* out_seq,
                       TSeqPos uBeginIdx = 0,
                       TSeqPos uLength = 0);

    // Method to keep only a contiguous piece of a sequence beginning
    // at uBeginIdx and uLength residues long. Does bit shifting as
    // needed to put uBeginIdx of original sequence at position zero on output.
    // Similar to GetCopy(), but done in place. Returns length of
    // kept sequence.
    static TSeqPos Keep(CSeq_data* in_seq,
                        TSeqPos uBeginIdx = 0,
                        TSeqPos uLength = 0);

    // Append in_seq2 to the end of in_seq1. Both in seqs must be
    // in the same alphabet or this method will throw a runtime_error.
    // The result of the append will be put into out_seq.
    // For packed sequences ncbi2na and ncbi4na, Append will shift and
    // append so as to remove any jaggedness at the append point.
    static TSeqPos Append(CSeq_data* out_seq,
                          const CSeq_data& in_seq1,
                          TSeqPos uBeginIdx1,
                          TSeqPos uLength1,
                          const CSeq_data& in_seq2,
                          TSeqPos uBeginIdx2,
                          TSeqPos uLength2);

    // Create a biological complement of an na sequence.
    // Attempts to complement an aa sequence will throw
    // a runtime_error. Returns length of complemented sequence.

    // Complement the input sequence in place
    static TSeqPos Complement(CSeq_data* in_seq,
                              TSeqPos uBeginIdx = 0,
                              TSeqPos uLength = 0);

    // Complement the input sequence and put the result in
    // the output sequence
    static TSeqPos Complement(const CSeq_data& in_seq,
                              CSeq_data* out_seq,
                              TSeqPos uBeginIdx = 0,
                              TSeqPos uLength = 0);

    // Create a biological sequence that is the reverse
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -