reader_snp.cpp

来自「ncbi源码」· C++ 代码 · 共 476 行

CPP
476
字号
/*
 * ===========================================================================
 * PRODUCTION $Log: reader_snp.cpp,v $
 * PRODUCTION Revision 1000.1  2004/06/01 19:41:43  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.12
 * PRODUCTION
 * ===========================================================================
 */

/*  $Id: reader_snp.cpp,v 1000.1 2004/06/01 19:41:43 gouriano Exp $
 * ===========================================================================
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 *  Author:  Anton Butanaev, Eugene Vasilchenko
 *
 *  File Description: Data reader from ID1
 *
 */

#include <ncbi_pch.hpp>
#include <objtools/data_loaders/genbank/reader_snp.hpp>
#include <objtools/data_loaders/genbank/reader.hpp>

#include <objects/general/Object_id.hpp>
#include <objects/general/User_object.hpp>
#include <objects/general/User_field.hpp>
#include <objects/general/Dbtag.hpp>

#include <objects/seqloc/Seq_id.hpp>
#include <objects/seqloc/Seq_point.hpp>
#include <objects/seqloc/Seq_loc.hpp>

#include <objects/seqfeat/Seq_feat.hpp>
#include <objects/seqfeat/SeqFeatData.hpp>
#include <objects/seqfeat/Imp_feat.hpp>
#include <objects/seqfeat/Gb_qual.hpp>

#include <objects/seqset/Seq_entry.hpp>
#include <objects/seqset/Bioseq_set.hpp>

#include <objects/seq/Seq_annot.hpp>

#include <objmgr/objmgr_exception.hpp>

#include <serial/objectinfo.hpp>
#include <serial/objectiter.hpp>
#include <serial/objectio.hpp>
#include <serial/serial.hpp>
#include <serial/objistr.hpp>
#include <serial/objistrasnb.hpp>
#include <serial/objostrasnb.hpp>

#include <util/reader_writer.hpp>

#include <algorithm>
#include <climits>
#include <numeric>

// for debugging
#include <serial/objostrasn.hpp>

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)


/// Read hook for the "ftable" variant of CSeq_annot::TData.
///
/// Instead of letting the serial library read the feature table directly,
/// the table is read element by element through CSNP_Seq_feat_hook.
class CSNP_Ftable_hook : public CReadChoiceVariantHook
{
public:
    CSNP_Ftable_hook(CSeq_annot_SNP_Info& annot_snp_info)
        : m_Seq_annot_SNP_Info(annot_snp_info)
        {
        }

    void ReadChoiceVariant(CObjectIStream& in,
                           const CObjectInfoCV& variant);

private:
    CSeq_annot_SNP_Info&   m_Seq_annot_SNP_Info;
};


/// Read hook for individual elements of a feature table.
///
/// Every feature is parsed with SSNP_Info::ParseSeq_feat(); features
/// classified as eSNP_Simple go into the SNP table of the
/// CSeq_annot_SNP_Info, all others are appended to the feature table.
/// Per-type counters are kept for the optional statistics printed by the
/// destructor.
class CSNP_Seq_feat_hook : public CReadContainerElementHook
{
public:
    CSNP_Seq_feat_hook(CSeq_annot_SNP_Info& annot_snp_info,
                       CSeq_annot::TData::TFtable& ftable);
    ~CSNP_Seq_feat_hook(void);

    void ReadContainerElement(CObjectIStream& in,
                              const CObjectInfo& ftable);

private:
    CSeq_annot_SNP_Info&        m_Seq_annot_SNP_Info;
    CSeq_annot::TData::TFtable& m_Ftable;
    // feature object the next element is read into
    CRef<CSeq_feat>             m_Feat;
    // number of features seen by this hook, indexed by SSNP_Info::ESNP_Type
    size_t                      m_Count[SSNP_Info::eSNP_Type_last];
};


/// Read the feature table of a Seq-annot through a CSNP_Seq_feat_hook.
///
/// @param in       stream the table is read from
/// @param variant  the "ftable" variant being read
void CSNP_Ftable_hook::ReadChoiceVariant(CObjectIStream& in,
                                         const CObjectInfoCV& variant)
{
    CObjectInfo data_info = variant.GetChoiceObject();
    CObjectInfo ftable_info = *variant;
    CSeq_annot::TData& data = *CType<CSeq_annot::TData>::Get(data_info);
    _ASSERT(ftable_info.GetObjectPtr() ==
            static_cast<TConstObjectPtr>(&data.GetFtable()));
    // the hook goes out of scope (and may print statistics) right after
    // the container has been read
    CSNP_Seq_feat_hook hook(m_Seq_annot_SNP_Info, data.SetFtable());
    ftable_info.ReadContainer(in, hook);
}


/// Get an integer value from an environment variable.
///
/// @param env      name of the environment variable
/// @param def_val  value returned when the variable is not set or
///                 NStr::StringToInt() throws on its value
static int s_GetEnvInt(const char* env, int def_val)
{
    const char* val = ::getenv(env);
    if ( val ) {
        try {
            return NStr::StringToInt(val);
        }
        catch (...) {
            // deliberately ignored: fall back to the default value
        }
    }
    return def_val;
}


// statistics are printed when GENBANK_SNP_TABLE_STAT is set to a positive int
static bool s_SNP_stat = s_GetEnvInt("GENBANK_SNP_TABLE_STAT", 0) > 0;


CSNP_Seq_feat_hook::CSNP_Seq_feat_hook(CSeq_annot_SNP_Info& annot_snp_info,
                                       CSeq_annot::TData::TFtable& ftable)
    : m_Seq_annot_SNP_Info(annot_snp_info),
      m_Ftable(ftable)
{
    fill(m_Count, m_Count+SSNP_Info::eSNP_Type_last, 0);
}


// counters accumulated over all destroyed hooks (statistics mode only)
static size_t s_TotalCount[SSNP_Info::eSNP_Type_last] = { 0 };


/// Rounded percentage of count in total; 0 when total is 0
/// (avoids converting the NaN produced by 0/0 to int).
static int s_Percent(size_t count, size_t total)
{
    return total? int(count*100.0/total+.5): 0;
}


/// When statistics are enabled, print the counters of this hook followed
/// by the cumulative counters of all hooks destroyed so far.
CSNP_Seq_feat_hook::~CSNP_Seq_feat_hook(void)
{
    if ( s_SNP_stat ) {
        // size_t(0) makes accumulate() sum in size_t rather than in int
        size_t total =
            accumulate(m_Count, m_Count+SSNP_Info::eSNP_Type_last, size_t(0));
        NcbiCout << "CSeq_annot_SNP_Info statistic:\n";
        for ( size_t i = 0; i < SSNP_Info::eSNP_Type_last; ++i ) {
            NcbiCout <<
                setw(40) << SSNP_Info::s_SNP_Type_Label[i] << ": " <<
                setw(6) << m_Count[i] << "  " <<
                setw(3) << s_Percent(m_Count[i], total) << "%\n";
            s_TotalCount[i] += m_Count[i];
        }
        NcbiCout << NcbiEndl;

        total =
            accumulate(s_TotalCount, s_TotalCount+SSNP_Info::eSNP_Type_last,
                       size_t(0));
        NcbiCout << "cumulative CSeq_annot_SNP_Info statistic:\n";
        for ( size_t i = 0; i < SSNP_Info::eSNP_Type_last; ++i ) {
            NcbiCout <<
                setw(40) << SSNP_Info::s_SNP_Type_Label[i] << ": " <<
                setw(6) << s_TotalCount[i] << "  " <<
                setw(3) << s_Percent(s_TotalCount[i], total) << "%\n";
        }
        NcbiCout << NcbiEndl;
    }
}


/// Read one feature and file it either as a table SNP or as a regular
/// feature.
///
/// @param in  stream the feature is read from
void CSNP_Seq_feat_hook::ReadContainerElement(CObjectIStream& in,
                                              const CObjectInfo& /*ftable*/)
{
    if ( !m_Feat ) {
        m_Feat.Reset(new CSeq_feat);
    }
    in.ReadObject(&*m_Feat, m_Feat->GetTypeInfo());
    SSNP_Info snp_info;
    SSNP_Info::ESNP_Type snp_type =
        snp_info.ParseSeq_feat(*m_Feat, m_Seq_annot_SNP_Info);
    ++m_Count[snp_type];
    if ( snp_type == SSNP_Info::eSNP_Simple ) {
        // m_Feat is kept, so the same object receives the next element
        m_Seq_annot_SNP_Info.x_AddSNP(snp_info);
    }
    else {
#ifdef _DEBUG
        // GENBANK_SNP_TABLE_DUMP limits how many complex SNPs are dumped
        static int dump_feature = s_GetEnvInt("GENBANK_SNP_TABLE_DUMP", 0);
        if ( dump_feature ) {
            --dump_feature;
            NcbiCerr <<
                "CSNP_Seq_feat_hook::ReadContainerElement: complex SNP: " <<
                SSNP_Info::s_SNP_Type_Label[snp_type] << ":\n" <<
                MSerial_AsnText << *m_Feat;
        }
#endif
        // the feature table now references the object: start a new one
        m_Ftable.push_back(m_Feat);
        m_Feat.Reset();
    }
}


/// Write the in-memory bytes of an unsigned (sizeof(unsigned) bytes).
static void write_unsigned(CNcbiOstream& stream, unsigned n)
{
    stream.write(reinterpret_cast<const char*>(&n), sizeof(n));
}


/// Read an unsigned written by write_unsigned().
///
/// @return the value read, or 0 when the read fails (the stream is left in
///         a failed state for the caller to check)
static unsigned read_unsigned(CNcbiIstream& stream)
{
    unsigned n = 0; // defined result even if the read fails
    stream.read(reinterpret_cast<char*>(&n), sizeof(n));
    return n;
}


/// Write a size as a sequence of 7-bit groups, least significant group
/// first; every byte except the last has its high bit set.
static void write_size(CNcbiOstream& stream, unsigned size)
{
    // use ASN.1 binary like format
    while ( size >= (1<<7) ) {
        stream.put(char(size | (1<<7)));
        size >>= 7;
    }
    stream.put(char(size));
}


/// Read a size written by write_size().
///
/// @return the decoded size, or 0 with the stream in a failed state when
///         the input ends prematurely or has more continuation bytes than
///         an unsigned can hold
static unsigned read_size(CNcbiIstream& stream)
{
    const int kValueBits = int(sizeof(unsigned) * CHAR_BIT);
    unsigned size = 0;
    for ( int shift = 0; ; shift += 7 ) {
        // keep the result as int: converting EOF to char would look like
        // a byte with the continuation bit set and the loop would never end
        int c = stream.get();
        if ( !stream ) {
            return 0;
        }
        if ( shift >= kValueBits ) {
            // malformed input; shifting by this amount would be undefined
            stream.setstate(CNcbiIstream::failbit);
            return 0;
        }
        size |= unsigned(c & ((1<<7)-1)) << shift;
        if ( !(c & (1<<7)) ) {
            return size;
        }
    }
}


/// Store all strings: their number, then each string as length + bytes.
void CIndexedStrings::StoreTo(CNcbiOstream& stream) const
{
    write_size(stream, m_Strings.size());
    ITERATE ( TStrings, it, m_Strings ) {
        unsigned size = it->size();
        write_size(stream, size);
        stream.write(it->data(), size);
    }
}


/// Load strings stored by StoreTo(), replacing the current content.
///
/// @param stream      input stream
/// @param max_index   the stored string count may be at most max_index+1
/// @param max_length  maximal accepted length of a single string
/// @throw CLoaderException (eLoaderFailed) on a read error or when a limit
///        is exceeded; the object is cleared in that case
void CIndexedStrings::LoadFrom(CNcbiIstream& stream,
                               size_t max_index,
                               size_t max_length)
{
    Clear();
    unsigned count = read_size(stream);
    if ( !stream || (count > unsigned(max_index+1)) ) {
        NCBI_THROW(CLoaderException, eLoaderFailed,
                   "Bad format of SNP table");
    }
    m_Strings.resize(count);
    // one buffer big enough for any accepted string
    AutoPtr<char, ArrayDeleter<char> > buf(new char[max_length]);
    NON_CONST_ITERATE ( TStrings, it, m_Strings ) {
        unsigned size = read_size(stream);
        if ( !stream || (size > max_length) ) {
            Clear();
            NCBI_THROW(CLoaderException, eLoaderFailed,
                       "Bad format of SNP table");
        }
        stream.read(buf.get(), size);
        if ( !stream ) {
            Clear();
            NCBI_THROW(CLoaderException, eLoaderFailed,
                       "Bad format of SNP table");
        }
        it->assign(buf.get(), buf.get()+size);
    }
}


/// Read a Seq-annot from an object stream into snp_info.
///
/// When CReader::TrySNPTable() is true, a CSNP_Ftable_hook is installed so
/// that simple SNPs end up in the SNP table. The table is sorted afterwards.
void CSeq_annot_SNP_Info_Reader::Parse(CObjectIStream& in,
                                       CSeq_annot_SNP_Info& snp_info)
{
    snp_info.m_Seq_annot.Reset(new CSeq_annot); // Seq-annot object

    CReader::SetSNPReadHooks(in);

    if ( CReader::TrySNPTable() ) { // set SNP hook
        CObjectTypeInfo type = CType<CSeq_annot::TData>();
        CObjectTypeInfoVI ftable = type.FindVariant("ftable");
        ftable.SetLocalReadHook(in, new CSNP_Ftable_hook(snp_info));
    }

    in >> *snp_info.m_Seq_annot;

    // we don't need index maps anymore
    snp_info.m_Comments.ClearIndices();
    snp_info.m_Alleles.ClearIndices();

    sort(snp_info.m_SNP_Set.begin(), snp_info.m_SNP_Set.end());
}


// first word of a stored SNP table; Read() rejects any other value
static const unsigned MAGIC = 0x12340002;


/// Store snp_info in the binary format understood by Read():
/// MAGIC, gi, comment strings, allele strings, the SSNP_Info array as raw
/// bytes, and finally the Seq-annot in ASN.1 binary.
void CSeq_annot_SNP_Info_Reader::Write(CNcbiOstream& stream,
                                       const CSeq_annot_SNP_Info& snp_info)
{
    // header
    write_unsigned(stream, MAGIC);
    write_unsigned(stream, snp_info.GetGi());

    // strings
    snp_info.m_Comments.StoreTo(stream);
    snp_info.m_Alleles.StoreTo(stream);

    // simple SNPs
    unsigned count = snp_info.m_SNP_Set.size();
    write_size(stream, count);
    if ( count ) { // element 0 does not exist in an empty set
        stream.write(reinterpret_cast<const char*>(&snp_info.m_SNP_Set[0]),
                     count*sizeof(SSNP_Info));
    }

    // complex SNPs
    CObjectOStreamAsnBinary obj_stream(stream);
    obj_stream << *snp_info.m_Seq_annot;
}


/// Load snp_info from data stored by Write().
///
/// @throw CLoaderException (eLoaderFailed) when the magic number does not
///        match, the stream fails, or a stored comment/allele index is out
///        of range
void CSeq_annot_SNP_Info_Reader::Read(CNcbiIstream& stream,
                                      CSeq_annot_SNP_Info& snp_info)
{
    snp_info.Reset();

    // header
    unsigned magic = read_unsigned(stream);
    if ( !stream || magic != MAGIC ) {
        NCBI_THROW(CLoaderException, eLoaderFailed,
                   "Incompatible version of SNP table");
    }
    snp_info.x_SetGi(read_unsigned(stream));

    // strings
    snp_info.m_Comments.LoadFrom(stream,
                                 SSNP_Info::kMax_CommentIndex,
                                 SSNP_Info::kMax_CommentLength);
    snp_info.m_Alleles.LoadFrom(stream,
                                SSNP_Info::kMax_AlleleIndex,
                                SSNP_Info::kMax_AlleleLength);

    // simple SNPs
    unsigned count = read_size(stream);
    if ( stream ) {
        snp_info.m_SNP_Set.resize(count);
        if ( count ) { // element 0 does not exist in an empty set
            stream.read(reinterpret_cast<char*>(&snp_info.m_SNP_Set[0]),
                        count*sizeof(SSNP_Info));
        }
    }
    if ( !stream ) {
        // truncated table: do not validate incompletely read records
        snp_info.Reset();
        NCBI_THROW(CLoaderException, eLoaderFailed,
                   "Bad format of SNP table");
    }

    // every stored index must refer to a loaded string
    size_t comments_size = snp_info.m_Comments.GetSize();
    size_t alleles_size = snp_info.m_Alleles.GetSize();
    ITERATE ( CSeq_annot_SNP_Info::TSNP_Set, it, snp_info.m_SNP_Set ) {
        size_t index = it->m_CommentIndex;
        if ( index != SSNP_Info::kNo_CommentIndex &&
             index >= comments_size ) {
            snp_info.Reset();
            NCBI_THROW(CLoaderException, eLoaderFailed,
                       "Bad format of SNP table");
        }
        for ( size_t i = 0; i < SSNP_Info::kMax_AllelesCount; ++i ) {
            index = it->m_AllelesIndices[i];
            if ( index != SSNP_Info::kNo_AlleleIndex &&
                 index >= alleles_size ) {
                snp_info.Reset();
                NCBI_THROW(CLoaderException, eLoaderFailed,
                           "Bad format of SNP table");
            }
        }
    }

    // complex SNPs
    CObjectIStreamAsnBinary obj_stream(stream);
    if ( !snp_info.m_Seq_annot ) {
        snp_info.m_Seq_annot.Reset(new CSeq_annot);
    }
    obj_stream >> *snp_info.m_Seq_annot;
    if ( !stream ) {
        snp_info.m_Seq_annot.Reset();
        NCBI_THROW(CLoaderException, eLoaderFailed,
                   "Bad format of SNP table");
    }
}


END_SCOPE(objects)
END_NCBI_SCOPE

/*
 * $Log: reader_snp.cpp,v $
 * Revision 1000.1  2004/06/01 19:41:43  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.12
 *
 * Revision 1.12  2004/05/21 21:42:52  gorelenk
 * Added PCH ncbi_pch.hpp
 *
 * Revision 1.11  2004/03/16 16:04:20  vasilche
 * Removed conversion warning
 *
 * Revision 1.10  2004/02/06 16:13:19  vasilche
 * Added parsing "replace" as a synonym of "allele" in SNP qualifiers.
 * More compact format of SNP table in cache. SNP table version increased.
 * Fixed null pointer exception when SNP features are loaded from cache.
 *
 * Revision 1.9  2004/01/13 16:55:55  vasilche
 * CReader, CSeqref and some more classes moved from xobjmgr to separate lib.
 * Headers moved from include/objmgr to include/objtools/data_loaders/genbank.
 *
 * Revision 1.8  2003/10/23 13:47:56  vasilche
 * Fixed Reset() method: m_Gi should be -1.
 *
 * Revision 1.7  2003/10/21 16:29:13  vasilche
 * Added check for errors in SNP table loaded from cache.
 *
 * Revision 1.6  2003/10/21 15:21:21  vasilche
 * Avoid use of non-constant array sizes of stack arrays.
 *
 * Revision 1.5  2003/10/21 14:27:35  vasilche
 * Added caching of gi -> sat,satkey,version resolution.
 * SNP blobs are stored in cache in preprocessed format (platform dependent).
 * Limit number of connections to GenBank servers.
 * Added collection of ID1 loader statistics.
 *
 * Revision 1.4  2003/09/30 16:22:03  vasilche
 * Updated internal object manager classes to be able to load ID2 data.
 * SNP blobs are loaded as ID2 split blobs - readers convert them automatically.
 * Scope caches results of requests for data to data loaders.
 * Optimized CSeq_id_Handle for gis.
 * Optimized bioseq lookup in scope.
 * Reduced object allocations in annotation iterators.
 * CScope is allowed to be destroyed before other objects using this scope are
 * deleted (feature iterators, bioseq handles etc).
 * Optimized lookup for matching Seq-ids in CSeq_id_Mapper.
 * Added 'adaptive' option to objmgr_demo application.
 *
 * Revision 1.3  2003/08/19 18:35:21  vasilche
 * CPackString classes were moved to SERIAL library.
 *
 * Revision 1.2  2003/08/15 19:19:16  vasilche
 * Fixed memory leak in string packing hooks.
 * Fixed processing of 'partial' flag of features.
 * Allow table packing of non-point SNP.
 * Allow table packing of SNP with long alleles.
 *
 * Revision 1.1  2003/08/14 20:05:19  vasilche
 * Simple SNP features are stored as table internally.
 * They are recreated when needed using CFeat_CI.
 *
 */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?