⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 format_guess.cpp

📁 ncbi源码
💻 CPP
字号:
/*
 * ===========================================================================
 * PRODUCTION $Log: format_guess.cpp,v $
 * PRODUCTION Revision 1000.5  2004/06/01 19:40:07  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.17
 * PRODUCTION
 * ===========================================================================
 */

/*  $Id: format_guess.cpp,v 1000.5 2004/06/01 19:40:07 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author: Anatoliy Kuznetsov
 *
 * File Description:  Implemented methods to identify file formats.
 *
 */

#include <ncbi_pch.hpp>
#include <util/format_guess.hpp>
#include <corelib/ncbifile.hpp>

BEGIN_NCBI_SCOPE


// Check if letter belongs to the nucleotide alphabet (A, T, G, C or N).
// Callers pass an already upper-cased character.
// The explicit NUL test is required because strchr() also matches the
// terminating '\0' of the alphabet string.
static bool isDNA_Alphabet(char ch)
{
    return ch != '\0'  &&  ::strchr("ATGCN", ch) != 0;
}


// Check if letter belongs to amino acid alphabet
// (same NUL caveat as for isDNA_Alphabet).
static bool isProtein_Alphabet(char ch)
{
    return ch != '\0'  &&  ::strchr("ACDEFGHIKLMNPQRSTVWYBZ", ch) != 0;
}


// Check if character belongs to the CR/LF group of symbols
static inline bool isLineEnd(char ch)
{
    return ch == 0x0D  ||  ch == 0x0A;
}


// Guess whether a character sequence is a nucleotide or a protein sequence.
//
//   str    - sequence characters (case-insensitive)
//   length - number of characters to inspect; 0 means "use strlen(str)"
//
// Returns eNucleotide when more than 70% of the characters are in the DNA
// alphabet, otherwise eProtein when more than 70% are in the amino acid
// alphabet, otherwise eUndefined.  A null or empty sequence yields
// eUndefined.
CFormatGuess::ESequenceType
CFormatGuess::SequenceType(const char* str, unsigned length)
{
    if (str == 0) {
        return eUndefined;
    }
    if (length == 0) {
        length = (unsigned)::strlen(str);
    }
    if (length == 0) {
        // Nothing to analyze; also avoids the 0/0 division below.
        return eUndefined;
    }

    unsigned ATGC_content = 0;
    unsigned amino_acid_content = 0;

    for (unsigned i = 0; i < length; ++i) {
        unsigned char ch = str[i];
        char upch = (char)toupper(ch);

        if (isDNA_Alphabet(upch)) {
            ++ATGC_content;
        }
        if (isProtein_Alphabet(upch)) {
            ++amino_acid_content;
        }
    }

    double dna_content = (double)ATGC_content / (double)length;
    double prot_content = (double)amino_acid_content / (double)length;

    // The DNA test goes first: the DNA letters are a subset of the protein
    // alphabet, so a nucleotide sequence would pass the protein test too.
    if (dna_content > 0.7) {
        return eNucleotide;
    }
    if (prot_content > 0.7) {
        return eProtein;
    }
    return eUndefined;
}


// Guess the format of the file at "path".
// Returns eUnknown if the file cannot be opened; otherwise the result of
// Format(CNcbiIstream&) on the file opened in binary mode.
CFormatGuess::EFormat CFormatGuess::Format(const string& path)
{
    CNcbiIfstream input(path.c_str(), IOS_BASE::in | IOS_BASE::binary);

    if (!input.is_open()) {
        return eUnknown;
    }
    return Format(input);
}


// Guess the format of the data in "input" from (up to) its next 1024 bytes.
//
// The stream state is cleared and the stream is repositioned with seekg()
// to the position reported by tellg() on entry.
// NOTE(review): this presumably requires a seekable stream -- confirm
// against callers.
//
// Returns eXml, eBinaryASN, eFasta, eTextASN, or eUnknown when nothing
// could be read or none of the heuristics matched.
CFormatGuess::EFormat CFormatGuess::Format(CNcbiIstream& input)
{
    CT_POS_TYPE orig_pos = input.tellg();

    unsigned char buf[1024];

    input.read((char*)buf, sizeof(buf));
    size_t count = (size_t)input.gcount();
    input.clear();  // in case we reached eof
    input.seekg(orig_pos);

    if (!count) {
        return eUnknown;
    }

    // Buffer analysis (completely ad-hoc heuristics).

    // Check for XML signature...
    {{
        static const char xml_sig[] = "<?XML";
        const size_t xml_sig_len = sizeof(xml_sig) - 1;

        // ">=": a buffer holding exactly the signature is enough to match.
        if (count >= xml_sig_len) {
            bool xml_flag = true;
            for (size_t k = 0;  k < xml_sig_len;  ++k) {
                if ((char)toupper(buf[k]) != xml_sig[k]) {
                    xml_flag = false;
                    break;
                }
            }
            if (xml_flag) {
                return eXml;
            }
        }
    }}

    // check for binary ASN.1 - the presence of any non-printing characters
    // can confirm this
    unsigned int i = 0;
    {{
        for (i = 0;  i < count;  ++i) {
            if ( !isgraph(buf[i])  &&  !isspace(buf[i]) ) {
                return eBinaryASN;
            }
        }
    }}

    unsigned ATGC_content = 0;
    unsigned amino_acid_content = 0;

    unsigned seq_length = (unsigned)count;
    unsigned alpha_content = 0;

    const bool fasta_like = (buf[0] == '>');

    if (fasta_like) { // FASTA ?
        // The bound is tested before buf[i] is read, so a full buffer
        // without any line break cannot be over-read.
        for (i = 0;  i < count  &&  !isLineEnd(buf[i]);  ++i) {
            // skip the first line (presumed this is free-text information)
            unsigned char ch = buf[i];
            if (isalnum(ch) || isspace(ch)) {
                ++alpha_content;
            }
        }
        seq_length = (unsigned)count - i;
        if (seq_length == 0) {
            return eUnknown;   // No way to tell what format is this...
        }
    }

    // NOTE: this pass restarts from the beginning of the buffer, so the
    // characters of a FASTA header line are counted again, and every line
    // end adds to alpha_content twice (isspace + isLineEnd).  The thresholds
    // below were tuned against this behavior, so it is kept unchanged.
    for (i = 0; i < count; ++i) {
        unsigned char ch = buf[i];
        char upch = (char)toupper(ch);

        if (isalnum(ch) || isspace(ch)) {
            ++alpha_content;
        }
        if (isDNA_Alphabet(upch)) {
            ++ATGC_content;
        }
        if (isProtein_Alphabet(upch)) {
            ++amino_acid_content;
        }
        if (isLineEnd(ch)) {
            ++alpha_content;
            --seq_length;
        }
    }

    double a_content = (double)alpha_content / (double)count;

    // seq_length drops to 0 when nothing but line terminators follows the
    // header; there is no sequence data to judge then (and dividing by it
    // would be a division by zero), so the FASTA test is skipped.
    if (fasta_like  &&  seq_length != 0) {
        double dna_content = (double)ATGC_content / (double)seq_length;
        double prot_content = (double)amino_acid_content / (double)seq_length;

        if (dna_content > 0.7 && a_content > 0.91) {
            return eFasta;  // DNA fasta file
        }
        if (prot_content > 0.7 && a_content > 0.91) {
            return eFasta;  // Protein fasta file
        }
    }

    if (a_content > 0.80) {  // Text ASN ?
        // find the end of the first line
        size_t pos = 0;
        while (pos < count  &&  !isLineEnd(buf[pos])) {
            ++pos;
        }
        // roll it back to last non-space character...
        while (pos > 0) {
            --pos;
            if (!isspace(buf[pos])) break;
        }
        if (buf[pos] == '{') {  // "{" symbol says it's most likely ASN text
            return eTextASN;
        }
    }

    // A signature check used to follow here (second byte 0x80 with first
    // byte 0x30, 0x31 or >= 0xA0 => eBinaryASN), but both of its return
    // statements were commented out, so it had no effect.

    return eUnknown;
}


END_NCBI_SCOPE

/*
 * ===========================================================================
 * $Log: format_guess.cpp,v $
 * Revision 1000.5  2004/06/01 19:40:07  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.17
 *
 * Revision 1.17  2004/05/17 21:06:02  gorelenk
 * Added include of PCH ncbi_pch.hpp
 *
 * Revision 1.16  2004/04/01 12:17:20  kuznets
 * Added 'N' to the legal DNA alphabet. (isDNA_Alphabet)
 *
 * Revision 1.15  2004/03/24 20:35:34  ucko
 * Use the correct type (CT_POS_TYPE rather than int) for tellg's return value.
 *
 * Revision 1.14  2004/03/23 23:04:23  jcherry
 * clear() stream after reading; seekg() in just one place
 *
 * Revision 1.13  2004/03/23 22:29:50  jcherry
 * Added Format(CNcbiIstream& input)
 *
 * Revision 1.12  2004/03/02 20:06:59  johnson
 * bug fix: missing loop initializers
 *
 * Revision 1.11  2004/03/01 15:49:54  dicuccio
 * Added explicit check for binary ASN
 *
 * Revision 1.10  2003/12/02 20:16:09  kuznets
 * Improved ASN binary recognition by checking ASN specific signatures
 *
 * Revision 1.9  2003/11/26 14:34:16  kuznets
 * Fine tuned ascii content coefficient to better recognize binary asns
 *
 * Revision 1.8  2003/11/07 17:16:23  ivanov
 * Fixed  warnings on 64-bit Workshop compiler
 *
 * Revision 1.7  2003/07/10 19:58:25  ivanov
 * Get rid of compilation warning: removed double variable declaration
 *
 * Revision 1.6  2003/07/08 20:30:50  kuznets
 * Fixed bug with different "\n" coding in DOS-Windows and Unix.
 *
 * Revision 1.5  2003/07/07 19:54:06  kuznets
 * Improved format recognition of short fasta files
 *
 * Revision 1.4  2003/06/20 20:58:04  kuznets
 * Cleaned up amino-acid alphabet recognition.
 *
 * Revision 1.3  2003/05/13 15:18:02  kuznets
 * added sequence type guessing function
 *
 * Revision 1.2  2003/05/09 14:08:28  ucko
 * ios_base:: -> IOS_BASE:: for gcc 2.9x compatibility
 *
 * Revision 1.1  2003/05/08 19:46:34  kuznets
 * Initial revision
 *
 * ===========================================================================
 */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -