📄 format_guess.cpp
字号:
/*
 * ===========================================================================
 * PRODUCTION $Log: format_guess.cpp,v $
 * PRODUCTION Revision 1000.5  2004/06/01 19:40:07  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.17
 * PRODUCTION
 * ===========================================================================
 */
/*  $Id: format_guess.cpp,v 1000.5 2004/06/01 19:40:07 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author: Anatoliy Kuznetsov
 *
 * File Description:  Implemented methods to identify file formats.
 *
 */

#include <ncbi_pch.hpp>
#include <util/format_guess.hpp>
#include <corelib/ncbifile.hpp>

BEGIN_NCBI_SCOPE


// Check if letter belongs to the DNA alphabet (A, T, G, C, N).
// The caller is expected to pass an upper-case character.
static bool isDNA_Alphabet(char ch)
{
    // strchr() also matches the terminating NUL of the alphabet string,
    // so NUL has to be rejected explicitly.
    return ch != 0  &&  ::strchr("ATGCN", ch) != 0;
}


// Check if letter belongs to amino acid alphabet.
// The caller is expected to pass an upper-case character.
static bool isProtein_Alphabet(char ch)
{
    // See isDNA_Alphabet() for why NUL is excluded first.
    return ch != 0  &&  ::strchr("ACDEFGHIKLMNPQRSTVWYBZ", ch) != 0;
}


// Check if character belongs to the CR/LF group of symbols
static inline bool isLineEnd(char ch)
{
    return ch == 0x0D  ||  ch == 0x0A;
}


// Guess whether a character sequence is nucleotide or protein.
//
// str    - sequence characters; need not be NUL-terminated when a non-zero
//          length is given.
// length - number of characters to examine; 0 means "use strlen(str)".
//
// Returns eNucleotide when more than 70% of the characters belong to the
// DNA alphabet, otherwise eProtein when more than 70% belong to the amino
// acid alphabet, otherwise eUndefined. The comparison is case-insensitive.
// A null or empty sequence yields eUndefined.
CFormatGuess::ESequenceType
CFormatGuess::SequenceType(const char* str, unsigned length)
{
    if (str == 0) {
        return eUndefined;
    }
    if (length == 0) {
        length = static_cast<unsigned>(::strlen(str));
    }
    if (length == 0) {
        // Nothing to analyse; also avoids dividing by zero below.
        return eUndefined;
    }

    unsigned ATGC_content = 0;
    unsigned amino_acid_content = 0;

    for (unsigned i = 0;  i < length;  ++i) {
        // Go through unsigned char so that toupper() gets a valid argument.
        unsigned char ch = static_cast<unsigned char>(str[i]);
        char upch = static_cast<char>(toupper(ch));

        if (isDNA_Alphabet(upch)) {
            ++ATGC_content;
        }
        if (isProtein_Alphabet(upch)) {
            ++amino_acid_content;
        }
    }

    double dna_content =
        static_cast<double>(ATGC_content) / static_cast<double>(length);
    double prot_content =
        static_cast<double>(amino_acid_content) / static_cast<double>(length);

    // DNA is tested first: its letters are also valid amino acid codes.
    if (dna_content > 0.7) {
        return eNucleotide;
    }
    if (prot_content > 0.7) {
        return eProtein;
    }
    return eUndefined;
}


// Guess the format of the file at 'path'.
// The file is opened in binary mode; eUnknown is returned when it cannot be
// opened, otherwise the result of Format(CNcbiIstream&) is returned.
CFormatGuess::EFormat CFormatGuess::Format(const string& path)
{
    CNcbiIfstream input(path.c_str(), IOS_BASE::in | IOS_BASE::binary);
    if ( !input.is_open() ) {
        return eUnknown;
    }
    return Format(input);
}


// Guess the format of the data available from 'input'.
//
// Up to 1024 bytes are read from the current position; the stream state is
// then cleared and the stream is repositioned to where it was on entry.
//
// Returns eXml, eBinaryASN, eFasta or eTextASN when the corresponding
// heuristic fires, and eUnknown otherwise (including when nothing could be
// read).
CFormatGuess::EFormat CFormatGuess::Format(CNcbiIstream& input)
{
    CT_POS_TYPE orig_pos = input.tellg();

    unsigned char buf[1024];

    input.read(reinterpret_cast<char*>(buf), sizeof(buf));
    size_t count = static_cast<size_t>(input.gcount());
    input.clear();  // in case we reached eof
    input.seekg(orig_pos);

    if ( !count ) {
        return eUnknown;
    }

    // Buffer analysis (completely ad-hoc heuristics).

    // Check for XML signature...
    {{
        if (count > 5) {
            const char* xml_sig = "<?XML";
            bool xml_flag = true;
            for (unsigned i = 0;  i < 5;  ++i) {
                unsigned char ch = buf[i];
                char upch = static_cast<char>(toupper(ch));
                if (upch != xml_sig[i]) {
                    xml_flag = false;
                    break;
                }
            }
            if (xml_flag) {
                return eXml;
            }
        }
    }}

    // check for binary ASN.1 - the presence of any non-printing characters
    // can confirm this
    unsigned int i = 0;
    {{
        for (i = 0;  i < count;  ++i) {
            if ( !isgraph(buf[i])  &&  !isspace(buf[i]) ) {
                return eBinaryASN;
            }
        }
    }}

    unsigned ATGC_content = 0;
    unsigned amino_acid_content = 0;
    unsigned seq_length = static_cast<unsigned>(count);
    unsigned alpha_content = 0;

    if (buf[0] == '>') { // FASTA ?
        // Skip the first line (presumed this is free-text information).
        // The bound is tested before buf[i] is touched so that a buffer
        // without any line end is never read past its last valid byte.
        for (i = 0;  i < count  &&  !isLineEnd(buf[i]);  ++i) {
            unsigned char ch = buf[i];
            if (isalnum(ch)  ||  isspace(ch)) {
                ++alpha_content;
            }
        }
        seq_length = static_cast<unsigned>(count) - i;
        if (seq_length == 0) {
            return eUnknown;  // No way to tell what format is this...
        }
    }

    // NOTE(review): this pass restarts at 0, so FASTA header characters are
    // counted a second time (and line ends are counted twice in
    // alpha_content). The ratios below may therefore exceed 1.0. Presumably
    // the thresholds rely on this; confirm before changing it.
    for (i = 0;  i < count;  ++i) {
        unsigned char ch = buf[i];
        char upch = static_cast<char>(toupper(ch));

        if (isalnum(ch)  ||  isspace(ch)) {
            ++alpha_content;
        }
        if (isDNA_Alphabet(upch)) {
            ++ATGC_content;
        }
        if (isProtein_Alphabet(upch)) {
            ++amino_acid_content;
        }
        if (isLineEnd(ch)) {
            ++alpha_content;
            --seq_length;
        }
    }

    // NOTE(review): seq_length can be 0 here (e.g. the buffer holds only a
    // header line and its line end); the divisions then rely on IEEE
    // inf/NaN results, exactly as before.
    double dna_content =
        static_cast<double>(ATGC_content) / static_cast<double>(seq_length);
    double prot_content =
        static_cast<double>(amino_acid_content)
        / static_cast<double>(seq_length);
    double a_content =
        static_cast<double>(alpha_content) / static_cast<double>(count);

    if (buf[0] == '>') {
        if (dna_content > 0.7  &&  a_content > 0.91) {
            return eFasta;  // DNA fasta file
        }
        if (prot_content > 0.7  &&  a_content > 0.91) {
            return eFasta;  // Protein fasta file
        }
    }

    if (a_content > 0.80) {  // Text ASN ?
        // extract first line
        char line[sizeof(buf)] = {0,};
        char* ptr = line;
        for (i = 0;  i < count;  ++i) {
            if (isLineEnd(buf[i])) {
                break;
            }
            *ptr = static_cast<char>(buf[i]);
            ++ptr;
        }
        // roll it back to last non-space character...
        while (ptr > line) {
            --ptr;
            if ( !isspace(static_cast<unsigned char>(*ptr)) ) {
                break;
            }
        }
        if (*ptr == '{') {  // "{" symbol says it's most likely ASN text
            return eTextASN;
        }
    }

    // A former binary-ASN "signature check" on buf[0]/buf[1] used to sit
    // here with both of its return statements commented out. It had no
    // effect and read buf[1] even when only one byte had been read, so it
    // has been removed.

    return eUnknown;
}


END_NCBI_SCOPE


/*
 * ===========================================================================
 * $Log: format_guess.cpp,v $
 * Revision 1000.5  2004/06/01 19:40:07  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.17
 *
 * Revision 1.17  2004/05/17 21:06:02  gorelenk
 * Added include of PCH ncbi_pch.hpp
 *
 * Revision 1.16  2004/04/01 12:17:20  kuznets
 * Added 'N' to the legal DNA alphabet. (isDNA_Alphabet)
 *
 * Revision 1.15  2004/03/24 20:35:34  ucko
 * Use the correct type (CT_POS_TYPE rather than int) for tellg's return value.
 *
 * Revision 1.14  2004/03/23 23:04:23  jcherry
 * clear() stream after reading; seekg() in just one place
 *
 * Revision 1.13  2004/03/23 22:29:50  jcherry
 * Added Format(CNcbiIstream& input)
 *
 * Revision 1.12  2004/03/02 20:06:59  johnson
 * bug fix: missing loop initializers
 *
 * Revision 1.11  2004/03/01 15:49:54  dicuccio
 * Added explicit check for binary ASN
 *
 * Revision 1.10  2003/12/02 20:16:09  kuznets
 * Improved ASN binary recognition by checking ASN specific signatures
 *
 * Revision 1.9  2003/11/26 14:34:16  kuznets
 * Fine tuned ascii content coefficient to better recognize binary asns
 *
 * Revision 1.8  2003/11/07 17:16:23  ivanov
 * Fixed warnings on 64-bit Workshop compiler
 *
 * Revision 1.7  2003/07/10 19:58:25  ivanov
 * Get rid of compilation warning: removed double variable declaration
 *
 * Revision 1.6  2003/07/08 20:30:50  kuznets
 * Fixed bug with different "\n" coding in DOS-Windows and Unix.
 *
 * Revision 1.5  2003/07/07 19:54:06  kuznets
 * Improved format recognition of short fasta files
 *
 * Revision 1.4  2003/06/20 20:58:04  kuznets
 * Cleaned up amino-acid alphabet recognition.
 *
 * Revision 1.3  2003/05/13 15:18:02  kuznets
 * added sequence type guessing function
 *
 * Revision 1.2  2003/05/09 14:08:28  ucko
 * ios_base:: -> IOS_BASE:: for gcc 2.9x compatibility
 *
 * Revision 1.1  2003/05/08 19:46:34  kuznets
 * Initial revision
 *
 * ===========================================================================
 */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -