⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utf8.cpp

📁 ncbi源码
💻 CPP
字号:
/*
 * ===========================================================================
 * PRODUCTION $Log: utf8.cpp,v $
 * PRODUCTION Revision 1000.2  2004/06/01 19:40:43  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6
 * PRODUCTION
 * ===========================================================================
 */

/*  $Id: utf8.cpp,v 1000.2 2004/06/01 19:40:43 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author: Aleksey Vinokurov, Vladimir Ivanov
 *
 * File Description:  UTF8 converter functions
 *
 */

#include <ncbi_pch.hpp>
#include <util/utf8.hpp>


BEGIN_NCBI_SCOPE
BEGIN_SCOPE(utf8)


// Translation tables.
// The entries are codes from the ASCII-7 table, so each table should hold
// only 7-bit characters plus two special values:
//   0x00 - unable to translate,
//   0xFF - character should be skipped.

// Indexed by (code - 0x80); covers codes 0x0080..0x02FF (see CodeToChar()).
static const unsigned char tblTrans[] =
{
    // Latin Base
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0 , // 08
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0 , // 09
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  'a',  0,  '"',  0,   0,  '-', // 0A
   0xFF, 0,  '2', '3','\'',  0,   0,  '.',  0,  '1', 'o',  0,  '"',  0,   0,   0 , // 0B
   'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', // 0C
   'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'Y',  0,  'B', // 0D
   'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', // 0E
   'o', 'n', 'o', 'o', 'o', 'o', 'o', '-', 'o', 'u', 'u', 'u', 'u', 'y',  0,  'y', // 0F
    // Latin A
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
   'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', // 10
   'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', // 11
   'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', // 12
   'I', 'i', 'J', 'j', 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L', // 13
   'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', 'N', 'n', 'O', 'o', 'O', 'o', // 14
   'O', 'o', 'O', 'o', 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S', 's', // 15
   'S', 's', 'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', // 16
   'U', 'u', 'U', 'u', 'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z',  0 , // 17
    // Latin B
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
   'b', 'B',  0 ,  0 ,  0 ,  0 ,  0 , 'C', 'c', 'D', 'D',  0 ,  0 ,  0 ,  0 ,  0 , // 18
   'E', 'F', 'f', 'G',  0 ,  0 ,  0 , 'I', 'K', 'k',  0 ,  0 ,  0 , 'N', 'n',  0 , // 19
   'O', 'o',  0 ,  0 , 'P', 'p', 'R',  0 ,  0 ,  0 ,  0 , 't', 'T', 't', 'T', 'U', // 1A
   'u',  0 ,  0 , 'Y', 'y', 'Z', 'z', 'Z',  0 ,  0 , 'z',  0 ,  0 ,  0 ,  0 ,  0 , // 1B
    0 ,  0 ,  0 , '!', 'D', 'd', 'd', 'L', 'L', 'l', 'N', 'N', 'n', 'A', 'a', 'I', // 1C
   'i', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',  0 , 'A', 'a', // 1D
   'A', 'a', 'A', 'a', 'G', 'g', 'G', 'g', 'K', 'k', 'O', 'o', 'O', 'o', 'Z', 'z', // 1E
   'j', 'D', 'D', 'd', 'G', 'g',  0 ,  0 , 'N', 'n', 'A', 'a',  0,   0 , 'O', 'o', // 1F
   'A', 'a', 'A', 'a', 'E', 'e', 'E', 'e', 'I', 'i', 'I', 'i', 'O', 'o', 'O', 'o', // 20
   'R', 'r', 'R', 'r', 'U', 'u', 'U', 'u', 'S', 's', 'T', 't',  0 ,  0 , 'H', 'h', // 21
    0 ,  0 ,  0 ,  0 , 'Z', 'z', 'A', 'a', 'E', 'e', 'O', 'o', 'O', 'o', 'O', 'o', // 22
   'O', 'o', 'Y', 'y',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , // 23
    0 ,  0,   0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , // 24
    // IPA Extensions
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
    0 , 'a',  0 ,  0 ,  0 ,  0 , 'd', 'd',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , // 25
   'g', 'g', 'G',  0 ,  0 ,  0 , 'h' ,'h', 'i', 'i', 'I',  0 ,  0 ,  0 ,  0 ,  0 , // 26
    0,  'm',  0,  'n', 'N',  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0 , // 27
   'R',  0,  's',  0,   0,   0,   0,   0,  't', 'u',  0,   0,   0,   0,   0,  'Y', // 28
   'Z', 'Z', 'z',  'z', 0,   0,   0,   0,  'O', 'B',  0,  'G', 'H', 'j',  0,  'L', // 29
   'q',  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0 , // 2A
    // Spacing Modifiers
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
   'h', 'h', 'j', 'r',  0 ,  0 ,  0 , 'w', 'y','\'', '"','\'','\'','\'','\'','\'', // 2B
   '?', '?', '<', '>', '^', 'v', '^', 'v','\'', '-','\'', '`','\'', '_','\'', '`', // 2C
    0,   0, '\'','\'',  0 ,  0 , '+', '-', '~', '.', '.',  0,  '~', '"' , 0 , 'x', // 2D
    0 ,  0,   0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , // 2E
    0 , 'l', 's', 'x',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , 'v' ,'=', '"',  0   // 2F
};

// Indexed by (code - 0x1E00); covers codes 0x1E00..0x1EFF (see CodeToChar()).
static const unsigned char tblTransA[] =
{
 // 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
   'A', 'a', 'B', 'b', 'B', 'b', 'B', 'b', 'C', 'c', 'D', 'd', 'D', 'd', 'D', 'd', // 1E0
   'D', 'd', 'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'F', 'f', // 1E1
   'G', 'g', 'H', 'h', 'H', 'h', 'H', 'h', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', // 1E2
   'K', 'k', 'K', 'k', 'K', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'M', 'm', // 1E3
   'M', 'm', 'M', 'm', 'N', 'n', 'N', 'n', 'N', 'n', 'N', 'n', 'O', 'o', 'O', 'o', // 1E4
   'O', 'o', 'O', 'o', 'P', 'p', 'P', 'p', 'R', 'r', 'R', 'r', 'R', 'r', 'R', 'r', // 1E5
   'S', 's', 'S', 's', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't', // 1E6
   'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'V', 'v', 'V', 'v', // 1E7
   'W', 'w', 'W', 'w', 'W', 'w', 'W', 'w', 'W', 'w', 'X', 'x', 'X', 'x', 'Y', 'y', // 1E8
   'Z', 'z', 'Z', 'z', 'Z', 'z', 'h', 't', 'w', 'y', 'a', 'f',  0 ,  0 ,  0 ,  0 , // 1E9
   'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', // 1EA
   'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', // 1EB
   'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'I', 'i', 'I', 'i', 'O', 'o', 'O', 'o', // 1EC
   'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', // 1ED
   'O', 'o', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', // 1EE
   'U', 'u', 'Y', 'y', 'Y', 'y', 'Y', 'y', 'Y', 'y',  0 ,  0 ,  0 ,  0,   0,   0   // 1EF
};


// Macro to return a character together with a status.
// For use in functions that have an "EConversionStatus* status" parameter.
// Wrapped in do/while(0) so that it behaves as a single statement and can be
// used safely in unbraced if/else branches; always terminate it with ';'.
//
#define RETURN_S(ch,res)                    \
    do {                                    \
        if (status) *status = (res);        \
        return (ch);                        \
    } while (0)


// Macro to return a character together with a status and a length.
// For use in functions that have both "size_t* seq_len" and
// "EConversionStatus* status" parameters.
//
#define RETURN_LS(ch,len,res)               \
    do {                                    \
        if (seq_len) *seq_len = (len);      \
        if (status) *status = (res);        \
        return (ch);                        \
    } while (0)


// Decode the first UTF-8 symbol of the buffer [data, data + size).
// This is the worker behind StringToCode(); it takes a pointer and a length
// so that callers iterating over a string do not need to copy its tail for
// every symbol.
//
// On return "*seq_len" (if not NULL) holds the number of bytes consumed and
// "*status" (if not NULL) holds the conversion status:
//   eSuccess  - the decoded symbol code is returned;
//   eOutrange - the first byte cannot start a UTF-8 symbol; kOutrangeChar is
//               returned and one byte is consumed;
//   eSkip     - the sequence is truncated or contains a byte that is not a
//               continuation byte; kSkipChar is returned and one byte is
//               consumed.
//
static long s_DecodeSymbol(const char*        data,
                           size_t             size,
                           size_t*            seq_len,
                           EConversionStatus* status)
{
    // Legacy behaviour: an empty input is handled as a single NUL symbol.
    unsigned char ch = size ? (unsigned char) data[0] : 0;

    // A character less than 0x80 is returned as is
    if (ch < 0x80) {
        RETURN_LS (ch, 1, eSuccess);
    }

    // Determine the length of the UTF-8 symbol in bytes
    size_t utf_len;
    if (ch >= 0xFE) {
        // 0xFE and 0xFF never start a UTF-8 symbol. Decoding them as a
        // 6-byte lead would also shift a set bit into the sign bit of a
        // 32-bit long.
        RETURN_LS (kOutrangeChar, 1, eOutrange);
    }
    else if ((ch & 0xFC) == 0xFC) utf_len = 6; // 6 bytes length
    else if ((ch & 0xF8) == 0xF8) utf_len = 5; // 5 bytes length
    else if ((ch & 0xF0) == 0xF0) utf_len = 4; // 4 bytes length
    else if ((ch & 0xE0) == 0xE0) utf_len = 3; // 3 bytes length
    else if ((ch & 0xC0) == 0xC0) utf_len = 2; // 2 bytes length
    else {
        // Bad character (a continuation byte in the lead position).
        // Save it as kOutrangeChar.
        RETURN_LS (kOutrangeChar, 1, eOutrange);
    }

    // Broken unicode sequence: not enough bytes left
    if (utf_len > size) {
        RETURN_LS ((long)kSkipChar, 1, eSkip);
    }

    // The lead byte carries its payload in the bits below the length prefix
    long dst_code = ch & (0xFF >> utf_len);

    for (size_t j = 1;  j < utf_len;  ++j) {
        ch = (unsigned char) data[j];
        // Broken unicode sequence: every trailing byte must be 10xxxxxx
        if ((ch & 0xC0) != 0x80) {
            RETURN_LS ((long)kSkipChar, 1, eSkip);
        }
        dst_code = (dst_code << 6) | (ch & 0x3F);
    }

    // Return result
    RETURN_LS (dst_code, utf_len, eSuccess);
}


// Convert the first UTF-8 symbol of the buffer [data, data + size) into an
// ASCII-7 character. This is the worker behind StringToChar(); see that
// function for the meaning of the remaining parameters and the result.
//
static char s_SymbolToChar(const char*        data,
                           size_t             size,
                           size_t*            seq_len,
                           bool               ascii_table,
                           EConversionStatus* status)
{
    EConversionStatus stat;  // Status of the decoding step

    // Process one UTF character
    long dst_code = s_DecodeSymbol(data, size, seq_len, &stat);
    if (status) *status = stat;

    // Decoding failed: the code already is kOutrangeChar or kSkipChar
    if (stat != eSuccess) {
        return (char) dst_code;
    }

    if (ascii_table) {
        // Convert into appropriate 7-bit character via conversion table
        return CodeToChar(dst_code, status);
    }

    // If the character is greater than 127 (0x7F) then substitute it
    // with kOutrangeChar, else leave it as is.
    if (dst_code > 0x7F) {
        RETURN_S (kOutrangeChar, eOutrange);
    }
    return (char) dst_code;
}


// Convert first UTF-8 symbol of "src" into ASCII-7 character.
// "ascii_table" specifies whether to use ASCII-7 translation tables.
// Length of the retrieved UTF-8 symbol is returned in "*seq_len"
// (if "seq_len" is not NULL).
// Conversion status is returned in "*status" (if "status" is not NULL).
// Return resulting ASCII-7 character.
// NOTE:  If the UTF-8 symbol has no ASCII-7 equivalent, then return
//        kOutrangeChar or kSkipChar.
//
char StringToChar(const string&      src,
                  size_t*            seq_len,
                  bool               ascii_table,
                  EConversionStatus* status)
{
    return s_SymbolToChar(src.data(), src.size(),
                          seq_len, ascii_table, status);
}


// Convert UTF-8 string "src" into the ASCII-7 string with
// graphically similar characters.
// Characters reported as kSkipChar are dropped from the result.
// Return resulting ASCII-7 string.
//
string StringToAscii(const string& src, bool ascii_table)
{
    const char*  data    = src.data();
    const size_t src_len = src.size();

    string dst;
    dst.reserve(src_len);  // every symbol yields at most one character

    for (size_t i = 0;  i < src_len; ) {
        size_t utf_len = 1;  // Length of UTF symbol

        // Process one UTF character in place (no copy of the string tail)
        char ch = s_SymbolToChar(data + i, src_len - i,
                                 &utf_len, ascii_table, 0);

        // Add character to the result string
        if ( ch != kSkipChar ) dst += ch;
        i += utf_len;
    }
    return dst;
}


// Convert first UTF-8 symbol of "src" into a Unicode symbol code.
// Length of the retrieved UTF-8 symbol is returned in "*seq_len"
// (if "seq_len" is not NULL).
// Conversion status is returned in "*status" (if "status" is not NULL).
// Return resulting Unicode symbol code.
// NOTE:  If the UTF-8 symbol has no Unicode equivalent, then return
//        kOutrangeChar or kSkipChar.
//
long StringToCode(const string&      src,
                  size_t*            seq_len,
                  EConversionStatus* status)
{
    return s_DecodeSymbol(src.data(), src.size(), seq_len, status);
}


// Convert UTF-8 string "src" into the vector of Unicode symbol codes.
// Return resulting vector.
//
vector<long> StringToVector (const string& src)
{
    const char*  data    = src.data();
    const size_t src_len = src.size();

    vector<long> dst;

    for (size_t i = 0;  i < src_len; ) {
        size_t utf_len = 1;  // Length of UTF symbol

        // Process one UTF character in place (no copy of the string tail)
        long ch = s_DecodeSymbol(data + i, src_len - i, &utf_len, 0);

        // Add character to the result vector
        dst.push_back(ch);
        i += utf_len;
    }
    return dst;
}


// Translate Unicode symbol code "src" into graphically similar ASCII-7
// character.
// Conversion status is returned in "*status" (if "status" is not NULL).
// Return resulting ASCII-7 character.
// NOTE:  If the Unicode symbol has no ASCII-7 equivalent, then return
//        kOutrangeChar or kSkipChar.
//
char CodeToChar(const long src, EConversionStatus* status)
{
    unsigned char ch;

    // A negative value is not a symbol code
    if (src < 0) {
        RETURN_S (kOutrangeChar, eOutrange);
    }
    // ASCII-7 characters are returned as is
    if (src < 0x80) {
        RETURN_S ((char)src, eSuccess);
    }
    // Codes 0x0300..0x036F are skipped
    if ((src >= 0x0300) && (src <= 0x036F)) {
        RETURN_S (kSkipChar, eSkip);
    }
    // Codes 0x1E00..0x1EFF are translated via tblTransA
    if ((src >= 0x1E00) && (src <= 0x1EFF)) {
        ch = tblTransA[src-0x1E00];
        if (!ch) {
            RETURN_S (kOutrangeChar, eOutrange);
        }
        RETURN_S ((char)ch, eSuccess);
    }
    // Codes 0xFE20..0xFE2F are skipped
    if ((src >= 0xFE20) && (src <= 0xFE2F)) {
        RETURN_S (kSkipChar, eSkip);
    }
    // Everything else above the range covered by tblTrans is untranslatable
    if (src > 0x2FF) {
        RETURN_S (kOutrangeChar, eOutrange);
    }
    // Codes 0x0080..0x02FF are translated via tblTrans
    ch = tblTrans[src-0x80];
    if (!ch) {
        RETURN_S (kOutrangeChar, eOutrange);
    }
    RETURN_S ((char)ch, eSuccess);
}


END_SCOPE(utf8)
END_NCBI_SCOPE


/*
 * ===========================================================================
 * $Log: utf8.cpp,v $
 * Revision 1000.2  2004/06/01 19:40:43  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6
 *
 * Revision 1.6  2004/05/17 21:06:02  gorelenk
 * Added include of PCH ncbi_pch.hpp
 *
 * Revision 1.5  2002/01/24 20:10:21  vinokuro
 * Skip characters processing has been fixed in StringToAscii function.
 *
 * Revision 1.4  2002/01/18 19:24:13  ivanov
 * Changed result char's upper limit from 0xFF to 0x7F in StringToChar()
 *
 * Revision 1.3  2001/05/17 15:07:15  lavr
 * Typos corrected
 *
 * Revision 1.2  2001/04/18 16:31:59  ivanov
 * Change types TUnicodeChar, TUnicodeString to simple types.
 * TUnicode char to long, TUnicodeString to vector<long>.
 *
 * Revision 1.1  2001/04/06 19:14:37  ivanov
 * Initial revision
 * ===========================================================================
 */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -