📄 utf8.cpp
字号:
/*
 * ===========================================================================
 * PRODUCTION $Log: utf8.cpp,v $
 * PRODUCTION Revision 1000.2  2004/06/01 19:40:43  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6
 * PRODUCTION
 * ===========================================================================
 */

/*  $Id: utf8.cpp,v 1000.2 2004/06/01 19:40:43 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  Aleksey Vinokurov, Vladimir Ivanov
 *
 * File Description:  UTF8 converter functions
 *
 */

#include <ncbi_pch.hpp>
#include <util/utf8.hpp>


BEGIN_NCBI_SCOPE
BEGIN_SCOPE(utf8)


// Longest UTF-8 sequence (in bytes) that StringToCode() recognizes.
static const size_t kMaxUtf8SeqLen = 6;


// Translation tables.
//
// Each entry is the ASCII-7 replacement for one Unicode code point.
// Only 7-bit characters are expected here, plus two special values:
//   0x00 - the code point cannot be translated;
//   0xFF - the character should be skipped.
//
// tblTrans is indexed by (code - 0x80) and covers 0x0080..0x02FF; the
// trailing comment on each row is the code point divided by 16.
//
// NOTE(review): CodeToChar() returns the 0xFF entry as-is with eSuccess;
// this presumably relies on kSkipChar being (char)0xFF -- confirm against
// util/utf8.hpp.
// NOTE(review): 0xAB and 0xBB map to 0 while 0xAC and 0xBC map to '"';
// this may be an off-by-one in the original data -- left unchanged.

static const unsigned char tblTrans[] =
{
    // Latin Base
    // 0     1     2     3     4     5     6     7     8     9     A     B     C     D     E     F
       0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   // 08
       0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   // 09
       0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   'a',   0,   '"',   0,    0,   '-',  // 0A
     0xFF,   0,   '2',  '3', '\'',   0,    0,   '.',   0,   '1',  'o',   0,   '"',   0,    0,    0,   // 0B
      'A',  'A',  'A',  'A',  'A',  'A',  'A',  'C',  'E',  'E',  'E',  'E',  'I',  'I',  'I',  'I',  // 0C
      'D',  'N',  'O',  'O',  'O',  'O',  'O',  'x',  'O',  'U',  'U',  'U',  'U',  'Y',   0,   'B',  // 0D
      'a',  'a',  'a',  'a',  'a',  'a',  'a',  'c',  'e',  'e',  'e',  'e',  'i',  'i',  'i',  'i',  // 0E
      'o',  'n',  'o',  'o',  'o',  'o',  'o',  '-',  'o',  'u',  'u',  'u',  'u',  'y',   0,   'y',  // 0F
    // Latin A
    // 0     1     2     3     4     5     6     7     8     9     A     B     C     D     E     F
      'A',  'a',  'A',  'a',  'A',  'a',  'C',  'c',  'C',  'c',  'C',  'c',  'C',  'c',  'D',  'd',  // 10
      'D',  'd',  'E',  'e',  'E',  'e',  'E',  'e',  'E',  'e',  'E',  'e',  'G',  'g',  'G',  'g',  // 11
      'G',  'g',  'G',  'g',  'H',  'h',  'H',  'h',  'I',  'i',  'I',  'i',  'I',  'i',  'I',  'i',  // 12
      'I',  'i',  'J',  'j',  'J',  'j',  'K',  'k',  'k',  'L',  'l',  'L',  'l',  'L',  'l',  'L',  // 13
      'l',  'L',  'l',  'N',  'n',  'N',  'n',  'N',  'n',  'n',  'N',  'n',  'O',  'o',  'O',  'o',  // 14
      'O',  'o',  'O',  'o',  'R',  'r',  'R',  'r',  'R',  'r',  'S',  's',  'S',  's',  'S',  's',  // 15
      'S',  's',  'T',  't',  'T',  't',  'T',  't',  'U',  'u',  'U',  'u',  'U',  'u',  'U',  'u',  // 16
      'U',  'u',  'U',  'u',  'W',  'w',  'Y',  'y',  'Y',  'Z',  'z',  'Z',  'z',  'Z',  'z',   0,   // 17
    // Latin B
    // 0     1     2     3     4     5     6     7     8     9     A     B     C     D     E     F
      'b',  'B',   0,    0,    0,    0,    0,   'C',  'c',  'D',  'D',   0,    0,    0,    0,    0,   // 18
      'E',  'F',  'f',  'G',   0,    0,    0,   'I',  'K',  'k',   0,    0,    0,   'N',  'n',   0,   // 19
      'O',  'o',   0,    0,   'P',  'p',  'R',   0,    0,    0,    0,   't',  'T',  't',  'T',  'U',  // 1A
      'u',   0,    0,   'Y',  'y',  'Z',  'z',  'Z',   0,    0,   'z',   0,    0,    0,    0,    0,   // 1B
       0,    0,    0,   '!',  'D',  'd',  'd',  'L',  'L',  'l',  'N',  'N',  'n',  'A',  'a',  'I',  // 1C
      'i',  'O',  'o',  'U',  'u',  'U',  'u',  'U',  'u',  'U',  'u',  'U',  'u',   0,   'A',  'a',  // 1D
      'A',  'a',  'A',  'a',  'G',  'g',  'G',  'g',  'K',  'k',  'O',  'o',  'O',  'o',  'Z',  'z',  // 1E
      'j',  'D',  'D',  'd',  'G',  'g',   0,    0,   'N',  'n',  'A',  'a',   0,    0,   'O',  'o',  // 1F
      'A',  'a',  'A',  'a',  'E',  'e',  'E',  'e',  'I',  'i',  'I',  'i',  'O',  'o',  'O',  'o',  // 20
      'R',  'r',  'R',  'r',  'U',  'u',  'U',  'u',  'S',  's',  'T',  't',   0,    0,   'H',  'h',  // 21
       0,    0,    0,    0,   'Z',  'z',  'A',  'a',  'E',  'e',  'O',  'o',  'O',  'o',  'O',  'o',  // 22
      'O',  'o',  'Y',  'y',   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   // 23
       0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   // 24
    // IPA Extensions
    // 0     1     2     3     4     5     6     7     8     9     A     B     C     D     E     F
       0,   'a',   0,    0,    0,    0,   'd',  'd',   0,    0,    0,    0,    0,    0,    0,    0,   // 25
      'g',  'g',  'G',   0,    0,    0,   'h',  'h',  'i',  'i',  'I',   0,    0,    0,    0,    0,   // 26
       0,   'm',   0,   'n',  'N',   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   // 27
      'R',   0,   's',   0,    0,    0,    0,    0,   't',  'u',   0,    0,    0,    0,    0,   'Y',  // 28
      'Z',  'Z',  'z',  'z',   0,    0,    0,    0,   'O',  'B',   0,   'G',  'H',  'j',   0,   'L',  // 29
      'q',   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   // 2A
    // Spacing Modifiers
    // 0     1     2     3     4     5     6     7     8     9     A     B     C     D     E     F
      'h',  'h',  'j',  'r',   0,    0,    0,   'w',  'y', '\'',  '"', '\'', '\'', '\'', '\'', '\'', // 2B
      '?',  '?',  '<',  '>',  '^',  'v',  '^',  'v', '\'',  '-', '\'',  '`', '\'',  '_', '\'',  '`',  // 2C
       0,    0,  '\'', '\'',   0,    0,   '+',  '-',  '~',  '.',  '.',   0,   '~',  '"',   0,   'x',  // 2D
       0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   // 2E
       0,   'l',  's',  'x',   0,    0,    0,    0,    0,    0,    0,    0,   'v',  '=',  '"',   0    // 2F
};


// tblTransA is indexed by (code - 0x1E00) and covers 0x1E00..0x1EFF
// (Latin Extended Additional); the trailing comment on each row is the
// code point divided by 16.

static const unsigned char tblTransA[] =
{
    // Latin Extended Additional
    // 0     1     2     3     4     5     6     7     8     9     A     B     C     D     E     F
      'A',  'a',  'B',  'b',  'B',  'b',  'B',  'b',  'C',  'c',  'D',  'd',  'D',  'd',  'D',  'd',  // 1E0
      'D',  'd',  'D',  'd',  'E',  'e',  'E',  'e',  'E',  'e',  'E',  'e',  'E',  'e',  'F',  'f',  // 1E1
      'G',  'g',  'H',  'h',  'H',  'h',  'H',  'h',  'H',  'h',  'H',  'h',  'I',  'i',  'I',  'i',  // 1E2
      'K',  'k',  'K',  'k',  'K',  'k',  'L',  'l',  'L',  'l',  'L',  'l',  'L',  'l',  'M',  'm',  // 1E3
      'M',  'm',  'M',  'm',  'N',  'n',  'N',  'n',  'N',  'n',  'N',  'n',  'O',  'o',  'O',  'o',  // 1E4
      'O',  'o',  'O',  'o',  'P',  'p',  'P',  'p',  'R',  'r',  'R',  'r',  'R',  'r',  'R',  'r',  // 1E5
      'S',  's',  'S',  's',  'S',  's',  'S',  's',  'S',  's',  'T',  't',  'T',  't',  'T',  't',  // 1E6
      'T',  't',  'U',  'u',  'U',  'u',  'U',  'u',  'U',  'u',  'U',  'u',  'V',  'v',  'V',  'v',  // 1E7
      'W',  'w',  'W',  'w',  'W',  'w',  'W',  'w',  'W',  'w',  'X',  'x',  'X',  'x',  'Y',  'y',  // 1E8
      'Z',  'z',  'Z',  'z',  'Z',  'z',  'h',  't',  'w',  'y',  'a',  'f',   0,    0,    0,    0,   // 1E9
      'A',  'a',  'A',  'a',  'A',  'a',  'A',  'a',  'A',  'a',  'A',  'a',  'A',  'a',  'A',  'a',  // 1EA
      'A',  'a',  'A',  'a',  'A',  'a',  'A',  'a',  'E',  'e',  'E',  'e',  'E',  'e',  'E',  'e',  // 1EB
      'E',  'e',  'E',  'e',  'E',  'e',  'E',  'e',  'I',  'i',  'I',  'i',  'O',  'o',  'O',  'o',  // 1EC
      'O',  'o',  'O',  'o',  'O',  'o',  'O',  'o',  'O',  'o',  'O',  'o',  'O',  'o',  'O',  'o',  // 1ED
      'O',  'o',  'O',  'o',  'U',  'u',  'U',  'u',  'U',  'u',  'U',  'u',  'U',  'u',  'U',  'u',  // 1EE
      'U',  'u',  'Y',  'y',  'Y',  'y',  'Y',  'y',  'Y',  'y',   0,    0,    0,    0,    0,    0    // 1EF
};


// Store "res" into "*status" (when "status" is not NULL) and return "ch".
// For use in functions that have a "status" out-parameter.
// Wrapped in do/while(0) so that it behaves as a single statement and
// must be followed by a semicolon, including inside if/else.
//
#define RETURN_S(ch, res)                   \
    do {                                    \
        if (status) *status = (res);        \
        return (ch);                        \
    } while (0)


// Same as RETURN_S, but also store "len" into "*seq_len" (when "seq_len"
// is not NULL).  For use in functions that have both "seq_len" and
// "status" out-parameters.
//
#define RETURN_LS(ch, len, res)             \
    do {                                    \
        if (seq_len) *seq_len = (len);      \
        if (status)  *status  = (res);      \
        return (ch);                        \
    } while (0)


// Convert first UTF-8 symbol of "src" into ASCII-7 character.
// "ascii_table" specifies whether to use ASCII-7 translation tables.
// Length of the retrieved UTF-8 symbol is returned in "*seq_len"
// (if "seq_len" is not NULL).
// Conversion status is returned in "*status" (if "status" is not NULL).
// Return resulting ASCII-7 character.
// NOTE:  If the UTF-8 symbol has no ASCII-7 equivalent, then return
//        kOutrangeChar or kSkipChar.
//
char StringToChar(const string&      src,
                  size_t*            seq_len,
                  bool               ascii_table,
                  EConversionStatus* status)
{
    EConversionStatus stat;  // Status of the UTF-8 decoding step

    // Decode one UTF-8 symbol into its Unicode code
    const long dst_code = StringToCode(src, seq_len, &stat);
    if (status) *status = stat;

    // Decoding failed: StringToCode() already returned the substitute
    // character (kOutrangeChar or kSkipChar), pass it through.
    if (stat != eSuccess) {
        return (char)dst_code;
    }

    // Convert into appropriate 7-bit character via conversion tables
    if (ascii_table) {
        return CodeToChar(dst_code, status);
    }

    // Without the tables, anything above 127 (0x7F) is substituted
    // with kOutrangeChar; 7-bit codes are left as is.
    if (dst_code > 0x7F) {
        RETURN_S(kOutrangeChar, eOutrange);
    }
    return (char)dst_code;
}


// Convert UTF-8 string "src" into the ASCII-7 string with
// graphically similar characters -- using StringToChar().
// Characters reported as kSkipChar are omitted from the result.
// Return resulting ASCII-7 string.
//
string StringToAscii(const string& src, bool ascii_table)
{
    const size_t src_len = src.size();
    string       dst;

    for (size_t i = 0;  i < src_len; ) {
        size_t utf_len = 1;  // Length of the current UTF-8 symbol

        // Hand over at most one maximal sequence: this keeps the loop
        // linear (no copy of the whole remaining tail on each step) and
        // does not cut the input at embedded NUL bytes.
        const char ch = StringToChar(src.substr(i, kMaxUtf8SeqLen),
                                     &utf_len, ascii_table);
        if (ch != kSkipChar) {
            dst += ch;
        }
        i += utf_len;
    }
    return dst;
}


// Convert first UTF-8 symbol of "src" into a Unicode symbol code.
// Length of the retrieved UTF-8 symbol is returned in "*seq_len"
// (if "seq_len" is not NULL).
// Conversion status is returned in "*status" (if "status" is not NULL).
// Return resulting Unicode symbol code.
// NOTE:  - A byte that cannot start a UTF-8 symbol (a stray continuation
//          byte, 0xFE or 0xFF) yields kOutrangeChar / eOutrange, length 1.
//        - A truncated sequence, or one whose continuation bytes do not
//          have the form 10xxxxxx, yields kSkipChar / eSkip, length 1.
//        - An empty "src" is read as a NUL character of length 1.
//
long StringToCode(const string&      src,
                  size_t*            seq_len,
                  EConversionStatus* status)
{
    const unsigned char lead = src.empty() ? 0 : (unsigned char) src[0];

    // If character less than 0x80 we put it as is
    if (lead < 0x80) {
        RETURN_LS((long)lead, 1, eSuccess);
    }

    // Determine the length of the UTF-8 symbol in bytes from the exact
    // bit prefix of the lead byte.
    size_t utf_len = 0;
    if      ((lead & 0xE0) == 0xC0)  utf_len = 2;  // 110xxxxx
    else if ((lead & 0xF0) == 0xE0)  utf_len = 3;  // 1110xxxx
    else if ((lead & 0xF8) == 0xF0)  utf_len = 4;  // 11110xxx
    else if ((lead & 0xFC) == 0xF8)  utf_len = 5;  // 111110xx
    else if ((lead & 0xFE) == 0xFC)  utf_len = 6;  // 1111110x
    else {
        // Bad character. Save it as kOutrangeChar
        RETURN_LS(kOutrangeChar, 1, eOutrange);
    }

    // Broken unicode sequence: not enough bytes left
    if (utf_len > src.size()) {
        RETURN_LS((long)kSkipChar, 1, eSkip);
    }

    // Payload bits of the lead byte, then 6 bits per continuation byte.
    // At most 1 + 5*6 = 31 bits are accumulated, so the shifts cannot
    // overflow even a 32-bit long.
    long dst_code = lead & (0xFF >> (utf_len + 1));
    for (size_t j = 1;  j < utf_len;  ++j) {
        const unsigned char cont = (unsigned char) src[j];
        if ((cont & 0xC0) != 0x80) {
            // Broken unicode sequence: not a continuation byte.
            // Consume only the lead byte so the caller can resync.
            RETURN_LS((long)kSkipChar, 1, eSkip);
        }
        dst_code = (dst_code << 6) | (cont & 0x3F);
    }

    // Return result
    RETURN_LS(dst_code, utf_len, eSuccess);
}


// Convert UTF-8 string "src" into the vector of Unicode symbol codes
// using StringToCode().
// Return resulting vector.
//
vector<long> StringToVector(const string& src)
{
    const size_t src_len = src.size();
    vector<long> dst;

    for (size_t i = 0;  i < src_len; ) {
        size_t utf_len = 1;  // Length of the current UTF-8 symbol

        // Bounded copy, for the same reasons as in StringToAscii()
        const long code = StringToCode(src.substr(i, kMaxUtf8SeqLen),
                                       &utf_len);
        dst.push_back(code);
        i += utf_len;
    }
    return dst;
}


// Translate Unicode symbol code "src" into graphically similar ASCII-7
// character.
// Conversion status is returned in "*status" (if "status" is not NULL).
// Return resulting ASCII-7 character.
// NOTE:  If the Unicode symbol has no ASCII-7 equivalent, then return
//        kOutrangeChar or kSkipChar.  Negative codes are not valid and
//        are reported as kOutrangeChar / eOutrange.
//
char CodeToChar(const long src, EConversionStatus* status)
{
    // Negative values are not Unicode codes
    if (src < 0) {
        RETURN_S(kOutrangeChar, eOutrange);
    }
    // 7-bit codes are returned as is
    if (src < 0x80) {
        RETURN_S((char)src, eSuccess);
    }
    // Codes 0x0300..0x036F are skipped
    if (src >= 0x0300  &&  src <= 0x036F) {
        RETURN_S(kSkipChar, eSkip);
    }
    // Codes 0x1E00..0x1EFF are translated via tblTransA
    if (src >= 0x1E00  &&  src <= 0x1EFF) {
        const unsigned char ch = tblTransA[src - 0x1E00];
        if (!ch) {
            RETURN_S(kOutrangeChar, eOutrange);
        }
        RETURN_S((char)ch, eSuccess);
    }
    // Codes 0xFE20..0xFE2F are skipped
    if (src >= 0xFE20  &&  src <= 0xFE2F) {
        RETURN_S(kSkipChar, eSkip);
    }
    // Everything else above the range covered by tblTrans
    if (src > 0x2FF) {
        RETURN_S(kOutrangeChar, eOutrange);
    }
    // Codes 0x0080..0x02FF are translated via tblTrans
    const unsigned char ch = tblTrans[src - 0x80];
    if (!ch) {
        RETURN_S(kOutrangeChar, eOutrange);
    }
    RETURN_S((char)ch, eSuccess);
}


END_SCOPE(utf8)
END_NCBI_SCOPE


/*
 * ===========================================================================
 * $Log: utf8.cpp,v $
 * Revision 1000.2  2004/06/01 19:40:43  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6
 *
 * Revision 1.6  2004/05/17 21:06:02  gorelenk
 * Added include of PCH ncbi_pch.hpp
 *
 * Revision 1.5  2002/01/24 20:10:21  vinokuro
 * Skip characters processing has been fixed in StringToAscii function.
 *
 * Revision 1.4  2002/01/18 19:24:13  ivanov
 * Changed result char's upper limit from 0xFF to 0x7F in StringToChar()
 *
 * Revision 1.3  2001/05/17 15:07:15  lavr
 * Typos corrected
 *
 * Revision 1.2  2001/04/18 16:31:59  ivanov
 * Change types TUnicodeChar, TUnicodeString to simple types.
 * TUnicode char to long, TUnicodeString to vector<long>.
 *
 * Revision 1.1  2001/04/06 19:14:37  ivanov
 * Initial revision
 * ===========================================================================
 */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -