📄 swiutfconversions.c
字号:
/* SWIutfconversions, Unicode conversions */ /****************License************************************************ * * Copyright 2000-2003. ScanSoft, Inc. * * Use of this software is subject to notices and obligations set forth * in the SpeechWorks Public License - Software Version 1.2 which is * included with this software. * * ScanSoft is a registered trademark of ScanSoft, Inc., and OpenSpeech, * SpeechWorks and the SpeechWorks logo are registered trademarks or * trademarks of SpeechWorks International, Inc. in the United States * and other countries. * ***********************************************************************/ /* -----1=0-------2=0-------3=0-------4=0-------5=0-------6=0-------7=0-------8 */ #include <stdio.h> #include <string.h> #include <SWIutfconversions.h> #if 0 #include "Encoding.h" #include <cstring> #include <cwchar> #include <vector> #include <algorithm> bool initialized = false; // --------------------------------------------------------------------------- // Define a registry of decoder functions. // --------------------------------------------------------------------------- typedef bool (*DECODERFUNCTION)(const char *, std::basic_string<wchar_t> &); class EncoderEntry { public: const char * name; DECODERFUNCTION function; EncoderEntry(const char * x, DECODERFUNCTION y) : name(x), function(y) { } EncoderEntry(const EncoderEntry & x) : name(x.name), function(x.function) { } EncoderEntry & operator=(const EncoderEntry & x) { if (this != &x) { name = x.name; function = x.function; } return *this; } }; bool operator<(const EncoderEntry & x, const EncoderEntry & y) { return strcmp(x.name, y.name) < 0; } // --------------------------------------------------------------------------- typedef std::vector<EncoderEntry> DECODER_REGISTRY; DECODER_REGISTRY decoderRegistry; void InitializeDecoder() { if(initialized) return; decoderRegistry.push_back(EncoderEntry("utf-8", DecodeUTF8)); decoderRegistry.push_back(EncoderEntry("utf8", DecodeUTF8)); decoderRegistry.push_back(EncoderEntry("us-ascii", DecodeASCII)); decoderRegistry.push_back(EncoderEntry("us_ascii", DecodeASCII)); decoderRegistry.push_back(EncoderEntry("usascii", DecodeASCII)); decoderRegistry.push_back(EncoderEntry("ascii", DecodeASCII)); decoderRegistry.push_back(EncoderEntry("iso8859-1", DecodeISO8859_1)); decoderRegistry.push_back(EncoderEntry("iso-8859-1", DecodeISO8859_1)); decoderRegistry.push_back(EncoderEntry("iso_8859-1", DecodeISO8859_1)); decoderRegistry.push_back(EncoderEntry("latin1", DecodeISO8859_1)); decoderRegistry.push_back(EncoderEntry("latin-1", DecodeISO8859_1)); decoderRegistry.push_back(EncoderEntry("latin_1", DecodeISO8859_1)); decoderRegistry.push_back(EncoderEntry("ibm-819", DecodeISO8859_1)); decoderRegistry.push_back(EncoderEntry("ibm819", DecodeISO8859_1)); decoderRegistry.push_back(EncoderEntry("iso8859-2", DecodeISO8859_2)); decoderRegistry.push_back(EncoderEntry("iso-8859-2", DecodeISO8859_2)); decoderRegistry.push_back(EncoderEntry("iso_8859-2", DecodeISO8859_2)); decoderRegistry.push_back(EncoderEntry("latin2", DecodeISO8859_2)); decoderRegistry.push_back(EncoderEntry("latin-2", DecodeISO8859_2)); decoderRegistry.push_back(EncoderEntry("latin_2", DecodeISO8859_2)); decoderRegistry.push_back(EncoderEntry("iso8859-3", DecodeISO8859_3)); decoderRegistry.push_back(EncoderEntry("iso-8859-3", DecodeISO8859_3)); decoderRegistry.push_back(EncoderEntry("iso_8859-3", DecodeISO8859_3)); decoderRegistry.push_back(EncoderEntry("latin3", DecodeISO8859_3)); decoderRegistry.push_back(EncoderEntry("latin-3", DecodeISO8859_3)); decoderRegistry.push_back(EncoderEntry("latin_3", DecodeISO8859_3)); decoderRegistry.push_back(EncoderEntry("iso8859-4", DecodeISO8859_4)); decoderRegistry.push_back(EncoderEntry("iso-8859-4", DecodeISO8859_4)); decoderRegistry.push_back(EncoderEntry("iso_8859-4", DecodeISO8859_4)); decoderRegistry.push_back(EncoderEntry("latin4", DecodeISO8859_4)); decoderRegistry.push_back(EncoderEntry("latin-4", DecodeISO8859_4)); decoderRegistry.push_back(EncoderEntry("latin_4", DecodeISO8859_4)); decoderRegistry.push_back(EncoderEntry("iso8859-15", DecodeISO8859_15)); decoderRegistry.push_back(EncoderEntry("iso-8859-15", DecodeISO8859_15)); decoderRegistry.push_back(EncoderEntry("iso_8859-15", DecodeISO8859_15)); decoderRegistry.push_back(EncoderEntry("latin9", DecodeISO8859_15)); decoderRegistry.push_back(EncoderEntry("latin-9", DecodeISO8859_15)); decoderRegistry.push_back(EncoderEntry("latin_9", DecodeISO8859_15)); std::sort(decoderRegistry.begin(), decoderRegistry.end()); initialized = true; } static bool DoInitialization() { if(initialized) return true; InitializeDecoder(); return true; } bool do_initialization = DoInitialization(); int DecodeString(const char * encodingName, const char * inputString, std::basic_string<wchar_t> & outputString) { if (!initialized || encodingName == NULL || inputString == NULL) return -1; // (1) Convert string to lowercase. std::basic_string<char> encoding(encodingName); for (unsigned int i = 0; i < encoding.length(); ++i) if (encoding[i] < 0x5B && encoding[i] > 0x40) encoding[i] += 0x20; DECODER_REGISTRY::iterator j = std::lower_bound(decoderRegistry.begin(), decoderRegistry.end(), EncoderEntry(encoding.c_str(), NULL)); if (j == decoderRegistry.end() || encoding != (*j).name) return -1; if ((*j).function(inputString, outputString)) return 0; return 1; } // --------------------------------------------------------------------------- // Now we define the 'simple' decoder functions // --------------------------------------------------------------------------- bool DecodeASCII(const char * in, std::basic_string<wchar_t> & out) { out.erase(); while (*in != '\0') { char c = *in; if (c > 0x7f || c < 0) return false; out += wchar_t(c); ++in; } return true; } bool DecodeISO8859_1(const char * in, std::basic_string<wchar_t> & out) { out.erase(); while (*in != '\0') { out += wchar_t(*in); ++in; } return true; } bool DecodeISO8859_2(const char * in, std::basic_string<wchar_t> & out) { out.erase(); wchar_t w; while (*in != '\0') { switch (*in) { case 0xA1: w = 0x0104; break; // LATIN CAPITAL LETTER A WITH OGONEK case 0xA2: w = 0x02D8; break; // BREVE case 0xA3: w = 0x0141; break; // LATIN CAPITAL LETTER L WITH STROKE case 0xA5: w = 0x013D; break; // LATIN CAPITAL LETTER L WITH CARON case 0xA6: w = 0x015A; break; // LATIN CAPITAL LETTER S WITH ACUTE case 0xA9: w = 0x0160; break; // LATIN CAPITAL LETTER S WITH CARON case 0xAA: w = 0x015E; break; // LATIN CAPITAL LETTER S WITH CEDILLA case 0xAB: w = 0x0164; break; // LATIN CAPITAL LETTER T WITH CARON case 0xAC: w = 0x0179; break; // LATIN CAPITAL LETTER Z WITH ACUTE case 0xAE: w = 0x017D; break; // LATIN CAPITAL LETTER Z WITH CARON case 0xAF: w = 0x017B; break; // LATIN CAPITAL LETTER Z WITH DOT ABOVE case 0xB1: w = 0x0105; break; // LATIN SMALL LETTER A WITH OGONEK case 0xB2: w = 0x02DB; break; // OGONEK case 0xB3: w = 0x0142; break; // LATIN SMALL LETTER L WITH STROKE case 0xB5: w = 0x013E; break; // LATIN SMALL LETTER L WITH CARON case 0xB6: w = 0x015B; break; // LATIN SMALL LETTER S WITH ACUTE case 0xB7: w = 0x02C7; break; // CARON case 0xB9: w = 0x0161; break; // LATIN SMALL LETTER S WITH CARON case 0xBA: w = 0x015F; break; // LATIN SMALL LETTER S WITH CEDILLA case 0xBB: w = 0x0165; break; // LATIN SMALL LETTER T WITH CARON case 0xBC: w = 0x017A; break; // LATIN SMALL LETTER Z WITH ACUTE case 0xBD: w = 0x02DD; break; // DOUBLE ACUTE ACCENT case 0xBE: w = 0x017E; break; // LATIN SMALL LETTER Z WITH CARON case 0xBF: w = 0x017C; break; // LATIN SMALL LETTER Z WITH DOT ABOVE case 0xC0: w = 0x0154; break; // LATIN CAPITAL LETTER R WITH ACUTE case 0xC3: w = 0x0102; break; // LATIN CAPITAL LETTER A WITH BREVE case 0xC5: w = 0x0139; break; // LATIN CAPITAL LETTER L WITH ACUTE case 0xC6: w = 0x0106; break; // LATIN CAPITAL LETTER C WITH ACUTE case 0xC8: w = 0x010C; break; // LATIN CAPITAL LETTER C WITH CARON case 0xCA: w = 0x0118; break; // LATIN CAPITAL LETTER E WITH OGONEK case 0xCC: w = 0x011A; break; // LATIN CAPITAL LETTER E WITH CARON case 0xCF: w = 0x010E; break; // LATIN CAPITAL LETTER D WITH CARON case 0xD0: w = 0x0110; break; // LATIN CAPITAL LETTER D WITH STROKE case 0xD1: w = 0x0143; break; // LATIN CAPITAL LETTER N WITH ACUTE case 0xD2: w = 0x0147; break; // LATIN CAPITAL LETTER N WITH CARON case 0xD5: w = 0x0150; break; // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE case 0xD8: w = 0x0158; break; // LATIN CAPITAL LETTER R WITH CARON case 0xD9: w = 0x016E; break; // LATIN CAPITAL LETTER U WITH RING ABOVE case 0xDB: w = 0x0170; break; // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE case 0xDE: w = 0x0162; break; // LATIN CAPITAL LETTER T WITH CEDILLA case 0xE0: w = 0x0155; break; // LATIN SMALL LETTER R WITH ACUTE case 0xE3: w = 0x0103; break; // LATIN SMALL LETTER A WITH BREVE case 0xE5: w = 0x013A; break; // LATIN SMALL LETTER L WITH ACUTE case 0xE6: w = 0x0107; break; // LATIN SMALL LETTER C WITH ACUTE case 0xE8: w = 0x010D; break; // LATIN SMALL LETTER C WITH CARON case 0xEA: w = 0x0119; break; // LATIN SMALL LETTER E WITH OGONEK case 0xEC: w = 0x011B; break; // LATIN SMALL LETTER E WITH CARON case 0xEF: w = 0x010F; break; // LATIN SMALL LETTER D WITH CARON case 0xF0: w = 0x0111; break; // LATIN SMALL LETTER D WITH STROKE case 0xF1: w = 0x0144; break; // LATIN SMALL LETTER N WITH ACUTE case 0xF2: w = 0x0148; break; // LATIN SMALL LETTER N WITH CARON case 0xF5: w = 0x0151; break; // LATIN SMALL LETTER O WITH DOUBLE ACUTE case 0xF8: w = 0x0159; break; // LATIN SMALL LETTER R WITH CARON case 0xF9: w = 0x016F; break; // LATIN SMALL LETTER U WITH RING ABOVE case 0xFB: w = 0x0171; break; // LATIN SMALL LETTER U WITH DOUBLE ACUTE case 0xFE: w = 0x0163; break; // LATIN SMALL LETTER T WITH CEDILLA case 0xFF: w = 0x02D9; break; // DOT ABOVE default: w = wchar_t(*in); break; } out += w; ++in; } return true; } bool DecodeISO8859_3(const char * in, std::basic_string<wchar_t> & out) { out.erase(); wchar_t w; while (*in != '\0') { switch (*in) { case 0xA1: w = 0x0126; break; // LATIN CAPITAL LETTER H WITH STROKE case 0xA2: w = 0x02D8; break; // BREVE case 0xA6: w = 0x0124; break; // LATIN CAPITAL LETTER H WITH CIRCUMFLEX case 0xA9: w = 0x0130; break; // LATIN CAPITAL LETTER I WITH DOT ABOVE case 0xAA: w = 0x015E; break; // LATIN CAPITAL LETTER S WITH CEDILLA case 0xAB: w = 0x011E; break; // LATIN CAPITAL LETTER G WITH BREVE case 0xAC: w = 0x0134; break; // LATIN CAPITAL LETTER J WITH CIRCUMFLEX case 0xAF: w = 0x017B; break; // LATIN CAPITAL LETTER Z WITH DOT ABOVE case 0xB1: w = 0x0127; break; // LATIN SMALL LETTER H WITH STROKE case 0xB6: w = 0x0125; break; // LATIN SMALL LETTER H WITH CIRCUMFLEX case 0xB9: w = 0x0131; break; // LATIN SMALL LETTER DOTLESS I case 0xBA: w = 0x015F; break; // LATIN SMALL LETTER S WITH CEDILLA case 0xBB: w = 0x011F; break; // LATIN SMALL LETTER G WITH BREVE case 0xBC: w = 0x0135; break; // LATIN SMALL LETTER J WITH CIRCUMFLEX case 0xBF: w = 0x017C; break; // LATIN SMALL LETTER Z WITH DOT ABOVE case 0xC5: w = 0x010A; break; // LATIN CAPITAL LETTER C WITH DOT ABOVE case 0xC6: w = 0x0108; break; // LATIN CAPITAL LETTER C WITH CIRCUMFLEX case 0xD5: w = 0x0120; break; // LATIN CAPITAL LETTER G WITH DOT ABOVE case 0xD8: w = 0x011C; break; // LATIN CAPITAL LETTER G WITH CIRCUMFLEX case 0xDD: w = 0x016C; break; // LATIN CAPITAL LETTER U WITH BREVE case 0xDE: w = 0x015C; break; // LATIN CAPITAL LETTER S WITH CIRCUMFLEX case 0xE5: w = 0x010B; break; // LATIN SMALL LETTER C WITH DOT ABOVE case 0xE6: w = 0x0109; break; // LATIN SMALL LETTER C WITH CIRCUMFLEX case 0xF5: w = 0x0121; break; // LATIN SMALL LETTER G WITH DOT ABOVE case 0xF8: w = 0x011D; break; // LATIN SMALL LETTER G WITH CIRCUMFLEX case 0xFD: w = 0x016D; break; // LATIN SMALL LETTER U WITH BREVE case 0xFE: w = 0x015D; break; // LATIN SMALL LETTER S WITH CIRCUMFLEX case 0xFF: w = 0x02D9; break; // DOT ABOVE default: w = wchar_t(*in); break; } out += w; ++in; } return true; } bool DecodeISO8859_4(const char * in, std::basic_string<wchar_t> & out) { out.erase(); wchar_t w; while (*in != '\0') { switch (*in) { case 0xA1: w = 0x0104; break; // LATIN CAPITAL LETTER A WITH OGONEK case 0xA2: w = 0x0138; break; // LATIN SMALL LETTER KRA case 0xA3: w = 0x0156; break; // LATIN CAPITAL LETTER R WITH CEDILLA case 0xA5: w = 0x0128; break; // LATIN CAPITAL LETTER I WITH TILDE case 0xA6: w = 0x013B; break; // LATIN CAPITAL LETTER L WITH CEDILLA case 0xA9: w = 0x0160; break; // LATIN CAPITAL LETTER S WITH CARON case 0xAA: w = 0x0112; break; // LATIN CAPITAL LETTER E WITH MACRON case 0xAB: w = 0x0122; break; // LATIN CAPITAL LETTER G WITH CEDILLA case 0xAC: w = 0x0166; break; // LATIN CAPITAL LETTER T WITH STROKE case 0xAE: w = 0x017D; break; // LATIN CAPITAL LETTER Z WITH CARON case 0xB1: w = 0x0105; break; // LATIN SMALL LETTER A WITH OGONEK case 0xB2: w = 0x02DB; break; // OGONEK case 0xB3: w = 0x0157; break; // LATIN SMALL LETTER R WITH CEDILLA case 0xB5: w = 0x0129; break; // LATIN SMALL LETTER I WITH TILDE case 0xB6: w = 0x013C; break; // LATIN SMALL LETTER L WITH CEDILLA case 0xB7: w = 0x02C7; break; // CARON case 0xB9: w = 0x0161; break; // LATIN SMALL LETTER S WITH CARON case 0xBA: w = 0x0113; break; // LATIN SMALL LETTER E WITH MACRON case 0xBB: w = 0x0123; break; // LATIN SMALL LETTER G WITH CEDILLA case 0xBC: w = 0x0167; break; // LATIN SMALL LETTER T WITH STROKE case 0xBD: w = 0x014A; break; // LATIN CAPITAL LETTER ENG case 0xBE: w = 0x017E; break; // LATIN SMALL LETTER Z WITH CARON case 0xBF: w = 0x014B; break; // LATIN SMALL LETTER ENG case 0xC0: w = 0x0100; break; // LATIN CAPITAL LETTER A WITH MACRON case 0xC7: w = 0x012E; break; // LATIN CAPITAL LETTER I WITH OGONEK case 0xC8: w = 0x010C; break; // LATIN CAPITAL LETTER C WITH CARON case 0xCA: w = 0x0118; break; // LATIN CAPITAL LETTER E WITH OGONEK case 0xCC: w = 0x0116; break; // LATIN CAPITAL LETTER E WITH DOT ABOVE case 0xCF: w = 0x012A; break; // LATIN CAPITAL LETTER I WITH MACRON case 0xD0: w = 0x0110; break; // LATIN CAPITAL LETTER D WITH STROKE case 0xD1: w = 0x0145; break; // LATIN CAPITAL LETTER N WITH CEDILLA case 0xD2: w = 0x014C; break; // LATIN CAPITAL LETTER O WITH MACRON
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -