📄 swiutfconversions.c

📁 OSB-PIK-OpenVXI-3.0.0源代码 “中国XML论坛 - 专业的XML技术讨论区--XML在语音技术中的应用”
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* SWIutfconversions, Unicode conversions */ /****************License************************************************  *  * Copyright 2000-2003.  ScanSoft, Inc.      *  * Use of this software is subject to notices and obligations set forth   * in the SpeechWorks Public License - Software Version 1.2 which is   * included with this software.   *  * ScanSoft is a registered trademark of ScanSoft, Inc., and OpenSpeech,   * SpeechWorks and the SpeechWorks logo are registered trademarks or   * trademarks of SpeechWorks International, Inc. in the United States   * and other countries.  *  ***********************************************************************/  /* -----1=0-------2=0-------3=0-------4=0-------5=0-------6=0-------7=0-------8  */  #include <stdio.h> #include <string.h> #include <SWIutfconversions.h>  #if 0 #include "Encoding.h" #include <cstring> #include <cwchar> #include <vector> #include <algorithm>  bool initialized = false;   // --------------------------------------------------------------------------- // Define a registry of decoder functions. // ---------------------------------------------------------------------------  typedef bool (*DECODERFUNCTION)(const char *,                                 std::basic_string<wchar_t> &);  class EncoderEntry { public:   const char * name;   DECODERFUNCTION function;    EncoderEntry(const char * x, DECODERFUNCTION y)     : name(x), function(y) { }    EncoderEntry(const EncoderEntry & x) : name(x.name), function(x.function) { }    EncoderEntry & operator=(const EncoderEntry & x)   { if (this != &x) { name = x.name;  function = x.function; }     return *this; } };  bool operator<(const EncoderEntry & x, const EncoderEntry & y) { return strcmp(x.name, y.name) < 0; }  // ---------------------------------------------------------------------------  typedef std::vector<EncoderEntry> DECODER_REGISTRY; DECODER_REGISTRY decoderRegistry;  void InitializeDecoder() {   if(initialized)     return;   decoderRegistry.push_back(EncoderEntry("utf-8",       DecodeUTF8));   decoderRegistry.push_back(EncoderEntry("utf8",        DecodeUTF8));    decoderRegistry.push_back(EncoderEntry("us-ascii",    DecodeASCII));   decoderRegistry.push_back(EncoderEntry("us_ascii",    DecodeASCII));   decoderRegistry.push_back(EncoderEntry("usascii",     DecodeASCII));   decoderRegistry.push_back(EncoderEntry("ascii",       DecodeASCII));    decoderRegistry.push_back(EncoderEntry("iso8859-1",   DecodeISO8859_1));   decoderRegistry.push_back(EncoderEntry("iso-8859-1",  DecodeISO8859_1));   decoderRegistry.push_back(EncoderEntry("iso_8859-1",  DecodeISO8859_1));   decoderRegistry.push_back(EncoderEntry("latin1",      DecodeISO8859_1));   decoderRegistry.push_back(EncoderEntry("latin-1",     DecodeISO8859_1));   decoderRegistry.push_back(EncoderEntry("latin_1",     DecodeISO8859_1));   decoderRegistry.push_back(EncoderEntry("ibm-819",     DecodeISO8859_1));   decoderRegistry.push_back(EncoderEntry("ibm819",      DecodeISO8859_1));    decoderRegistry.push_back(EncoderEntry("iso8859-2",   DecodeISO8859_2));   decoderRegistry.push_back(EncoderEntry("iso-8859-2",  DecodeISO8859_2));   decoderRegistry.push_back(EncoderEntry("iso_8859-2",  DecodeISO8859_2));   decoderRegistry.push_back(EncoderEntry("latin2",      DecodeISO8859_2));   decoderRegistry.push_back(EncoderEntry("latin-2",     DecodeISO8859_2));   decoderRegistry.push_back(EncoderEntry("latin_2",     DecodeISO8859_2));    decoderRegistry.push_back(EncoderEntry("iso8859-3",   DecodeISO8859_3));   decoderRegistry.push_back(EncoderEntry("iso-8859-3",  DecodeISO8859_3));   decoderRegistry.push_back(EncoderEntry("iso_8859-3",  DecodeISO8859_3));   decoderRegistry.push_back(EncoderEntry("latin3",      DecodeISO8859_3));   decoderRegistry.push_back(EncoderEntry("latin-3",     DecodeISO8859_3));   decoderRegistry.push_back(EncoderEntry("latin_3",     DecodeISO8859_3));    decoderRegistry.push_back(EncoderEntry("iso8859-4",   DecodeISO8859_4));   decoderRegistry.push_back(EncoderEntry("iso-8859-4",  DecodeISO8859_4));   decoderRegistry.push_back(EncoderEntry("iso_8859-4",  DecodeISO8859_4));   decoderRegistry.push_back(EncoderEntry("latin4",      DecodeISO8859_4));   decoderRegistry.push_back(EncoderEntry("latin-4",     DecodeISO8859_4));   decoderRegistry.push_back(EncoderEntry("latin_4",     DecodeISO8859_4));    decoderRegistry.push_back(EncoderEntry("iso8859-15",  DecodeISO8859_15));   decoderRegistry.push_back(EncoderEntry("iso-8859-15", DecodeISO8859_15));   decoderRegistry.push_back(EncoderEntry("iso_8859-15", DecodeISO8859_15));   decoderRegistry.push_back(EncoderEntry("latin9",      DecodeISO8859_15));   decoderRegistry.push_back(EncoderEntry("latin-9",     DecodeISO8859_15));   decoderRegistry.push_back(EncoderEntry("latin_9",     DecodeISO8859_15));    std::sort(decoderRegistry.begin(), decoderRegistry.end());    initialized = true; }  static bool DoInitialization() {   if(initialized)     return true;   InitializeDecoder();   return true; } bool do_initialization = DoInitialization();   int DecodeString(const char * encodingName,                  const char * inputString,                  std::basic_string<wchar_t> & outputString) {   if (!initialized || encodingName == NULL || inputString == NULL)     return -1;    // (1) Convert string to lowercase.   std::basic_string<char> encoding(encodingName);   for (unsigned int i = 0; i < encoding.length(); ++i)     if (encoding[i] < 0x5B && encoding[i] > 0x40)       encoding[i] += 0x20;    DECODER_REGISTRY::iterator j      = std::lower_bound(decoderRegistry.begin(), decoderRegistry.end(),                        EncoderEntry(encoding.c_str(), NULL));    if (j == decoderRegistry.end() || encoding != (*j).name) return -1;    if ((*j).function(inputString, outputString)) return 0;   return 1; }   // --------------------------------------------------------------------------- // Now we define the 'simple' decoder functions // ---------------------------------------------------------------------------  bool DecodeASCII(const char * in, std::basic_string<wchar_t> & out) {   out.erase();    while (*in != '\0') {     char c = *in;     if (c > 0x7f || c < 0) return false;     out += wchar_t(c);     ++in;   }    return true; }   bool DecodeISO8859_1(const char * in, std::basic_string<wchar_t> & out) {   out.erase();    while (*in != '\0') {     out += wchar_t(*in);     ++in;   }    return true; }   bool DecodeISO8859_2(const char * in, std::basic_string<wchar_t> & out) {   out.erase();   wchar_t w;    while (*in != '\0') {     switch (*in) {     case 0xA1:  w = 0x0104;  break; // LATIN CAPITAL LETTER A WITH OGONEK     case 0xA2:  w = 0x02D8;  break; // BREVE     case 0xA3:  w = 0x0141;  break; // LATIN CAPITAL LETTER L WITH STROKE     case 0xA5:  w = 0x013D;  break; // LATIN CAPITAL LETTER L WITH CARON     case 0xA6:  w = 0x015A;  break; // LATIN CAPITAL LETTER S WITH ACUTE     case 0xA9:  w = 0x0160;  break; // LATIN CAPITAL LETTER S WITH CARON     case 0xAA:  w = 0x015E;  break; // LATIN CAPITAL LETTER S WITH CEDILLA     case 0xAB:  w = 0x0164;  break; // LATIN CAPITAL LETTER T WITH CARON     case 0xAC:  w = 0x0179;  break; // LATIN CAPITAL LETTER Z WITH ACUTE     case 0xAE:  w = 0x017D;  break; // LATIN CAPITAL LETTER Z WITH CARON     case 0xAF:  w = 0x017B;  break; // LATIN CAPITAL LETTER Z WITH DOT ABOVE     case 0xB1:  w = 0x0105;  break; // LATIN SMALL LETTER A WITH OGONEK     case 0xB2:  w = 0x02DB;  break; // OGONEK     case 0xB3:  w = 0x0142;  break; // LATIN SMALL LETTER L WITH STROKE     case 0xB5:  w = 0x013E;  break; // LATIN SMALL LETTER L WITH CARON     case 0xB6:  w = 0x015B;  break; // LATIN SMALL LETTER S WITH ACUTE     case 0xB7:  w = 0x02C7;  break; // CARON     case 0xB9:  w = 0x0161;  break; // LATIN SMALL LETTER S WITH CARON     case 0xBA:  w = 0x015F;  break; // LATIN SMALL LETTER S WITH CEDILLA     case 0xBB:  w = 0x0165;  break; // LATIN SMALL LETTER T WITH CARON     case 0xBC:  w = 0x017A;  break; // LATIN SMALL LETTER Z WITH ACUTE     case 0xBD:  w = 0x02DD;  break; // DOUBLE ACUTE ACCENT     case 0xBE:  w = 0x017E;  break; // LATIN SMALL LETTER Z WITH CARON     case 0xBF:  w = 0x017C;  break; // LATIN SMALL LETTER Z WITH DOT ABOVE     case 0xC0:  w = 0x0154;  break; // LATIN CAPITAL LETTER R WITH ACUTE     case 0xC3:  w = 0x0102;  break; // LATIN CAPITAL LETTER A WITH BREVE     case 0xC5:  w = 0x0139;  break; // LATIN CAPITAL LETTER L WITH ACUTE     case 0xC6:  w = 0x0106;  break; // LATIN CAPITAL LETTER C WITH ACUTE     case 0xC8:  w = 0x010C;  break; // LATIN CAPITAL LETTER C WITH CARON     case 0xCA:  w = 0x0118;  break; // LATIN CAPITAL LETTER E WITH OGONEK     case 0xCC:  w = 0x011A;  break; // LATIN CAPITAL LETTER E WITH CARON     case 0xCF:  w = 0x010E;  break; // LATIN CAPITAL LETTER D WITH CARON     case 0xD0:  w = 0x0110;  break; // LATIN CAPITAL LETTER D WITH STROKE     case 0xD1:  w = 0x0143;  break; // LATIN CAPITAL LETTER N WITH ACUTE     case 0xD2:  w = 0x0147;  break; // LATIN CAPITAL LETTER N WITH CARON     case 0xD5:  w = 0x0150;  break; // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE     case 0xD8:  w = 0x0158;  break; // LATIN CAPITAL LETTER R WITH CARON     case 0xD9:  w = 0x016E;  break; // LATIN CAPITAL LETTER U WITH RING ABOVE     case 0xDB:  w = 0x0170;  break; // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE     case 0xDE:  w = 0x0162;  break; // LATIN CAPITAL LETTER T WITH CEDILLA     case 0xE0:  w = 0x0155;  break; // LATIN SMALL LETTER R WITH ACUTE     case 0xE3:  w = 0x0103;  break; // LATIN SMALL LETTER A WITH BREVE     case 0xE5:  w = 0x013A;  break; // LATIN SMALL LETTER L WITH ACUTE     case 0xE6:  w = 0x0107;  break; // LATIN SMALL LETTER C WITH ACUTE     case 0xE8:  w = 0x010D;  break; // LATIN SMALL LETTER C WITH CARON     case 0xEA:  w = 0x0119;  break; // LATIN SMALL LETTER E WITH OGONEK     case 0xEC:  w = 0x011B;  break; // LATIN SMALL LETTER E WITH CARON     case 0xEF:  w = 0x010F;  break; // LATIN SMALL LETTER D WITH CARON     case 0xF0:  w = 0x0111;  break; // LATIN SMALL LETTER D WITH STROKE     case 0xF1:  w = 0x0144;  break; // LATIN SMALL LETTER N WITH ACUTE     case 0xF2:  w = 0x0148;  break; // LATIN SMALL LETTER N WITH CARON     case 0xF5:  w = 0x0151;  break; // LATIN SMALL LETTER O WITH DOUBLE ACUTE     case 0xF8:  w = 0x0159;  break; // LATIN SMALL LETTER R WITH CARON     case 0xF9:  w = 0x016F;  break; // LATIN SMALL LETTER U WITH RING ABOVE     case 0xFB:  w = 0x0171;  break; // LATIN SMALL LETTER U WITH DOUBLE ACUTE     case 0xFE:  w = 0x0163;  break; // LATIN SMALL LETTER T WITH CEDILLA     case 0xFF:  w = 0x02D9;  break; // DOT ABOVE     default:       w = wchar_t(*in);       break;     }     out += w;     ++in;   }    return true; }   bool DecodeISO8859_3(const char * in, std::basic_string<wchar_t> & out) {   out.erase();   wchar_t w;    while (*in != '\0') {     switch (*in) {     case 0xA1:  w = 0x0126;  break; // LATIN CAPITAL LETTER H WITH STROKE     case 0xA2:  w = 0x02D8;  break; // BREVE     case 0xA6:  w = 0x0124;  break; // LATIN CAPITAL LETTER H WITH CIRCUMFLEX     case 0xA9:  w = 0x0130;  break; // LATIN CAPITAL LETTER I WITH DOT ABOVE     case 0xAA:  w = 0x015E;  break; // LATIN CAPITAL LETTER S WITH CEDILLA     case 0xAB:  w = 0x011E;  break; // LATIN CAPITAL LETTER G WITH BREVE     case 0xAC:  w = 0x0134;  break; // LATIN CAPITAL LETTER J WITH CIRCUMFLEX     case 0xAF:  w = 0x017B;  break; // LATIN CAPITAL LETTER Z WITH DOT ABOVE     case 0xB1:  w = 0x0127;  break; // LATIN SMALL LETTER H WITH STROKE     case 0xB6:  w = 0x0125;  break; // LATIN SMALL LETTER H WITH CIRCUMFLEX     case 0xB9:  w = 0x0131;  break; // LATIN SMALL LETTER DOTLESS I     case 0xBA:  w = 0x015F;  break; // LATIN SMALL LETTER S WITH CEDILLA     case 0xBB:  w = 0x011F;  break; // LATIN SMALL LETTER G WITH BREVE     case 0xBC:  w = 0x0135;  break; // LATIN SMALL LETTER J WITH CIRCUMFLEX     case 0xBF:  w = 0x017C;  break; // LATIN SMALL LETTER Z WITH DOT ABOVE     case 0xC5:  w = 0x010A;  break; // LATIN CAPITAL LETTER C WITH DOT ABOVE     case 0xC6:  w = 0x0108;  break; // LATIN CAPITAL LETTER C WITH CIRCUMFLEX     case 0xD5:  w = 0x0120;  break; // LATIN CAPITAL LETTER G WITH DOT ABOVE     case 0xD8:  w = 0x011C;  break; // LATIN CAPITAL LETTER G WITH CIRCUMFLEX     case 0xDD:  w = 0x016C;  break; // LATIN CAPITAL LETTER U WITH BREVE     case 0xDE:  w = 0x015C;  break; // LATIN CAPITAL LETTER S WITH CIRCUMFLEX     case 0xE5:  w = 0x010B;  break; // LATIN SMALL LETTER C WITH DOT ABOVE     case 0xE6:  w = 0x0109;  break; // LATIN SMALL LETTER C WITH CIRCUMFLEX     case 0xF5:  w = 0x0121;  break; // LATIN SMALL LETTER G WITH DOT ABOVE     case 0xF8:  w = 0x011D;  break; // LATIN SMALL LETTER G WITH CIRCUMFLEX     case 0xFD:  w = 0x016D;  break; // LATIN SMALL LETTER U WITH BREVE     case 0xFE:  w = 0x015D;  break; // LATIN SMALL LETTER S WITH CIRCUMFLEX     case 0xFF:  w = 0x02D9;  break; // DOT ABOVE     default:       w = wchar_t(*in);       break;     }     out += w;     ++in;   }    return true; }   bool DecodeISO8859_4(const char * in, std::basic_string<wchar_t> & out) {   out.erase();   wchar_t w;    while (*in != '\0') {     switch (*in) {     case 0xA1:  w = 0x0104;  break; // LATIN CAPITAL LETTER A WITH OGONEK     case 0xA2:  w = 0x0138;  break; // LATIN SMALL LETTER KRA     case 0xA3:  w = 0x0156;  break; // LATIN CAPITAL LETTER R WITH CEDILLA     case 0xA5:  w = 0x0128;  break; // LATIN CAPITAL LETTER I WITH TILDE     case 0xA6:  w = 0x013B;  break; // LATIN CAPITAL LETTER L WITH CEDILLA     case 0xA9:  w = 0x0160;  break; // LATIN CAPITAL LETTER S WITH CARON     case 0xAA:  w = 0x0112;  break; // LATIN CAPITAL LETTER E WITH MACRON     case 0xAB:  w = 0x0122;  break; // LATIN CAPITAL LETTER G WITH CEDILLA     case 0xAC:  w = 0x0166;  break; // LATIN CAPITAL LETTER T WITH STROKE     case 0xAE:  w = 0x017D;  break; // LATIN CAPITAL LETTER Z WITH CARON     case 0xB1:  w = 0x0105;  break; // LATIN SMALL LETTER A WITH OGONEK     case 0xB2:  w = 0x02DB;  break; // OGONEK     case 0xB3:  w = 0x0157;  break; // LATIN SMALL LETTER R WITH CEDILLA     case 0xB5:  w = 0x0129;  break; // LATIN SMALL LETTER I WITH TILDE     case 0xB6:  w = 0x013C;  break; // LATIN SMALL LETTER L WITH CEDILLA     case 0xB7:  w = 0x02C7;  break; // CARON     case 0xB9:  w = 0x0161;  break; // LATIN SMALL LETTER S WITH CARON     case 0xBA:  w = 0x0113;  break; // LATIN SMALL LETTER E WITH MACRON     case 0xBB:  w = 0x0123;  break; // LATIN SMALL LETTER G WITH CEDILLA     case 0xBC:  w = 0x0167;  break; // LATIN SMALL LETTER T WITH STROKE     case 0xBD:  w = 0x014A;  break; // LATIN CAPITAL LETTER ENG     case 0xBE:  w = 0x017E;  break; // LATIN SMALL LETTER Z WITH CARON     case 0xBF:  w = 0x014B;  break; // LATIN SMALL LETTER ENG     case 0xC0:  w = 0x0100;  break; // LATIN CAPITAL LETTER A WITH MACRON     case 0xC7:  w = 0x012E;  break; // LATIN CAPITAL LETTER I WITH OGONEK     case 0xC8:  w = 0x010C;  break; // LATIN CAPITAL LETTER C WITH CARON     case 0xCA:  w = 0x0118;  break; // LATIN CAPITAL LETTER E WITH OGONEK     case 0xCC:  w = 0x0116;  break; // LATIN CAPITAL LETTER E WITH DOT ABOVE     case 0xCF:  w = 0x012A;  break; // LATIN CAPITAL LETTER I WITH MACRON     case 0xD0:  w = 0x0110;  break; // LATIN CAPITAL LETTER D WITH STROKE     case 0xD1:  w = 0x0145;  break; // LATIN CAPITAL LETTER N WITH CEDILLA     case 0xD2:  w = 0x014C;  break; // LATIN CAPITAL LETTER O WITH MACRON
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -