/* File: swiutfconversions.c */
/* SWIutfconversions, Unicode conversions */
/****************License************************************************
*
* Copyright 2000-2003. ScanSoft, Inc.
*
* Use of this software is subject to notices and obligations set forth
* in the SpeechWorks Public License - Software Version 1.2 which is
* included with this software.
*
* ScanSoft is a registered trademark of ScanSoft, Inc., and OpenSpeech,
* SpeechWorks and the SpeechWorks logo are registered trademarks or
* trademarks of SpeechWorks International, Inc. in the United States
* and other countries.
*
***********************************************************************/
/* -----1=0-------2=0-------3=0-------4=0-------5=0-------6=0-------7=0-------8
*/
#include <vxibuildopts.h>
#if P_VXI
#include <stdio.h>
#include <string.h>
#include <vxi/SWIutfconversions.h>
#if 0
#include "Encoding.h"
#include <cstring>
#include <cwchar>
#include <vector>
#include <algorithm>
// Set to true by InitializeDecoder() once the decoder registry has been
// populated and sorted. DecodeString() returns -1 while this is still false.
bool initialized = false;
// ---------------------------------------------------------------------------
// Define a registry of decoder functions.
// ---------------------------------------------------------------------------
// Signature shared by every decoder in the registry: the first argument is
// the byte string to decode, the second receives the wide-character result.
// The bool result reports whether decoding succeeded.
typedef bool (*DECODERFUNCTION)(const char * input,
                                std::basic_string<wchar_t> & output);

// One registry record: an encoding name paired with the decoder registered
// under that name. Only the name pointer is stored; the characters are not
// copied.
class EncoderEntry {
public:
  const char * name;          // encoding name (pointer stored as given)
  DECODERFUNCTION function;   // decoder to invoke for that encoding

  // Builds an entry from a name and its decoder.
  EncoderEntry(const char * encodingName, DECODERFUNCTION decoder)
    : name(encodingName), function(decoder)
  {
  }

  // Member-wise copy.
  EncoderEntry(const EncoderEntry & other)
    : name(other.name), function(other.function)
  {
  }

  // Member-wise assignment; assigning an entry to itself changes nothing.
  EncoderEntry & operator=(const EncoderEntry & other)
  {
    if (this == &other)
      return *this;
    name = other.name;
    function = other.function;
    return *this;
  }
};
// Orders registry entries by encoding name using strcmp, so the registry can
// be sorted and then binary-searched by name.
bool operator<(const EncoderEntry & lhs, const EncoderEntry & rhs)
{
  const int order = strcmp(lhs.name, rhs.name);
  return order < 0;
}
// ---------------------------------------------------------------------------
// Registry of known encodings. InitializeDecoder() fills it and sorts it by
// name; DecodeString() searches it with std::lower_bound.
typedef std::vector<EncoderEntry> DECODER_REGISTRY;
DECODER_REGISTRY decoderRegistry;
void InitializeDecoder()
{
if(initialized)
return;
decoderRegistry.push_back(EncoderEntry("utf-8", DecodeUTF8));
decoderRegistry.push_back(EncoderEntry("utf8", DecodeUTF8));
decoderRegistry.push_back(EncoderEntry("us-ascii", DecodeASCII));
decoderRegistry.push_back(EncoderEntry("us_ascii", DecodeASCII));
decoderRegistry.push_back(EncoderEntry("usascii", DecodeASCII));
decoderRegistry.push_back(EncoderEntry("ascii", DecodeASCII));
decoderRegistry.push_back(EncoderEntry("iso8859-1", DecodeISO8859_1));
decoderRegistry.push_back(EncoderEntry("iso-8859-1", DecodeISO8859_1));
decoderRegistry.push_back(EncoderEntry("iso_8859-1", DecodeISO8859_1));
decoderRegistry.push_back(EncoderEntry("latin1", DecodeISO8859_1));
decoderRegistry.push_back(EncoderEntry("latin-1", DecodeISO8859_1));
decoderRegistry.push_back(EncoderEntry("latin_1", DecodeISO8859_1));
decoderRegistry.push_back(EncoderEntry("ibm-819", DecodeISO8859_1));
decoderRegistry.push_back(EncoderEntry("ibm819", DecodeISO8859_1));
decoderRegistry.push_back(EncoderEntry("iso8859-2", DecodeISO8859_2));
decoderRegistry.push_back(EncoderEntry("iso-8859-2", DecodeISO8859_2));
decoderRegistry.push_back(EncoderEntry("iso_8859-2", DecodeISO8859_2));
decoderRegistry.push_back(EncoderEntry("latin2", DecodeISO8859_2));
decoderRegistry.push_back(EncoderEntry("latin-2", DecodeISO8859_2));
decoderRegistry.push_back(EncoderEntry("latin_2", DecodeISO8859_2));
decoderRegistry.push_back(EncoderEntry("iso8859-3", DecodeISO8859_3));
decoderRegistry.push_back(EncoderEntry("iso-8859-3", DecodeISO8859_3));
decoderRegistry.push_back(EncoderEntry("iso_8859-3", DecodeISO8859_3));
decoderRegistry.push_back(EncoderEntry("latin3", DecodeISO8859_3));
decoderRegistry.push_back(EncoderEntry("latin-3", DecodeISO8859_3));
decoderRegistry.push_back(EncoderEntry("latin_3", DecodeISO8859_3));
decoderRegistry.push_back(EncoderEntry("iso8859-4", DecodeISO8859_4));
decoderRegistry.push_back(EncoderEntry("iso-8859-4", DecodeISO8859_4));
decoderRegistry.push_back(EncoderEntry("iso_8859-4", DecodeISO8859_4));
decoderRegistry.push_back(EncoderEntry("latin4", DecodeISO8859_4));
decoderRegistry.push_back(EncoderEntry("latin-4", DecodeISO8859_4));
decoderRegistry.push_back(EncoderEntry("latin_4", DecodeISO8859_4));
decoderRegistry.push_back(EncoderEntry("iso8859-15", DecodeISO8859_15));
decoderRegistry.push_back(EncoderEntry("iso-8859-15", DecodeISO8859_15));
decoderRegistry.push_back(EncoderEntry("iso_8859-15", DecodeISO8859_15));
decoderRegistry.push_back(EncoderEntry("latin9", DecodeISO8859_15));
decoderRegistry.push_back(EncoderEntry("latin-9", DecodeISO8859_15));
decoderRegistry.push_back(EncoderEntry("latin_9", DecodeISO8859_15));
std::sort(decoderRegistry.begin(), decoderRegistry.end());
initialized = true;
}
// Builds the decoder registry if that has not happened yet.
// Always returns true, so the result can be used to initialize a bool.
static bool DoInitialization()
{
  if (!initialized)
    InitializeDecoder();
  return true;
}
// Initialized from DoInitialization(), which runs InitializeDecoder() when the
// registry has not been set up yet; the stored value is always true.
bool do_initialization = DoInitialization();
int DecodeString(const char * encodingName,
const char * inputString,
std::basic_string<wchar_t> & outputString)
{
if (!initialized || encodingName == NULL || inputString == NULL)
return -1;
// (1) Convert string to lowercase.
std::basic_string<char> encoding(encodingName);
for (unsigned int i = 0; i < encoding.length(); ++i)
if (encoding[i] < 0x5B && encoding[i] > 0x40)
encoding[i] += 0x20;
DECODER_REGISTRY::iterator j
= std::lower_bound(decoderRegistry.begin(), decoderRegistry.end(),
EncoderEntry(encoding.c_str(), NULL));
if (j == decoderRegistry.end() || encoding != (*j).name) return -1;
if ((*j).function(inputString, outputString)) return 0;
return 1;
}
// ---------------------------------------------------------------------------
// Now we define the 'simple' decoder functions
// ---------------------------------------------------------------------------
// Decodes a NUL-terminated US-ASCII string into `out`.
//
// `out` is cleared first. Each byte in 0x00-0x7F is appended as the wide
// character with the same value. Returns false at the first byte outside
// that range; `out` then holds the characters decoded before it. Returns
// true when the terminator is reached.
bool DecodeASCII(const char * in, std::basic_string<wchar_t> & out)
{
  out.erase();
  for (const char * cursor = in; *cursor != '\0'; ++cursor) {
    // Inspect the byte as unsigned so the range check does not depend on
    // whether plain char is signed.
    const unsigned char byte = static_cast<unsigned char>(*cursor);
    if (byte > 0x7f)
      return false;
    out += wchar_t(byte);
  }
  return true;
}
// Decodes a NUL-terminated ISO-8859-1 (Latin-1) string into `out`.
//
// `out` is cleared first. Every byte maps to the Unicode code point with the
// same numeric value (0x00-0xFF), so decoding cannot fail and the function
// always returns true.
bool DecodeISO8859_1(const char * in, std::basic_string<wchar_t> & out)
{
  out.erase();
  for (; *in != '\0'; ++in) {
    // Convert via unsigned char: where plain char is signed, bytes 0x80-0xFF
    // are negative and a direct conversion to wchar_t would sign-extend them
    // instead of producing U+0080..U+00FF.
    out += static_cast<wchar_t>(static_cast<unsigned char>(*in));
  }
  return true;
}
// Decodes a NUL-terminated ISO-8859-2 (Latin-2) string into `out`.
//
// `out` is cleared first. Bytes listed in the table below are mapped to
// their Unicode code points; every other byte is passed through as the code
// point with the same numeric value. Always returns true.
bool DecodeISO8859_2(const char * in, std::basic_string<wchar_t> & out)
{
  out.erase();
  for (; *in != '\0'; ++in) {
    // Switch on the unsigned byte value. Where plain char is signed, bytes
    // 0x80-0xFF are negative, so switching on *in directly would never match
    // the 0xA1..0xFF case labels and the default branch would sign-extend.
    const unsigned char byte = static_cast<unsigned char>(*in);
    wchar_t w;
    switch (byte) {
    case 0xA1: w = 0x0104; break; // LATIN CAPITAL LETTER A WITH OGONEK
    case 0xA2: w = 0x02D8; break; // BREVE
    case 0xA3: w = 0x0141; break; // LATIN CAPITAL LETTER L WITH STROKE
    case 0xA5: w = 0x013D; break; // LATIN CAPITAL LETTER L WITH CARON
    case 0xA6: w = 0x015A; break; // LATIN CAPITAL LETTER S WITH ACUTE
    case 0xA9: w = 0x0160; break; // LATIN CAPITAL LETTER S WITH CARON
    case 0xAA: w = 0x015E; break; // LATIN CAPITAL LETTER S WITH CEDILLA
    case 0xAB: w = 0x0164; break; // LATIN CAPITAL LETTER T WITH CARON
    case 0xAC: w = 0x0179; break; // LATIN CAPITAL LETTER Z WITH ACUTE
    case 0xAE: w = 0x017D; break; // LATIN CAPITAL LETTER Z WITH CARON
    case 0xAF: w = 0x017B; break; // LATIN CAPITAL LETTER Z WITH DOT ABOVE
    case 0xB1: w = 0x0105; break; // LATIN SMALL LETTER A WITH OGONEK
    case 0xB2: w = 0x02DB; break; // OGONEK
    case 0xB3: w = 0x0142; break; // LATIN SMALL LETTER L WITH STROKE
    case 0xB5: w = 0x013E; break; // LATIN SMALL LETTER L WITH CARON
    case 0xB6: w = 0x015B; break; // LATIN SMALL LETTER S WITH ACUTE
    case 0xB7: w = 0x02C7; break; // CARON
    case 0xB9: w = 0x0161; break; // LATIN SMALL LETTER S WITH CARON
    case 0xBA: w = 0x015F; break; // LATIN SMALL LETTER S WITH CEDILLA
    case 0xBB: w = 0x0165; break; // LATIN SMALL LETTER T WITH CARON
    case 0xBC: w = 0x017A; break; // LATIN SMALL LETTER Z WITH ACUTE
    case 0xBD: w = 0x02DD; break; // DOUBLE ACUTE ACCENT
    case 0xBE: w = 0x017E; break; // LATIN SMALL LETTER Z WITH CARON
    case 0xBF: w = 0x017C; break; // LATIN SMALL LETTER Z WITH DOT ABOVE
    case 0xC0: w = 0x0154; break; // LATIN CAPITAL LETTER R WITH ACUTE
    case 0xC3: w = 0x0102; break; // LATIN CAPITAL LETTER A WITH BREVE
    case 0xC5: w = 0x0139; break; // LATIN CAPITAL LETTER L WITH ACUTE
    case 0xC6: w = 0x0106; break; // LATIN CAPITAL LETTER C WITH ACUTE
    case 0xC8: w = 0x010C; break; // LATIN CAPITAL LETTER C WITH CARON
    case 0xCA: w = 0x0118; break; // LATIN CAPITAL LETTER E WITH OGONEK
    case 0xCC: w = 0x011A; break; // LATIN CAPITAL LETTER E WITH CARON
    case 0xCF: w = 0x010E; break; // LATIN CAPITAL LETTER D WITH CARON
    case 0xD0: w = 0x0110; break; // LATIN CAPITAL LETTER D WITH STROKE
    case 0xD1: w = 0x0143; break; // LATIN CAPITAL LETTER N WITH ACUTE
    case 0xD2: w = 0x0147; break; // LATIN CAPITAL LETTER N WITH CARON
    case 0xD5: w = 0x0150; break; // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
    case 0xD8: w = 0x0158; break; // LATIN CAPITAL LETTER R WITH CARON
    case 0xD9: w = 0x016E; break; // LATIN CAPITAL LETTER U WITH RING ABOVE
    case 0xDB: w = 0x0170; break; // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
    case 0xDE: w = 0x0162; break; // LATIN CAPITAL LETTER T WITH CEDILLA
    case 0xE0: w = 0x0155; break; // LATIN SMALL LETTER R WITH ACUTE
    case 0xE3: w = 0x0103; break; // LATIN SMALL LETTER A WITH BREVE
    case 0xE5: w = 0x013A; break; // LATIN SMALL LETTER L WITH ACUTE
    case 0xE6: w = 0x0107; break; // LATIN SMALL LETTER C WITH ACUTE
    case 0xE8: w = 0x010D; break; // LATIN SMALL LETTER C WITH CARON
    case 0xEA: w = 0x0119; break; // LATIN SMALL LETTER E WITH OGONEK
    case 0xEC: w = 0x011B; break; // LATIN SMALL LETTER E WITH CARON
    case 0xEF: w = 0x010F; break; // LATIN SMALL LETTER D WITH CARON
    case 0xF0: w = 0x0111; break; // LATIN SMALL LETTER D WITH STROKE
    case 0xF1: w = 0x0144; break; // LATIN SMALL LETTER N WITH ACUTE
    case 0xF2: w = 0x0148; break; // LATIN SMALL LETTER N WITH CARON
    case 0xF5: w = 0x0151; break; // LATIN SMALL LETTER O WITH DOUBLE ACUTE
    case 0xF8: w = 0x0159; break; // LATIN SMALL LETTER R WITH CARON
    case 0xF9: w = 0x016F; break; // LATIN SMALL LETTER U WITH RING ABOVE
    case 0xFB: w = 0x0171; break; // LATIN SMALL LETTER U WITH DOUBLE ACUTE
    case 0xFE: w = 0x0163; break; // LATIN SMALL LETTER T WITH CEDILLA
    case 0xFF: w = 0x02D9; break; // DOT ABOVE
    default:
      // Same code point as the byte value.
      w = static_cast<wchar_t>(byte);
      break;
    }
    out += w;
  }
  return true;
}
// Decodes a NUL-terminated ISO-8859-3 (Latin-3) string into `out`.
//
// `out` is cleared first. Bytes listed in the table below are mapped to
// their Unicode code points; every other byte is passed through as the code
// point with the same numeric value. Always returns true.
// NOTE(review): bytes that ISO-8859-3 leaves unassigned (e.g. 0xA5, 0xAE)
// also take the pass-through path rather than being rejected; confirm that
// this lenient behavior is intended.
bool DecodeISO8859_3(const char * in, std::basic_string<wchar_t> & out)
{
  out.erase();
  for (; *in != '\0'; ++in) {
    // Switch on the unsigned byte value. Where plain char is signed, bytes
    // 0x80-0xFF are negative, so switching on *in directly would never match
    // the 0xA1..0xFF case labels and the default branch would sign-extend.
    const unsigned char byte = static_cast<unsigned char>(*in);
    wchar_t w;
    switch (byte) {
    case 0xA1: w = 0x0126; break; // LATIN CAPITAL LETTER H WITH STROKE
    case 0xA2: w = 0x02D8; break; // BREVE
    case 0xA6: w = 0x0124; break; // LATIN CAPITAL LETTER H WITH CIRCUMFLEX
    case 0xA9: w = 0x0130; break; // LATIN CAPITAL LETTER I WITH DOT ABOVE
    case 0xAA: w = 0x015E; break; // LATIN CAPITAL LETTER S WITH CEDILLA
    case 0xAB: w = 0x011E; break; // LATIN CAPITAL LETTER G WITH BREVE
    case 0xAC: w = 0x0134; break; // LATIN CAPITAL LETTER J WITH CIRCUMFLEX
    case 0xAF: w = 0x017B; break; // LATIN CAPITAL LETTER Z WITH DOT ABOVE
    case 0xB1: w = 0x0127; break; // LATIN SMALL LETTER H WITH STROKE
    case 0xB6: w = 0x0125; break; // LATIN SMALL LETTER H WITH CIRCUMFLEX
    case 0xB9: w = 0x0131; break; // LATIN SMALL LETTER DOTLESS I
    case 0xBA: w = 0x015F; break; // LATIN SMALL LETTER S WITH CEDILLA
    case 0xBB: w = 0x011F; break; // LATIN SMALL LETTER G WITH BREVE
    case 0xBC: w = 0x0135; break; // LATIN SMALL LETTER J WITH CIRCUMFLEX
    case 0xBF: w = 0x017C; break; // LATIN SMALL LETTER Z WITH DOT ABOVE
    case 0xC5: w = 0x010A; break; // LATIN CAPITAL LETTER C WITH DOT ABOVE
    case 0xC6: w = 0x0108; break; // LATIN CAPITAL LETTER C WITH CIRCUMFLEX
    case 0xD5: w = 0x0120; break; // LATIN CAPITAL LETTER G WITH DOT ABOVE
    case 0xD8: w = 0x011C; break; // LATIN CAPITAL LETTER G WITH CIRCUMFLEX
    case 0xDD: w = 0x016C; break; // LATIN CAPITAL LETTER U WITH BREVE
    case 0xDE: w = 0x015C; break; // LATIN CAPITAL LETTER S WITH CIRCUMFLEX
    case 0xE5: w = 0x010B; break; // LATIN SMALL LETTER C WITH DOT ABOVE
    case 0xE6: w = 0x0109; break; // LATIN SMALL LETTER C WITH CIRCUMFLEX
    case 0xF5: w = 0x0121; break; // LATIN SMALL LETTER G WITH DOT ABOVE
    case 0xF8: w = 0x011D; break; // LATIN SMALL LETTER G WITH CIRCUMFLEX
    case 0xFD: w = 0x016D; break; // LATIN SMALL LETTER U WITH BREVE
    case 0xFE: w = 0x015D; break; // LATIN SMALL LETTER S WITH CIRCUMFLEX
    case 0xFF: w = 0x02D9; break; // DOT ABOVE
    default:
      // Same code point as the byte value.
      w = static_cast<wchar_t>(byte);
      break;
    }
    out += w;
  }
  return true;
}
bool DecodeISO8859_4(const char * in, std::basic_string<wchar_t> & out)
{
out.erase();
wchar_t w;
while (*in != '\0') {
switch (*in) {
case 0xA1: w = 0x0104; break; // LATIN CAPITAL LETTER A WITH OGONEK
case 0xA2: w = 0x0138; break; // LATIN SMALL LETTER KRA
case 0xA3: w = 0x0156; break; // LATIN CAPITAL LETTER R WITH CEDILLA
case 0xA5: w = 0x0128; break; // LATIN CAPITAL LETTER I WITH TILDE
case 0xA6: w = 0x013B; break; // LATIN CAPITAL LETTER L WITH CEDILLA
case 0xA9: w = 0x0160; break; // LATIN CAPITAL LETTER S WITH CARON
case 0xAA: w = 0x0112; break; // LATIN CAPITAL LETTER E WITH MACRON
case 0xAB: w = 0x0122; break; // LATIN CAPITAL LETTER G WITH CEDILLA
case 0xAC: w = 0x0166; break; // LATIN CAPITAL LETTER T WITH STROKE
case 0xAE: w = 0x017D; break; // LATIN CAPITAL LETTER Z WITH CARON
case 0xB1: w = 0x0105; break; // LATIN SMALL LETTER A WITH OGONEK
case 0xB2: w = 0x02DB; break; // OGONEK
case 0xB3: w = 0x0157; break; // LATIN SMALL LETTER R WITH CEDILLA
case 0xB5: w = 0x0129; break; // LATIN SMALL LETTER I WITH TILDE
case 0xB6: w = 0x013C; break; // LATIN SMALL LETTER L WITH CEDILLA
case 0xB7: w = 0x02C7; break; // CARON
case 0xB9: w = 0x0161; break; // LATIN SMALL LETTER S WITH CARON
case 0xBA: w = 0x0113; break; // LATIN SMALL LETTER E WITH MACRON
case 0xBB: w = 0x0123; break; // LATIN SMALL LETTER G WITH CEDILLA
case 0xBC: w = 0x0167; break; // LATIN SMALL LETTER T WITH STROKE
case 0xBD: w = 0x014A; break; // LATIN CAPITAL LETTER ENG
case 0xBE: w = 0x017E; break; // LATIN SMALL LETTER Z WITH CARON
case 0xBF: w = 0x014B; break; // LATIN SMALL LETTER ENG
case 0xC0: w = 0x0100; break; // LATIN CAPITAL LETTER A WITH MACRON
case 0xC7: w = 0x012E; break; // LATIN CAPITAL LETTER I WITH OGONEK
case 0xC8: w = 0x010C; break; // LATIN CAPITAL LETTER C WITH CARON
case 0xCA: w = 0x0118; break; // LATIN CAPITAL LETTER E WITH OGONEK
case 0xCC: w = 0x0116; break; // LATIN CAPITAL LETTER E WITH DOT ABOVE
case 0xCF: w = 0x012A; break; // LATIN CAPITAL LETTER I WITH MACRON
case 0xD0: w = 0x0110; break; // LATIN CAPITAL LETTER D WITH STROKE
case 0xD1: w = 0x0145; break; // LATIN CAPITAL LETTER N WITH CEDILLA
case 0xD2: w = 0x014C; break; // LATIN CAPITAL LETTER O WITH MACRON
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -