📄 swiutfconversions.c
字号:
case 0xD3: w = 0x0136; break; // LATIN CAPITAL LETTER K WITH CEDILLA
case 0xD9: w = 0x0172; break; // LATIN CAPITAL LETTER U WITH OGONEK
case 0xDD: w = 0x0168; break; // LATIN CAPITAL LETTER U WITH TILDE
case 0xDE: w = 0x016A; break; // LATIN CAPITAL LETTER U WITH MACRON
case 0xE0: w = 0x0101; break; // LATIN SMALL LETTER A WITH MACRON
case 0xE7: w = 0x012F; break; // LATIN SMALL LETTER I WITH OGONEK
case 0xE8: w = 0x010D; break; // LATIN SMALL LETTER C WITH CARON
case 0xEA: w = 0x0119; break; // LATIN SMALL LETTER E WITH OGONEK
case 0xEC: w = 0x0117; break; // LATIN SMALL LETTER E WITH DOT ABOVE
case 0xEF: w = 0x012B; break; // LATIN SMALL LETTER I WITH MACRON
case 0xF0: w = 0x0111; break; // LATIN SMALL LETTER D WITH STROKE
case 0xF1: w = 0x0146; break; // LATIN SMALL LETTER N WITH CEDILLA
case 0xF2: w = 0x014D; break; // LATIN SMALL LETTER O WITH MACRON
case 0xF3: w = 0x0137; break; // LATIN SMALL LETTER K WITH CEDILLA
case 0xF9: w = 0x0173; break; // LATIN SMALL LETTER U WITH OGONEK
case 0xFD: w = 0x0169; break; // LATIN SMALL LETTER U WITH TILDE
case 0xFE: w = 0x016B; break; // LATIN SMALL LETTER U WITH MACRON
case 0xFF: w = 0x02D9; break; // DOT ABOVE
default:
w = wchar_t(*in);
break;
}
out += w;
++in;
}
return true;
}
// Decodes a NUL-terminated ISO-8859-15 (Latin-9) byte string into wide
// characters.
//
// in  - NUL-terminated input bytes.
// out - receives the decoded text; any previous content is discarded.
//
// Always returns true.
//
// Each byte is examined as an unsigned value.  Plain char may be signed, in
// which case bytes >= 0x80 are negative: they would never match the case
// labels below and would be sign-extended when widened to wchar_t.
bool DecodeISO8859_15(const char * in, std::basic_string<wchar_t> & out)
{
  out.erase();

  for (; *in != '\0'; ++in) {
    const unsigned char byte = static_cast<unsigned char>(*in);
    wchar_t w;

    // Only these eight code points differ from a direct byte-to-wchar_t
    // widening; everything else takes the default branch.
    switch (byte) {
      case 0xA4: w = 0x20AC; break; // EURO SIGN
      case 0xA6: w = 0x0160; break; // LATIN CAPITAL LETTER S WITH CARON
      case 0xA8: w = 0x0161; break; // LATIN SMALL LETTER S WITH CARON
      case 0xB4: w = 0x017D; break; // LATIN CAPITAL LETTER Z WITH CARON
      case 0xB8: w = 0x017E; break; // LATIN SMALL LETTER Z WITH CARON
      case 0xBC: w = 0x0152; break; // LATIN CAPITAL LIGATURE OE
      case 0xBD: w = 0x0153; break; // LATIN SMALL LIGATURE OE
      case 0xBE: w = 0x0178; break; // LATIN CAPITAL LETTER Y WITH DIAERESIS
      default:
        w = static_cast<wchar_t>(byte);
        break;
    }

    out += w;
  }

  return true;
}
#endif
// ---------------------------------------------------------------------------
// Jerry Carter writes:
//
// The UTF-8 encoding / decoding routines were modified from the Apache Xerces
// project. The original translated from UTF-8 to UTF-16. In this version, I
// have removed support for surrogate characters. This removes the difference
// between platforms which treat wchar_t as UTF-16 (Windows) and those which
// use UTF-32 (Linux, MacOS, etc.).
//
// The Apache license appears below (as required).
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999-2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
// gUTFBytes
// A list of counts of trailing bytes for each initial byte in the input.
//
// gUTFOffsets
// A list of values to offset each result char type, according to how
// many source bytes went into making it.
//
// gFirstByteMark
// A list of values to mask onto the first byte of an encoded sequence,
// indexed by the number of bytes used to create the sequence.
// Number of trailing bytes that follow each possible first byte.
static const char gUTFBytes[256] =
{
  /* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20-0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0-0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xD0-0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xE0-0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0-0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

// Amount subtracted from the accumulated value of a sequence, indexed by
// its trailing-byte count.
static const unsigned long gUTFOffsets[6] =
{
  0,
  0x3080,
  0xE2080,
  0x3C82080,
  0xFA082080,
  0x82082080
};

// Bits OR-ed into the first byte of an encoded sequence, indexed by the
// total number of bytes in the sequence.
static const unsigned char gFirstByteMark[7] =
{
  0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};
// Counts the characters encoded in a NUL-terminated UTF-8 byte string.
//
// src - NUL-terminated input bytes.
//
// Returns the number of encoded sequences found.  Each first byte is
// counted as one character and the trailing bytes announced for it by
// gUTFBytes are skipped.  No validation of the trailing bytes is done.
int SWIutf8towcslen(const unsigned char* src)
{
  int len = 0;

  while (*src != '\0') {
    // See how many trailing src bytes this sequence is going to require
    unsigned int trailingBytes = gUTFBytes[*src];
    ++src;

    // Step over the trailing bytes one at a time.  A string that ends in
    // the middle of a sequence must not carry the pointer beyond the
    // terminating NUL, so stop as soon as the terminator is seen.
    while (trailingBytes > 0 && *src != '\0') {
      ++src;
      --trailingBytes;
    }

    len++;
  }

  return len;
}
// Converts a NUL-terminated UTF-8 byte string to wide characters.
//
// src       - NUL-terminated UTF-8 input.
// dst       - output buffer.
// maxdstlen - maximum number of characters to store, not counting the
//             terminating L'\0'.  The terminator is written after the last
//             character without a further bounds check, so dst needs room
//             for maxdstlen + 1 elements.
//
// Returns:
//   SWIchar_SUCCESS          - the whole input was converted and terminated.
//   SWIchar_FAIL             - the input ends inside a multi-byte sequence,
//                              or a decoded value does not fit in 16 bits
//                              (surrogate pairs are not supported).
//   SWIchar_BUFFER_OVERFLOW  - more than maxdstlen characters are needed.
//   SWIchar_ERROR            - defensive; an unexpected trailing-byte count.
//
// On any non-success return dst is left without a terminator.
SWIcharResult SWIutf8towcs( const unsigned char *src, wchar_t *dst, int maxdstlen )
{
  // Get pointers to our start and end points of the input buffer
  const unsigned char* srcPtr = src;
  const unsigned char* srcEnd = src + strlen((const char *)src);
  wchar_t *dstEnd = dst + maxdstlen;

  // We now loop until we run out of input data.
  while (srcPtr < srcEnd) {
    unsigned int trailingBytes;
    unsigned long tmpVal = 0;

    // Get the next leading byte out
    const unsigned char firstByte = (unsigned char) *srcPtr;

    // Special-case ASCII, which is a leading byte value of <= 127
    if (firstByte <= 127) {
      // The fast path must respect the output limit just like the
      // multi-byte path below does.
      if (dst >= dstEnd)
        return SWIchar_BUFFER_OVERFLOW;
      *dst++ = (wchar_t) firstByte;
      srcPtr++;
      continue;
    }

    // See how many trailing src bytes this sequence is going to require
    trailingBytes = gUTFBytes[firstByte];

    // If there are not enough source bytes to do this one, then we
    // are done.  Note that we use >= here because we are implicitly
    // counting the 1 byte we get no matter what.
    if (srcPtr + trailingBytes >= srcEnd)
      return SWIchar_FAIL;

    // Looks ok, so let's build up the value.  Every case deliberately
    // falls through to the next so that exactly trailingBytes + 1 bytes
    // are accumulated, six bits apart.
    switch (trailingBytes) {
      case 5: tmpVal += *srcPtr++; tmpVal <<= 6; // fall through
      case 4: tmpVal += *srcPtr++; tmpVal <<= 6; // fall through
      case 3: tmpVal += *srcPtr++; tmpVal <<= 6; // fall through
      case 2: tmpVal += *srcPtr++; tmpVal <<= 6; // fall through
      case 1: tmpVal += *srcPtr++; tmpVal <<= 6; // fall through
      case 0: tmpVal += *srcPtr++;
        break;
      default:
        return SWIchar_ERROR;
    }

    // Remove the marker bits that were added in along with the payload.
    tmpVal -= gUTFOffsets[trailingBytes];

    // If surrogate pairs would be required for 16-bit characters, fail.
    if (tmpVal & 0xFFFF0000)
      return SWIchar_FAIL;

    if (dst >= dstEnd)
      return SWIchar_BUFFER_OVERFLOW;

    *dst++ = (wchar_t) tmpVal;
  }

  *dst = L'\0';
  return SWIchar_SUCCESS;
}
// Computes how many bytes the UTF-8 encoding of a wide string occupies.
//
// src - wide string terminated by a zero element.
//
// Returns the byte count, not including a terminator, or -2 when the string
// contains a value that cannot be encoded: any surrogate (0xD800-0xDFFF),
// 0xFFFE or 0xFFFF.
//
// Only the low 16 bits of each element are considered.
int SWIwcstoutf8len(const wchar_t* src)
{
  int len = 0;

  while (*src != 0) {
    unsigned int encodedBytes;
    wchar_t curVal = (*src++) & 0x0000FFFF;

    // Watch out for surrogates.  Both the high and the low half are
    // rejected, matching what SWIwcstoutf8 refuses to encode.
    if ((curVal >= 0xD800 && curVal <= 0xDFFF) || curVal == 0xFFFE ||
        curVal == 0xFFFF)
      return -2;

    // Figure out how many bytes we need
    if (curVal < 0x80) encodedBytes = 1;
    else if (curVal < 0x800) encodedBytes = 2;
    else if (curVal < 0x10000) encodedBytes = 3;
    else if (curVal < 0x200000) encodedBytes = 4;
    else if (curVal < 0x4000000) encodedBytes = 5;
    else if (curVal <= 0x7FFFFFFF) encodedBytes = 6;
    else {
      // THIS SHOULD NOT HAPPEN!
      return -2;
    }

    // Nothing is written here; only the running total is updated.
    len += encodedBytes;
  }

  return len;
}
// Encodes a zero-terminated wide string as UTF-8.
//
// src       - wide string to encode; only the low 16 bits of each element
//             are used.
// dst       - output buffer.
// maxdstlen - maximum number of encoded bytes to store, not counting the
//             terminating '\0', which is written after the last byte
//             without a further bounds check.
//
// Returns SWIchar_SUCCESS when everything was encoded, SWIchar_FAIL when a
// surrogate (0xD800-0xDFFF), 0xFFFE or 0xFFFF is encountered,
// SWIchar_BUFFER_OVERFLOW when the next sequence would exceed maxdstlen
// bytes, and SWIchar_ERROR for a value outside every size class.  On a
// non-success return no terminator is written.
SWIcharResult SWIwcstoutf8(const wchar_t *src, unsigned char *dst, int maxdstlen)
{
  const wchar_t *cursor = src;
  const wchar_t *const inputEnd = src + wcslen(src);
  unsigned char *const outputEnd = dst + maxdstlen;

  for (; cursor < inputEnd; ++cursor) {
    unsigned int byteCount;
    unsigned int remaining;
    unsigned char *out;
    wchar_t code = (*cursor) & 0x0000FFFF;

    // Surrogates and the two non-characters cannot be represented.
    if ((code >= 0xD800) && (code <= 0xDFFF))
      return SWIchar_FAIL;
    if ((code == 0xFFFE) || (code == 0xFFFF))
      return SWIchar_FAIL;

    // Pick the sequence length from the magnitude of the value.
    if (code < 0x80)
      byteCount = 1;
    else if (code < 0x800)
      byteCount = 2;
    else if (code < 0x10000)
      byteCount = 3;
    else if (code < 0x200000)
      byteCount = 4;
    else if (code < 0x4000000)
      byteCount = 5;
    else if (code <= 0x7FFFFFFF)
      byteCount = 6;
    else
      return SWIchar_ERROR;  // should not happen

    // Advance to the end of the sequence first; the bytes are then filled
    // in back to front.
    dst += byteCount;
    if (dst > outputEnd)
      return SWIchar_BUFFER_OVERFLOW;

    out = dst;

    // Every byte except the first carries six payload bits under a 10xxxxxx
    // marker.
    for (remaining = byteCount; remaining > 1; --remaining) {
      *--out = (unsigned char)((code | 0x80) & 0xBF);
      code >>= 6;
    }

    // The first byte gets whatever bits are left plus the length marker.
    *--out = (unsigned char)(code | gFirstByteMark[byteCount]);
  }

  *dst = '\0';
  return SWIchar_SUCCESS;
}
#endif
#ifdef _MSC_VER
#pragma warning( disable:4206)
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -