📄 swiutfconversions.c
字号:
case 0xD3: w = 0x0136; break; // LATIN CAPITAL LETTER K WITH CEDILLA case 0xD9: w = 0x0172; break; // LATIN CAPITAL LETTER U WITH OGONEK case 0xDD: w = 0x0168; break; // LATIN CAPITAL LETTER U WITH TILDE case 0xDE: w = 0x016A; break; // LATIN CAPITAL LETTER U WITH MACRON case 0xE0: w = 0x0101; break; // LATIN SMALL LETTER A WITH MACRON case 0xE7: w = 0x012F; break; // LATIN SMALL LETTER I WITH OGONEK case 0xE8: w = 0x010D; break; // LATIN SMALL LETTER C WITH CARON case 0xEA: w = 0x0119; break; // LATIN SMALL LETTER E WITH OGONEK case 0xEC: w = 0x0117; break; // LATIN SMALL LETTER E WITH DOT ABOVE case 0xEF: w = 0x012B; break; // LATIN SMALL LETTER I WITH MACRON case 0xF0: w = 0x0111; break; // LATIN SMALL LETTER D WITH STROKE case 0xF1: w = 0x0146; break; // LATIN SMALL LETTER N WITH CEDILLA case 0xF2: w = 0x014D; break; // LATIN SMALL LETTER O WITH MACRON case 0xF3: w = 0x0137; break; // LATIN SMALL LETTER K WITH CEDILLA case 0xF9: w = 0x0173; break; // LATIN SMALL LETTER U WITH OGONEK case 0xFD: w = 0x0169; break; // LATIN SMALL LETTER U WITH TILDE case 0xFE: w = 0x016B; break; // LATIN SMALL LETTER U WITH MACRON case 0xFF: w = 0x02D9; break; // DOT ABOVE default: w = wchar_t(*in); break; } out += w; ++in; } return true; } bool DecodeISO8859_15(const char * in, std::basic_string<wchar_t> & out) { out.erase(); wchar_t w; while (*in != '\0') { switch (*in) { case 0xA4: w = 0x20AC; break; // EURO SIGN case 0xA6: w = 0x0160; break; // LATIN CAPITAL LETTER S WITH CARON case 0xA8: w = 0x0161; break; // LATIN SMALL LETTER S WITH CARON case 0xB4: w = 0x017D; break; // LATIN CAPITAL LETTER Z WITH CARON case 0xB8: w = 0x017E; break; // LATIN SMALL LETTER Z WITH CARON case 0xBC: w = 0x0152; break; // LATIN CAPITAL LIGATURE OE case 0xBD: w = 0x0153; break; // LATIN SMALL LIGATURE OE case 0xBE: w = 0x0178; break; // LATIN CAPITAL LETTER Y WITH DIAERESIS default: w = wchar_t(*in); break; } out += w; ++in; } return true; } #endif // 
--------------------------------------------------------------------------- // Jerry Carter writes: // // The UTF-8 encoding / decoding routines were modified from the Apache Xerces // project. The original translated from UTF-8 to UTF-16. In this version, I // have removed support for surrogate characters. This removes the difference // between platforms which treat wchar_t as UTF-16 (Windows) and those which // use UTF-32 (Linux, MacOS, etc.). // // The Apache license appears below (as required). /* * The Apache Software License, Version 1.1 * * Copyright (c) 1999-2000 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Xerces" and "Apache Software Foundation" must * not be used to endorse or promote products derived from this * software without prior written permission. For written * permission, please contact apache\@apache.org. * * 5. Products derived from this software may not be called "Apache", * nor may "Apache" appear in their name, without prior written * permission of the Apache Software Foundation. 
* * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation, and was * originally based on software copyright (c) 1999, International * Business Machines, Inc., http://www.ibm.com . For more information * on the Apache Software Foundation, please see * <http://www.apache.org/>. */ // gUTFBytes // A list of counts of trailing bytes for each initial byte in the input. // // gUTFOffsets // A list of values to offset each result char type, according to how // many source bytes went into making it. // // gFirstByteMark // A list of values to mask onto the first byte of an encoded sequence, // indexed by the number of bytes used to create the sequence. 
static const char gUTFBytes[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; static const unsigned long gUTFOffsets[6] = { 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080 }; static const unsigned char gFirstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; int SWIutf8towcslen(const unsigned char* src) { int len = 0; while (*src != '\0') { unsigned int trailingBytes; // Get the next leading byte out const unsigned char firstByte = (unsigned char) *src; // See how many trailing src bytes this sequence is going to require trailingBytes = gUTFBytes[firstByte]; src += trailingBytes + 1; len++; } return len; } SWIcharResult SWIutf8towcs( const unsigned char *src, wchar_t *dst, int maxdstlen ) { // Get pointers to our start and end points of the input buffer const unsigned char* srcPtr = src; const unsigned char* srcEnd = src + strlen((const char *)src); wchar_t *dstSave = dst; wchar_t *dstEnd = dst+maxdstlen; /* leave room for null */ // We now loop until we run out of input data. 
while (srcPtr < srcEnd) { unsigned int trailingBytes; unsigned long tmpVal = 0; // Get the next leading byte out const unsigned char firstByte = (unsigned char) *srcPtr; // Special-case ASCII, which is a leading byte value of <= 127 if (firstByte <= 127) { *dst++ = (wchar_t) firstByte; srcPtr++; continue; } // See how many trailing src bytes this sequence is going to require trailingBytes = gUTFBytes[firstByte]; // If there are not enough source bytes to do this one, then we // are done. Note that we done >= here because we are implicitly // counting the 1 byte we get no matter what. if (srcPtr + trailingBytes >= srcEnd) return SWIchar_FAIL; // ?? // Looks ok, so lets build up the value switch (trailingBytes) { case 5: tmpVal += *srcPtr++; tmpVal <<= 6; case 4: tmpVal += *srcPtr++; tmpVal <<= 6; case 3: tmpVal += *srcPtr++; tmpVal <<= 6; case 2: tmpVal += *srcPtr++; tmpVal <<= 6; case 1: tmpVal += *srcPtr++; tmpVal <<= 6; case 0: tmpVal += *srcPtr++; break; default: return SWIchar_ERROR; } tmpVal -= gUTFOffsets[trailingBytes]; // If surrogate pairs would be required for 16-bit characters, fail. if (tmpVal & 0xFFFF0000) return SWIchar_FAIL; if ( dst >= dstEnd ) { return SWIchar_BUFFER_OVERFLOW; } *dst++ = (wchar_t)tmpVal; } *dst = L'\0'; // return dst-dstSave; return SWIchar_SUCCESS; // check this (CARO) } int SWIwcstoutf8len(const wchar_t* src) { int len = 0; // Get pointers to our start and end points of the input buffer. const wchar_t* srcPtr = src; const wchar_t* srcEnd = srcPtr + wcslen(src); while (*src != 0) { unsigned int encodedBytes; wchar_t curVal = (*src++) & 0x0000FFFF; // Watchout for surrogates. 
if ((curVal >= 0xD800 && curVal <= 0xDBFF) || curVal == 0xFFFE || curVal == 0xFFFF) return -2; // Figure out how many bytes we need if (curVal < 0x80) encodedBytes = 1; else if (curVal < 0x800) encodedBytes = 2; else if (curVal < 0x10000) encodedBytes = 3; else if (curVal < 0x200000) encodedBytes = 4; else if (curVal < 0x4000000) encodedBytes = 5; else if (curVal <= 0x7FFFFFFF) encodedBytes = 6; else { // THIS SHOULD NOT HAPPEN! return -2; } // And spit out the bytes. We spit them out in reverse order // here, so bump up the output pointer and work down as we go. len += encodedBytes; } return len; } SWIcharResult SWIwcstoutf8(const wchar_t *src, unsigned char *dst, int maxdstlen) { // Get pointers to our start and end points of the input buffer. const wchar_t* srcPtr = src; const wchar_t* srcEnd = srcPtr + wcslen(src); unsigned char *dstSave = dst; unsigned char *dstEnd = dst+maxdstlen; while (srcPtr < srcEnd) { unsigned int encodedBytes; wchar_t curVal = (*srcPtr++) & 0x0000FFFF; // Watchout for surrogates. if ( ((curVal >= 0xD800) && (curVal <= 0xDFFF)) || ((curVal == 0xFFFE) || curVal == 0xFFFF) ) return SWIchar_FAIL; // Figure out how many bytes we need if (curVal < 0x80) encodedBytes = 1; else if (curVal < 0x800) encodedBytes = 2; else if (curVal < 0x10000) encodedBytes = 3; else if (curVal < 0x200000) encodedBytes = 4; else if (curVal < 0x4000000) encodedBytes = 5; else if (curVal <= 0x7FFFFFFF) encodedBytes = 6; else { // THIS SHOULD NOT HAPPEN! return SWIchar_ERROR; } // And spit out the bytes. We spit them out in reverse order // here, so bump up the output pointer and work down as we go. 
dst += encodedBytes; if ( dst > dstEnd ) { return SWIchar_BUFFER_OVERFLOW; } switch(encodedBytes) { case 6 : *--dst = (unsigned char) ((curVal | 0x80) & 0xBF); curVal >>= 6; case 5 : *--dst = (unsigned char)((curVal | 0x80) & 0xBF); curVal >>= 6; case 4 : *--dst = (unsigned char)((curVal | 0x80) & 0xBF); curVal >>= 6; case 3 : *--dst = (unsigned char)((curVal | 0x80) & 0xBF); curVal >>= 6; case 2 : *--dst = (unsigned char)((curVal | 0x80) & 0xBF); curVal >>= 6; case 1 : *--dst = (unsigned char)(curVal | gFirstByteMark[encodedBytes]); } dst += encodedBytes; } *dst = '\0'; // return dst-dstSave return SWIchar_SUCCESS; // check this (CARO) }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -