📄 swiutfconversions.c

📁 OSB-PIK-OpenVXI-3.0.0源代码 “中国XML论坛 - 专业的XML技术讨论区--XML在语音技术中的应用”
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
     case 0xD3:  w = 0x0136;  break; // LATIN CAPITAL LETTER K WITH CEDILLA     case 0xD9:  w = 0x0172;  break; // LATIN CAPITAL LETTER U WITH OGONEK     case 0xDD:  w = 0x0168;  break; // LATIN CAPITAL LETTER U WITH TILDE     case 0xDE:  w = 0x016A;  break; // LATIN CAPITAL LETTER U WITH MACRON     case 0xE0:  w = 0x0101;  break; // LATIN SMALL LETTER A WITH MACRON     case 0xE7:  w = 0x012F;  break; // LATIN SMALL LETTER I WITH OGONEK     case 0xE8:  w = 0x010D;  break; // LATIN SMALL LETTER C WITH CARON     case 0xEA:  w = 0x0119;  break; // LATIN SMALL LETTER E WITH OGONEK     case 0xEC:  w = 0x0117;  break; // LATIN SMALL LETTER E WITH DOT ABOVE     case 0xEF:  w = 0x012B;  break; // LATIN SMALL LETTER I WITH MACRON     case 0xF0:  w = 0x0111;  break; // LATIN SMALL LETTER D WITH STROKE     case 0xF1:  w = 0x0146;  break; // LATIN SMALL LETTER N WITH CEDILLA     case 0xF2:  w = 0x014D;  break; // LATIN SMALL LETTER O WITH MACRON     case 0xF3:  w = 0x0137;  break; // LATIN SMALL LETTER K WITH CEDILLA     case 0xF9:  w = 0x0173;  break; // LATIN SMALL LETTER U WITH OGONEK     case 0xFD:  w = 0x0169;  break; // LATIN SMALL LETTER U WITH TILDE     case 0xFE:  w = 0x016B;  break; // LATIN SMALL LETTER U WITH MACRON     case 0xFF:  w = 0x02D9;  break; // DOT ABOVE     default:       w = wchar_t(*in);       break;     }     out += w;     ++in;   }    return true; }   bool DecodeISO8859_15(const char * in, std::basic_string<wchar_t> & out) {   out.erase();    wchar_t w;    while (*in != '\0') {     switch (*in) {     case 0xA4:  w = 0x20AC;  break; // EURO SIGN     case 0xA6:  w = 0x0160;  break; // LATIN CAPITAL LETTER S WITH CARON     case 0xA8:  w = 0x0161;  break; // LATIN SMALL LETTER S WITH CARON     case 0xB4:  w = 0x017D;  break; // LATIN CAPITAL LETTER Z WITH CARON     case 0xB8:  w = 0x017E;  break; // LATIN SMALL LETTER Z WITH CARON     case 0xBC:  w = 0x0152;  break; // LATIN CAPITAL LIGATURE OE     case 0xBD:  w = 0x0153;  break; // LATIN SMALL LIGATURE 
OE     case 0xBE:  w = 0x0178;  break; // LATIN CAPITAL LETTER Y WITH DIAERESIS     default:       w = wchar_t(*in);       break;     }     out += w;     ++in;   }    return true; } #endif  // --------------------------------------------------------------------------- // Jerry Carter writes: // // The UTF-8 encoding / decoding routines were modified from the Apache Xerces // project.  The original translated from UTF-8 to UTF-16.  In this version, I // have removed support for surrogate characters.  This removes the difference // between platforms which treat wchar_t as UTF-16 (Windows) and those which // use UTF-32 (Linux, MacOS, etc.). // // The Apache license appears below (as required).  /*  * The Apache Software License, Version 1.1  *   * Copyright (c) 1999-2000 The Apache Software Foundation.  All rights  * reserved.  *   * Redistribution and use in source and binary forms, with or without  * modification, are permitted provided that the following conditions  * are met:  *   * 1. Redistributions of source code must retain the above copyright  *    notice, this list of conditions and the following disclaimer.   *   * 2. Redistributions in binary form must reproduce the above copyright  *    notice, this list of conditions and the following disclaimer in  *    the documentation and/or other materials provided with the  *    distribution.  *   * 3. The end-user documentation included with the redistribution,  *    if any, must include the following acknowledgment:    *       "This product includes software developed by the  *        Apache Software Foundation (http://www.apache.org/)."  *    Alternately, this acknowledgment may appear in the software itself,  *    if and wherever such third-party acknowledgments normally appear.  *   * 4. The names "Xerces" and "Apache Software Foundation" must  *    not be used to endorse or promote products derived from this  *    software without prior written permission. 
 *    For written permission, please contact apache\@apache.org.
 * 
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 * 
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 * 
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation, and was
 * originally based on software copyright (c) 1999, International
 * Business Machines, Inc., http://www.ibm.com .  For more information
 * on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

//  gUTFBytes
//      A list of counts of trailing bytes for each initial byte in the input.
//
//  gUTFOffsets
//      A list of values to offset each result char type, according to how
//      many source bytes went into making it.
//
//  gFirstByteMark
//      A list of values to mask onto the first byte of an encoded sequence,
//      indexed by the number of bytes used to create the sequence.
static const char gUTFBytes[256] = {         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0     ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1     ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1     ,   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2     ,   3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };  static const unsigned long gUTFOffsets[6] = {  0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080 };  static const unsigned char gFirstByteMark[7] = {  0x00, 0x00, 0xC0, 0xE0,    0xF0, 0xF8, 0xFC };   int SWIutf8towcslen(const unsigned char* src) {   int len = 0;    while (*src != '\0') {     unsigned int trailingBytes;      // Get the next leading byte out     const unsigned char firstByte = (unsigned char) *src;      // See how many trailing src bytes this sequence is going to require     trailingBytes = gUTFBytes[firstByte];      src += trailingBytes + 1;     len++;   }   return len; }   SWIcharResult SWIutf8towcs( const unsigned char *src, wchar_t *dst, int maxdstlen ) {   // Get pointers to our start and end points of the input buffer   const unsigned char* srcPtr = src;   const unsigned char* srcEnd = src + strlen((const char *)src);   wchar_t *dstSave = dst;   wchar_t *dstEnd = dst+maxdstlen;  /* leave room for null */    //  We now loop until we run out of input data.   
while (srcPtr < srcEnd) {     unsigned int trailingBytes;     unsigned long tmpVal = 0;      // Get the next leading byte out     const unsigned char firstByte = (unsigned char) *srcPtr;      // Special-case ASCII, which is a leading byte value of <= 127     if (firstByte <= 127) {       *dst++ = (wchar_t) firstByte;       srcPtr++;       continue;     }      // See how many trailing src bytes this sequence is going to require     trailingBytes = gUTFBytes[firstByte];          //  If there are not enough source bytes to do this one, then we     //  are done. Note that we done >= here because we are implicitly     //  counting the 1 byte we get no matter what.     if (srcPtr + trailingBytes >= srcEnd)       return SWIchar_FAIL;  // ??      // Looks ok, so lets build up the value     switch (trailingBytes) {     case 5: tmpVal += *srcPtr++; tmpVal <<= 6;     case 4: tmpVal += *srcPtr++; tmpVal <<= 6;     case 3: tmpVal += *srcPtr++; tmpVal <<= 6;     case 2: tmpVal += *srcPtr++; tmpVal <<= 6;     case 1: tmpVal += *srcPtr++; tmpVal <<= 6;     case 0: tmpVal += *srcPtr++;       break;     default:       return SWIchar_ERROR;     }     tmpVal -= gUTFOffsets[trailingBytes];      // If surrogate pairs would be required for 16-bit characters, fail.     if (tmpVal & 0xFFFF0000)       return SWIchar_FAIL;      if ( dst >= dstEnd ) {       return SWIchar_BUFFER_OVERFLOW;     }      *dst++ = (wchar_t)tmpVal;   }    *dst = L'\0';     // return dst-dstSave;    return SWIchar_SUCCESS;   // check this (CARO) }  int SWIwcstoutf8len(const wchar_t* src) {   int len = 0;    //  Get pointers to our start and end points of the input buffer.   const wchar_t* srcPtr = src;   const wchar_t* srcEnd = srcPtr + wcslen(src);    while (*src != 0) {     unsigned int encodedBytes;     wchar_t curVal = (*src++) & 0x0000FFFF;      // Watchout for surrogates.     
if ((curVal >= 0xD800 && curVal <= 0xDBFF) || curVal == 0xFFFE || 	curVal == 0xFFFF)       return -2;      // Figure out how many bytes we need     if (curVal < 0x80)                encodedBytes = 1;     else if (curVal < 0x800)          encodedBytes = 2;     else if (curVal < 0x10000)        encodedBytes = 3;     else if (curVal < 0x200000)       encodedBytes = 4;     else if (curVal < 0x4000000)      encodedBytes = 5;     else if (curVal <= 0x7FFFFFFF)    encodedBytes = 6;     else {       // THIS SHOULD NOT HAPPEN!       return -2;     }      //  And spit out the bytes. We spit them out in reverse order     //  here, so bump up the output pointer and work down as we go.     len += encodedBytes;   }    return len; }   SWIcharResult SWIwcstoutf8(const wchar_t *src, unsigned char *dst, int maxdstlen) {   //  Get pointers to our start and end points of the input buffer.   const wchar_t* srcPtr = src;   const wchar_t* srcEnd = srcPtr + wcslen(src);   unsigned char *dstSave = dst;   unsigned char *dstEnd = dst+maxdstlen;    while (srcPtr < srcEnd) {     unsigned int encodedBytes;     wchar_t curVal = (*srcPtr++) & 0x0000FFFF;      // Watchout for surrogates.     if ( ((curVal >= 0xD800) && (curVal <= 0xDFFF)) ||          ((curVal == 0xFFFE) || curVal == 0xFFFF) )       return SWIchar_FAIL;      // Figure out how many bytes we need     if (curVal < 0x80)                encodedBytes = 1;     else if (curVal < 0x800)          encodedBytes = 2;     else if (curVal < 0x10000)        encodedBytes = 3;     else if (curVal < 0x200000)       encodedBytes = 4;     else if (curVal < 0x4000000)      encodedBytes = 5;     else if (curVal <= 0x7FFFFFFF)    encodedBytes = 6;     else {       // THIS SHOULD NOT HAPPEN!       return SWIchar_ERROR;     }      //  And spit out the bytes. We spit them out in reverse order     //  here, so bump up the output pointer and work down as we go.     
dst += encodedBytes;      if ( dst > dstEnd ) {       return SWIchar_BUFFER_OVERFLOW;     }      switch(encodedBytes) {     case 6 : *--dst = (unsigned char) ((curVal | 0x80) & 0xBF);       curVal >>= 6;     case 5 : *--dst = (unsigned char)((curVal | 0x80) & 0xBF);       curVal >>= 6;     case 4 : *--dst = (unsigned char)((curVal | 0x80) & 0xBF);       curVal >>= 6;     case 3 : *--dst = (unsigned char)((curVal | 0x80) & 0xBF);       curVal >>= 6;     case 2 : *--dst = (unsigned char)((curVal | 0x80) & 0xBF);       curVal >>= 6;     case 1 : *--dst = (unsigned char)(curVal | gFirstByteMark[encodedBytes]);     }      dst += encodedBytes;   }   *dst = '\0';    // return dst-dstSave    return SWIchar_SUCCESS; // check this (CARO) }
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -