📄 utf8_ucs2.c

📁 Apache V2.0.15 Alpha For Linuxhttpd-2_0_15-alpha.tar.Z
💻 C
字号:
/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2000-2001 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" must
 *    not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

#include "i18n.h"

/* Implement the design principle specified by RFC 2718 2.2.5
 * Guidelines for new URL Schemes - within the APR.
 *
 * Since many architectures support unicode, and UCS2 is the most
 * efficient storage used by those architectures, these functions
 * exist to validate a UCS string.  It is up to the operating system
 * to determine the validity of the string in the context of its
 * native language support.  File systems that support filename
 * characters of 0x80-0xff but have no support of Unicode will find
 * this function useful only for validating the character sequences
 * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
 * desired.
 *
 * from RFC 2279 UTF-8, a transformation format of ISO 10646
 *
 *     UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 * 1:2 0000 0000-0000 007F   0xxxxxxx
 * 2:2 0000 0080-0000 07FF   110XXXXx 10xxxxxx
 * 3:2 0000 0800-0000 FFFF   1110XXXX 10Xxxxxx 10xxxxxx
 * 4:4 0001 0000-001F FFFF   11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
 * inv 0020 0000-03FF FFFF   111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * inv 0400 0000-7FFF FFFF   1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * One of the X values must be one for the encoding length to be legit.
 * Neither the z bit, nor the final two forms, are used for ucs-2
 *
 *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
 *   Unicode parlance), being actually UCS-4 characters transformed
 *   through UTF-16, need special treatment: the UTF-16 transformation
 *   must be undone, yielding a UCS-4 character that is then transformed
 *   as above."
 *
 * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
 *
 *  U' = U - 0x10000
 *  U' = 000000000000yyyyyyyyyyxxxxxxxxxx
 *                  W1 = 110110yyyyyyyyyy
 *                  W2 = 110111xxxxxxxxxx
 *
 * conv_utf8_to_ucs2 out bytes:  sizeof(in) * 1 <= Req <= sizeof(in) * 2
 *
 * conv_ucs2_to_utf8 out words:  sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
 */

/* Convert UTF-8 octets at `in` into 16-bit words at `out`.
 *
 * in        UTF-8 input octets
 * inbytes   in: octets available at `in`; decremented by the octets of
 *           every character that was fully converted
 * out       destination for the 16-bit words
 * outwords  in: words available at `out`; decremented by every word
 *           that was written
 *
 * Code points below 0x10000 produce one word, anything larger produces
 * a D800/DC00 surrogate pair.
 *
 * Returns APR_EINVAL for a misplaced or missing continuation octet, a
 * lead octet announcing more than three continuation octets, an
 * encoding longer than necessary, an encoded D800-DFFF value, or a
 * value above 10FFFF.  Returns APR_INCOMPLETE when the input ends
 * inside a multibyte sequence.  Returns APR_SUCCESS otherwise - also
 * when the loop stopped because the output ran out of room, so the
 * caller has to look at both counters.
 */
apr_status_t conv_utf8_to_ucs2(const char *in, apr_size_t *inbytes,
                               apr_wchar_t *out, apr_size_t *outwords)
{
    apr_int64_t ucs;        /* code point under construction             */
    apr_int64_t lead_bits;  /* length-marker bits of the lead octet      */
    int octet;              /* octet currently being examined            */
    int trail;              /* continuation octets the lead octet claims */
    int consumed;           /* total octets of this sequence             */
    int peek;               /* first continuation octet, not yet eaten   */
    int too_long;           /* payload would have fit a shorter form     */
    int i;

    while (*inbytes && *outwords) {
        octet = (unsigned char)(*in++);

        /* Seven-bit US-ASCII passes straight through. */
        if ((octet & 0x80) == 0) {
            --*inbytes;
            --*outwords;
            *(out++) = octet;
            continue;
        }

        /* 10xxxxxx can only follow a lead octet. */
        if ((octet & 0xC0) != 0xC0) {
            return APR_EINVAL;
        }

        /* Count the continuation octets the lead octet announces while
         * widening the mask that covers its marker bits.
         */
        lead_bits = 0xE0;
        trail = 1;
        while ((octet & lead_bits) == lead_bits) {
            lead_bits |= lead_bits >> 1;
            if (++trail > 3) {  /* (truly 5 for ucs-4) */
                return APR_EINVAL;
            }
        }
        ucs = octet & ~lead_bits;
        consumed = trail + 1;

        if (*inbytes < (apr_size_t)consumed) {
            return APR_INCOMPLETE;
        }

        /* At least one more octet is available here, so it is safe to
         * look at the first continuation octet before consuming it.
         */
        peek = (unsigned char)*in;
        too_long = (ucs == 0) && ((peek & 0x3F & (lead_bits << 1)) == 0);

        /* utf-8 demands the shortest possible encoding, so reject
         * sequences whose leading payload bits are all zero, plus the
         * ranges that must not appear at all.
         */
        switch (trail) {
        case 1:
            if ((ucs & 0x1E) == 0) {
                return APR_EINVAL;
            }
            break;

        case 2:
            if (too_long) {
                return APR_EINVAL;
            }
            /* D800-DFFF may only come from utf-16 surrogates
             * (may not be an appropriate restriction for ucs-4)
             */
            if (ucs == 0x0D && (peek & 0x20)) {
                return APR_EINVAL;
            }
            break;

        case 3:
            if (too_long) {
                return APR_EINVAL;
            }
            /* Short circuit values > 110000 */
            if (ucs > 4) {
                return APR_EINVAL;
            }
            if (ucs == 4 && (peek & 0x30)) {
                return APR_EINVAL;
            }
            break;
        }

        /* A three-continuation sequence becomes a surrogate pair and
         * therefore needs two output words instead of one.
         */
        if (*outwords < (apr_size_t)((trail > 2) ? 2 : 1)) {
            break; /* buffer full */
        }

        /* Fold in six payload bits per continuation octet. */
        for (i = 0; i < trail; ++i) {
            octet = (unsigned char)*(in++);
            if ((octet & 0xC0) != 0x80) {
                return APR_EINVAL;
            }
            ucs = (ucs << 6) | (octet & 0x3F);
        }
        *inbytes -= consumed;

        /* ucs is now a true ucs-4 character; fold it to ucs-2. */
        if (ucs < 0x10000) {
            --*outwords;
            *(out++) = (apr_wchar_t) ucs;
        }
        else {
            *outwords -= 2;
            ucs -= 0x10000;
            *(out++) = (apr_wchar_t) (0xD800 | (ucs >> 10));
            *(out++) = (apr_wchar_t) (0xDC00 | (ucs & 0x03FF));
        }
    }

    /* Buffer full 'errors' aren't errors, the client must inspect both
     * the inbytes and outwords values
     */
    return APR_SUCCESS;
}

/* Convert 16-bit words at `in` into UTF-8 octets at `out`.
 *
 * in        input words; D800/DC00 surrogate pairs are recombined
 * inwords   in: words available at `in`; decremented by the words of
 *           every character that was fully converted
 * out       destination for the UTF-8 octets
 * outbytes  in: octets available at `out`; decremented by every octet
 *           that was written
 *
 * Returns APR_EINVAL for a DC00-DFFF word that does not follow a
 * D800-DBFF word, or a D800-DBFF word that is not followed by a
 * DC00-DFFF word.  Returns APR_INCOMPLETE when the input ends right
 * after a D800-DBFF word.  Returns APR_SUCCESS otherwise - also when
 * the loop stopped because the output ran out of room, so the caller
 * has to look at both counters.
 */
apr_status_t conv_ucs2_to_utf8(const apr_wchar_t *in, apr_size_t *inwords,
                               char *out, apr_size_t *outbytes)
{
    apr_int64_t ucs;        /* code point being encoded                  */
    apr_int64_t bits_left;  /* payload bits not covered by the length yet */
    char *tail;             /* write cursor, walks backwards             */
    int word;               /* input word currently being examined       */
    int lead;               /* marker bits of the lead octet             */
    int trail;              /* continuation octets to emit               */

    while (*inwords && *outbytes) {
        word = (unsigned short)(*in++);

        /* Seven-bit US-ASCII passes straight through. */
        if (word < 0x80) {
            --*inwords;
            --*outbytes;
            *(out++) = (unsigned char) word;
            continue;
        }

        switch (word & 0xFC00) {
        case 0xDC00:
            /* A trailing surrogate cannot start a character. */
            return APR_EINVAL;

        case 0xD800:
            /* Leading surrogate: the partner word must be present ... */
            if (*inwords < 2) {
                return APR_INCOMPLETE;
            }
            /* ... and must be a trailing surrogate. */
            if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
                return APR_EINVAL;
            }
            ucs = (word & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
            ucs += 0x10000;
            break;

        default:
            /* Plain single-word character. */
            ucs = word;
            break;
        }

        /* One continuation octet covers values up to 11 bits; every
         * further continuation octet adds five more.
         */
        trail = 1;
        for (bits_left = ucs >> 11; bits_left; bits_left >>= 5) {
            ++trail;
        }

        /* trail + 1 octets are about to be written. */
        if ((apr_size_t)trail >= *outbytes) {
            break; /* Insufficient buffer */
        }

        /* More than two continuation octets means the character came
         * from a surrogate pair, i.e. two input words.
         */
        *inwords -= (trail > 2) ? 2 : 1;
        *outbytes -= trail + 1;

        /* Emit the octets from last to first, growing the lead octet's
         * marker bits by one for each continuation octet written.
         */
        out += trail + 1;
        tail = out;
        lead = 0x80;
        for (; trail > 0; --trail) {
            lead |= lead >> 1;
            *(--tail) = (unsigned char)(0x80 | (ucs & 0x3F));
            ucs >>= 6;
        }

        /* Whatever payload is left belongs in the lead octet. */
        *(--tail) = (unsigned char)(lead | ucs);
    }

    /* Buffer full 'errors' aren't errors, the client must inspect both
     * the inwords and outbytes values
     */
    return APR_SUCCESS;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -