strconv.cpp

来自「A*算法 A*算法 A*算法 A*算法A*算法A*算法」· C++ 代码 · 共 2,176 行 · 第 1/5 页

CPP
2,176
字号
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else
        {
            // BASE64 encoded string
            bool lsb;
            unsigned char c;
            unsigned int d, l;
            for (lsb = false, d = 0, l = 0;
                (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
            {
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                }
            }
            if (*psz == '-')
                psz++;
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}

//
// BASE64 encoding table
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};

//
// UTF-7 encoding table
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{


    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;
            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return (size_t)-1;
        }
#endif
        else
        {
            if (buf)
                *buf++ = '+';
            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                unsigned int lsb, d, l;
                for (d = 0, l = 0;; psz++)
                {
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
                    len++;
                }
            }
            if (buf)
                *buf++ = '-';
            len++;
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}

// ----------------------------------------------------------------------------
// UTF-8
// ----------------------------------------------------------------------------

static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap invalid
// characters invalid in a UTF-8 encoded string
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;

size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;
        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }
                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUuint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == (size_t)-1)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }
            if (invalid)
            {
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUuint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != (size_t)-1);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = wxUnicodePUA + (unsigned char)*opsz;
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        if ( buf && len + 3 < n )
                        {
                            unsigned char n = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + n / 0100 );
                            *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + n % 010 );
                        }
                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return (size_t)-1;
                }
            }
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}

static inline bool isoctal(wchar_t wch)
{
    return L'0' <= wch && wch <= L'7';
}

size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;
#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
        psz += (pa == (size_t)-1) ? 1 : pa;
#else
        cc=(*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0')*0100 +
                                 (psz[1] - L'0')*010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }

            else
            {
                len += cnt + 1;
                if (buf)
                {
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len<n))
        *buf = 0;

    return len;
}

// ----------------------------------------------------------------------------
// UTF-16
// ----------------------------------------------------------------------------

#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif


#ifdef WC_UTF16

// copy 16bit MB to 16bit String
size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len=0;

    while (*(wxUint16*)psz && (!buf || len < n))
    {
        if (buf)
            *buf++ = *(wxUint16*)psz;
        len++;

        psz += sizeof(wxUint16);
    }
    if (buf && len<n)   *buf=0;

    return len;
}


// copy 16bit String to 16bit MB
size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len=0;

    while (*psz && (!buf || len < n))
    {
        if (buf)
        {
            *(wxUint16*)buf = *psz;
            buf += sizeof(wxUint16);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?