📄 charencoder.cpp

📁 Windows CE 6.0 Server 源码
💻 CPP
字号:
//
// Copyright (c) Microsoft Corporation.  All rights reserved.
//
//
// Use of this source code is subject to the terms of the Microsoft shared
// source or premium shared source license agreement under which you licensed
// this source code. If you did not accept the terms of the license agreement,
// you are not authorized to use this source code. For the terms of the license,
// please see the license agreement between you and Microsoft or, if applicable,
// see the SOURCE.RTF on your install media or the root of your tools installation.
// THE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES.
//
//+---------------------------------------------------------------------------------
//
//
// File:
//      charencoder.cpp
//
// Contents:
//
//      CharEncoder, Encoding object implementation
//
//----------------------------------------------------------------------------------

#include "headers.h"

#include <tchar.h>
#include <mlang.h>

#include "charencoder.h"

CAutoRefc<IMultiLanguage> g_pMultiLanguage;

HRESULT _EnsureMultiLanguage();

static CSpinlock 	mlanglock(x_MultiLang);


//
// Delegate other charsets to mlang
//
const EncodingEntry CharEncoder::s_charsetInfo [] = 
{
    { CP_1250,  _T("WINDOWS-1250"),       1, wideCharToMultiByteWin32 }, 
    { CP_1251,  _T("WINDOWS-1251"),       1, wideCharToMultiByteWin32 }, 
    { CP_1252,  _T("WINDOWS-1252"),       1, wideCharToMultiByteWin32 },
    { CP_1253,  _T("WINDOWS-1253"),       1, wideCharToMultiByteWin32 }, 
    { CP_1254,  _T("WINDOWS-1254"),       1, wideCharToMultiByteWin32 },
    { CP_1257,  _T("WINDOWS-1257"),       1, wideCharToMultiByteWin32 },
    { CP_UCS_4, _T("UCS-4"),              4, wideCharToUcs4Littleendian },
    { CP_UCS_2, _T("ISO-10646-UCS-2"),    2, wideCharToUcs2Littleendian },
    { CP_UCS_2, _T("UCS-2"),              2, wideCharToUcs2Littleendian },
    { CP_UCS_2, _T("UNICODE-2-0-UTF-16"), 2, wideCharToUcs2Littleendian },
    { CP_UCS_2, _T("UTF-16"),             2, wideCharToUcs2Littleendian },
    { CP_UTF_8, _T("UNICODE-1-1-UTF-8"),  3, wideCharToUtf8 },
    { CP_UTF_8, _T("UNICODE-2-0-UTF-8"),  3, wideCharToUtf8 },
    { CP_UTF_8, _T("UTF-8"),              3, wideCharToUtf8 }
};

////////////////////////////////////////////////////////////////////////////////////////////////////
//  function: int CharEncoder::NeedsByteOrderMark(const WCHAR * charset, ULONG cchEncoding)
//
//  parameters:
//
//  description:
//        Determines whether byte order mark is needed
//  returns:
//
////////////////////////////////////////////////////////////////////////////////////////////////////
int CharEncoder::NeedsByteOrderMark(const WCHAR * charset, ULONG cchEncoding)
{
    for (int i = 10; i > 5; i--)
    {
        if (_tcslen(s_charsetInfo[i].m_charset) == cchEncoding &&
            _tcsnicmp(charset, s_charsetInfo[i].m_charset, cchEncoding) == 0)
        {
            return TRUE;
        }
    }
    return FALSE;
}


////////////////////////////////////////////////////////////////////////////////////////////////////
//  function: int CharEncoder::getCharsetInfo(const TCHAR * charset, CODEPAGE * pcodepage, UINT * mCharSize)
//
//  parameters:
//
//  description:
//        Returns charset information
//  returns:
//
////////////////////////////////////////////////////////////////////////////////////////////////////
int CharEncoder::getCharsetInfo(const TCHAR * charset, CODEPAGE * pcodepage, UINT * mCharSize)
{
    CPINFO cpinfo;

    for (int i = countof(s_charsetInfo) - 1; i >= 0; i--)
    {
        if (_tcsicmp(charset, s_charsetInfo[i].m_charset) == 0)
        {             
            //
            // test whether we can handle it locally or not
            //
            if (i > 5 || GetCPInfo(s_charsetInfo[i].m_codepage, &cpinfo))
            {
                *pcodepage = s_charsetInfo[i].m_codepage;
                *mCharSize = s_charsetInfo[i].m_maxCharSize;
                return i;
            }
            else
            {
                break;
            }
        }
    }

    //
    // delegate to MLANG then
    //
    MIMECSETINFO mimeCharsetInfo;
    HRESULT hr;

    hr = _EnsureMultiLanguage();
    if (hr == S_OK)
    {
        hr = g_pMultiLanguage->GetCharsetInfo((TCHAR*)charset, &mimeCharsetInfo);
        if (hr == S_OK)
        {
            *pcodepage = mimeCharsetInfo.uiInternetEncoding;
            if (GetCPInfo(*pcodepage, &cpinfo))
                *mCharSize = cpinfo.MaxCharSize;
            else // if we don't know the max size, assume a large size
                *mCharSize = 4;
            return -1;
        }
    }

    return -2;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//  function: HRESULT CreateMultiLanguage(IMultiLanguage ** ppUnk)
//
//  parameters:
//
//  description:
//        Creates instance of MultiLanguage component
//  returns:
//
////////////////////////////////////////////////////////////////////////////////////////////////////
HRESULT CreateMultiLanguage(IMultiLanguage ** ppUnk)
{
    HRESULT hr = S_OK;

    if (*ppUnk == NULL)
    {
        CLocalSpinlock 	m_lock(&mlanglock);
        // perhaps someone took care of it already
        if (*ppUnk == NULL)
        {
            hr = CoCreateInstance(CLSID_CMultiLanguage, NULL, CLSCTX_INPROC_SERVER, 
                IID_IMultiLanguage, (void**)ppUnk);
        }
    }
    return hr;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//  function: HRESULT _EnsureMultiLanguage()
//
//  parameters:
//
//  description:
//        Ensures MultiLanguage component is created.
//  returns:
//
////////////////////////////////////////////////////////////////////////////////////////////////////
HRESULT _EnsureMultiLanguage()
{
    return CreateMultiLanguage(&g_pMultiLanguage);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//  function: HRESULT CharEncoder::getWideCharToMultiByteInfo
//
//  parameters:
//
//  description:
//        get information about a code page identified by <code> encoding </code>
//  returns:
//
////////////////////////////////////////////////////////////////////////////////////////////////////
HRESULT CharEncoder::getWideCharToMultiByteInfo(const WCHAR * charset, CODEPAGE * pcodepage, WideCharToMultiByteFunc ** pfnWideCharToMultiByte, UINT * mCharSize)
{
    HRESULT hr = S_OK;

    int i = getCharsetInfo(charset, pcodepage, mCharSize);
    if (i >= 0) // in our short list
    {
        switch (*pcodepage)
        {
        case CP_UCS_2:
            *pfnWideCharToMultiByte = wideCharToUcs2Littleendian;
            break;
        case CP_UCS_4:
            *pfnWideCharToMultiByte = wideCharToUcs4Littleendian;
            break;
        default:
            *pfnWideCharToMultiByte = s_charsetInfo[i].m_pfnWideCharToMultiByte;
            break;
        }
    }
    else if (i == -1) // delegate to MLANG
    {
        hr = g_pMultiLanguage->IsConvertible(CP_UTF_16, *pcodepage);
        if (hr == S_OK)
            *pfnWideCharToMultiByte = wideCharToMultiByteMlang;
        else
            hr = E_FAIL;
    }
    else
    {
        hr = E_FAIL;
    }

    return hr;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//  function: HRESULT CharEncoder::wideCharToUcs2Littleendian
//
//  parameters:
//
//  description:
//        Scans buffer and translates Unicode characters into Ucs2 little endian characters
//  returns:
//
////////////////////////////////////////////////////////////////////////////////////////////////////
HRESULT CharEncoder::wideCharToUcs2Littleendian(DWORD* , CODEPAGE , 
                                            TCHAR * buffer, 
                                            UINT *cch, 
                                            BYTE* bytebuffer, 
                                            UINT * cb)
{
    UINT num = (*cb) >> 1;
    if (num > *cch)
        num = *cch;
    ::memcpy(bytebuffer, buffer, num * sizeof(TCHAR));
    *cch = num;
    *cb = num << 1;
    return S_OK;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//  function: HRESULT CharEncoder::wideCharToUcs4Littleendian(DWORD* pdwMode, CODEPAGE codepage, TCHAR * buffer, 
//
//  parameters:
//
//  description:
//        Scans buffer and translates Unicode characters into Ucs4 little endian characters
//  returns:
//
////////////////////////////////////////////////////////////////////////////////////////////////////
HRESULT CharEncoder::wideCharToUcs4Littleendian(DWORD* pdwMode, CODEPAGE codepage, TCHAR * buffer, 
                                              UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    UINT num = (*cb) >> 2;
    if (num > *cch)
        num = *cch;

    for (UINT i = num; i > 0; i--)
    {
        *bytebuffer++ = (*buffer) & 0xFF;
        *bytebuffer++ = (*buffer) >> 8;
        *bytebuffer++ = 0;
        *bytebuffer++ = 0;
        buffer++;
    }
    *cch = num;
    *cb = num << 2;
    return S_OK;
}


////////////////////////////////////////////////////////////////////////////////////////////////////
//  function: HRESULT CharEncoder::wideCharToUtf8(DWORD* pdwMode, CODEPAGE codepage, TCHAR * buffer, 
//
//  parameters:
//
//  description:
//        Scans buffer and translates Unicode characters into UTF8 characters
//  returns:
//
////////////////////////////////////////////////////////////////////////////////////////////////////
HRESULT CharEncoder::wideCharToUtf8(DWORD* pdwMode, CODEPAGE codepage, TCHAR * buffer, 
                                       UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    UINT count = 0, num = *cch, m1 = *cb, m2 = m1 - 1, m3 = m2 - 1, m4 = m3 - 1;
    DWORD dw1 = 0;
    bool surrogate = false;
    UINT i;

    for (i = num; i > 0; i--)
    {
        DWORD dw = *buffer;

        if (surrogate) //  is it the second char of a surrogate pair?
        {
            if (dw >= 0xdc00 && dw <= 0xdfff)
            {
                // four bytes 0x11110xxx 0x10xxxxxx 0x10xxxxxx 0x10xxxxxx
                if (count < m4)
                    count += 4;
                else
                    break;
                ULONG ucs4 = (dw1 - 0xd800) * 0x400 + (dw - 0xdc00) + 0x10000;
                *bytebuffer++ = (byte)(( ucs4 >> 18) | 0xF0);
                *bytebuffer++ = (byte)((( ucs4 >> 12) & 0x3F) | 0x80);
                *bytebuffer++ = (byte)((( ucs4 >> 6) & 0x3F) | 0x80);
                *bytebuffer++ = (byte)(( ucs4 & 0x3F) | 0x80);
                surrogate = false;
                buffer++;
                continue;
            }
            else // Then dw1 must be a three byte character
            {
                if (count < m3)
                    count += 3;
                else
                    break;
                *bytebuffer++ = (byte)(( dw1 >> 12) | 0xE0);
                *bytebuffer++ = (byte)((( dw1 >> 6) & 0x3F) | 0x80);
                *bytebuffer++ = (byte)(( dw1 & 0x3F) | 0x80);
            }
            surrogate = false;
        }

        if (dw  < 0x80) // one byte, 0xxxxxxx
        {
            if (count < m1)
                count++;
            else
                break;
            *bytebuffer++ = (byte)dw;
        }
        else if ( dw < 0x800) // two WORDS, 110xxxxx 10xxxxxx
        {
            if (count < m2)
                count += 2;
            else
                break;
            *bytebuffer++ = (byte)((dw >> 6) | 0xC0);
            *bytebuffer++ = (byte)((dw & 0x3F) | 0x80);
        }
        else if (dw >= 0xd800 && dw <= 0xdbff) // Assume that it is the first char of surrogate pair
        {
            if (i == 1) // last wchar in buffer
                break;
            dw1 = dw;
            surrogate = true;
        }
        else // three bytes, 1110xxxx 10xxxxxx 10xxxxxx
        {
            if (count < m3)
                count += 3;
            else
                break;
            *bytebuffer++ = (byte)(( dw >> 12) | 0xE0);
            *bytebuffer++ = (byte)((( dw >> 6) & 0x3F) | 0x80);
            *bytebuffer++ = (byte)(( dw & 0x3F) | 0x80);
        }
        buffer++;
    }

    *cch = surrogate ? num - i - 1 : num - i;
    *cb = count;

    return S_OK;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//  function: HRESULT CharEncoder::wideCharToMultiByteWin32(DWORD* pdwMode, CODEPAGE codepage, TCHAR * buffer, 
//
//  parameters:
//
//  description:
//        Scans buffer and translates Unicode characters into characters identified
//        by <code> codepage </>, using Win32 function WideCharToMultiByte for encoding 
//  returns:
//
////////////////////////////////////////////////////////////////////////////////////////////////////
HRESULT CharEncoder::wideCharToMultiByteWin32(DWORD* pdwMode, CODEPAGE codepage, TCHAR * buffer, 
                                              UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    HRESULT hr = S_OK;
    BOOL fBadChar = false;
    *cb = ::WideCharToMultiByte(codepage, NULL, buffer, *cch, (char*)bytebuffer, *cb, NULL, &fBadChar);
    if (*cb == 0)
        hr = ::GetLastError();
    else if (fBadChar)
        hr = S_FALSE;
    return hr;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//  function: HRESULT CharEncoder::wideCharToMultiByteMlang(DWORD* pdwMode, CODEPAGE codepage, TCHAR * buffer, 
//
//  parameters:
//
//  description:
//        Scans buffer and translates Unicode characters into characters of charSet 
//        identified by <code> codepage </code>, using Mlang for encoding 
//  returns:
//
////////////////////////////////////////////////////////////////////////////////////////////////////
HRESULT CharEncoder::wideCharToMultiByteMlang(DWORD* pdwMode, CODEPAGE codepage, TCHAR * buffer, 
                                              UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    HRESULT hr;
    if (FAILED(hr = _EnsureMultiLanguage()))
    {
        return hr;
    }
    if (FAILED(hr = g_pMultiLanguage->ConvertStringFromUnicode(pdwMode, codepage,
                                       buffer, cch, (char*)bytebuffer, cb )))
    {
        return hr;
    }
    return S_OK;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -