📄 unicode.cpp
字号:
/*
* Copyright (c) 2001,2002,2003 Mike Matsnev. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Absolutely no warranty of function or purpose is made by the author
* Mike Matsnev.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $Id: Unicode.cpp,v 1.13.2.5 2003/12/17 12:19:58 mike Exp $
*
*/
#include <afx.h>
#include "ptr.h"
#include "Unicode.h"
// Fallback definitions of the numeric code page identifiers used in this
// file, for the case where the included headers do not already define them.
#ifndef CP_UTF8
#define CP_UTF8 65001 // same number as the "UTF-8" row of ms_codepages
#endif
#ifndef CP_1252
#define CP_1252 1252 // same number as the "ANSI - Latin I" row of ms_codepages
#endif
// Case-insensitive comparison of two NUL-terminated strings (both lengths
// are passed as -1) using the user's default locale.
// Expands to the CompareString() result minus 2. Per the Win32 documentation
// CompareString yields 1/2/3 for less/equal/greater and 0 on failure, so the
// macro evaluates to -1/0/+1, or -2 if the call itself fails.
#define CmpI(lhs,rhs) \
  (::CompareString(LOCALE_USER_DEFAULT,NORM_IGNORECASE,(lhs),-1,(rhs),-1)-2)
// One entry of the code page table: identifying names plus the pair of
// callbacks used to turn multibyte text in that encoding into wide chars.
struct CodePage {
  // Primary name of the encoding.
  const TCHAR *name;
  // Up to two alternative names; NULL when not provided.
  const TCHAR *alias1;
  const TCHAR *alias2;
  // Numeric code page id; the CE_* callbacks pass it to MultiByteToWideChar.
  UINT codepage;
  // Returns how many wchar_t the mblen bytes at mbs convert to.
  int (*length)(struct CodePage *cp,const char *mbs,int mblen);
  // Converts mblen bytes at mbs into at most wclen wchar_t stored at wcs.
  void (*convert)(struct CodePage *cp,const char *mbs,int mblen,
                  wchar_t *wcs,int wclen);
  // Byte -> wide char lookup table indexed by byte value (read by
  // TB_cp_convert); add_codepage leaves it NULL.
  const wchar_t *table;
};
// Length callback for code pages converted by the operating system.
//   cp    - table entry; cp->codepage selects the Windows code page
//   mbs   - multibyte input
//   mblen - number of input bytes
// Returns the value of MultiByteToWideChar called in "size query" mode
// (NULL output buffer, zero capacity): the number of wide characters the
// input converts to, or 0 if the call fails.
static int CE_cp_length(struct CodePage *cp,const char *mbs,int mblen) {
  // The output pointer comes before the output capacity; the previous code
  // passed them transposed and only worked because NULL is 0.
  return MultiByteToWideChar(cp->codepage,0,mbs,mblen,NULL,0);
}
// Conversion callback for code pages converted by the operating system.
// Hands mblen bytes at mbs to MultiByteToWideChar for code page
// cp->codepage, writing at most wclen wide characters to wcs.
static void CE_cp_convert(struct CodePage *cp,const char *mbs,int mblen,
                          wchar_t *wcs,int wclen)
{
  // This callback returns void, so the API's result is not propagated.
  (void)MultiByteToWideChar(cp->codepage,0,mbs,mblen,wcs,wclen);
}
// Length callback for table-driven single-byte code pages: every input byte
// becomes exactly one wide character, so the answer is simply mblen.
static int TB_cp_length(struct CodePage *cp,const char *mbs,int mblen) {
  (void)cp;  // not needed to compute the length
  (void)mbs; // the bytes themselves are never inspected
  return mblen;
}
// Conversion callback for table-driven single-byte code pages.
// Translates each input byte through cp->table (indexed by the unsigned
// byte value). Processes the smaller of mblen and wclen elements, so the
// output buffer is never overrun; nothing is written when that count is
// zero or negative.
static void TB_cp_convert(struct CodePage *cp,const char *mbs,int mblen,
                          wchar_t *wcs,int wclen)
{
  int count=(mblen<wclen) ? mblen : wclen;
  int i;

  for (i=0;i<count;++i)
    wcs[i]=cp->table[(unsigned char)mbs[i]];
}
// Length callback for the byte-widening conversion (see WS_cp_convert):
// one wide character per input byte, hence the result equals mblen.
static int WS_cp_length(struct CodePage *cp,const char *mbs,int mblen) {
  (void)cp;  // not needed to compute the length
  (void)mbs; // the bytes themselves are never inspected
  return mblen;
}
// Conversion callback that widens bytes without any table: each input byte
// is stored as the wide character with the same unsigned value (0..255).
// Processes the smaller of mblen and wclen elements; nothing is written
// when that count is zero or negative. cp is not used.
static void WS_cp_convert(struct CodePage *cp,const char *mbs,int mblen,
                          wchar_t *wcs,int wclen)
{
  int count=(mblen<wclen) ? mblen : wclen;
  int i;

  for (i=0;i<count;++i)
    wcs[i]=(unsigned char)mbs[i];
}
// Decodes one UTF-8 sequence starting at mb[0]; mblen (>= 1) is the number
// of bytes available.
//
// Returns the number of bytes to consume, or 0 when the lead byte announces
// more continuation bytes than are left in the input (callers stop there).
// *ucs receives the decoded code point, or -1 when the consumed byte does
// not produce a character:
//   - a stray continuation byte (0x80..0xBF) or a byte >= 0xF5 is skipped;
//   - a lead byte whose continuation bytes are malformed is skipped alone,
//     so decoding resynchronizes on the very next byte;
//   - a 4-byte sequence above U+10FFFF (F4 90..BF ...) is treated the same.
// Overlong forms and encoded surrogates are passed through unchanged.
static int UTF_decode_one(const unsigned char *mb,int mblen,long *ucs)
{
  unsigned char c=mb[0];
  int need; // number of continuation bytes announced by the lead byte
  long value;
  int i;

  *ucs=-1;
  if (c<0x80) { // ascii
    *ucs=c;
    return 1;
  }
  if (c<0xc0) // continuation byte without a lead byte
    return 1;
  // Classification must look at the lead byte itself, never at the byte
  // that follows it.
  if (c<0xe0) { // 2-byte seq
    need=1;
    value=c&0x1f;
  } else if (c<0xf0) { // 3-byte seq
    need=2;
    value=c&0x0f;
  } else if (c<=0xf4) { // 4-byte seq
    need=3;
    value=c&0x07;
  } else // 0xf5..0xff never occur in UTF-8
    return 1;
  if (mblen-1<need) // truncated input
    return 0;
  for (i=1;i<=need;++i) {
    if ((mb[i]&0xc0)!=0x80) // not a continuation byte: drop the lead only
      return 1;
    value=(value<<6)|(mb[i]&0x3f);
  }
  if (value>0x10ffff) // only reachable with lead byte 0xf4
    return 1;
  *ucs=value;
  return need+1;
}

// Number of wchar_t elements needed to store code point ucs: two (a
// surrogate pair) when wchar_t is narrower than 32 bits and the code point
// lies above U+FFFF, otherwise one.
static int UTF_wc_units(long ucs)
{
  return (sizeof(wchar_t)<4 && ucs>0xffff) ? 2 : 1;
}

// Length callback for UTF-8.
//   cp    - unused
//   mbs   - UTF-8 input
//   mblen - number of input bytes
// Returns the number of wchar_t elements UTF_cp_convert produces for the
// same input when given a large enough buffer. Invalid bytes contribute
// nothing; counting stops at a sequence cut off by the end of the input.
static int UTF_cp_length(struct CodePage *cp,const char *mbs,int mblen) {
  const unsigned char *mb=(const unsigned char *)mbs;
  int len=0;

  (void)cp;
  while (mblen>0) {
    long ucs;
    int used=UTF_decode_one(mb,mblen,&ucs);

    if (used==0) // truncated sequence at the end of the input
      break;
    mb+=used;
    mblen-=used;
    if (ucs>=0)
      len+=UTF_wc_units(ucs);
  }
  return len;
}

// Conversion callback for UTF-8.
//   cp    - unused
//   mbs   - UTF-8 input
//   mblen - number of input bytes
//   wcs   - output buffer
//   wclen - capacity of wcs in wchar_t elements
// Decodes with exactly the rules UTF_cp_length counts by. Writes at most
// wclen elements and stops early rather than emit half a surrogate pair.
// The output is not NUL-terminated.
static void UTF_cp_convert(struct CodePage *cp,const char *mbs,int mblen,
                           wchar_t *wcs,int wclen)
{
  const unsigned char *mb=(const unsigned char *)mbs;
  wchar_t *wce=wcs+wclen;

  (void)cp;
  while (mblen>0 && wcs<wce) {
    long ucs;
    int used=UTF_decode_one(mb,mblen,&ucs);

    if (used==0) // truncated sequence at the end of the input
      break;
    mb+=used;
    mblen-=used;
    if (ucs<0) // invalid byte skipped, nothing to store
      continue;
    if (UTF_wc_units(ucs)==2) {
      if (wce-wcs<2) // no room left for both halves
        break;
      ucs-=0x10000;
      *wcs++=(wchar_t)(0xd800|(ucs>>10));
      *wcs++=(wchar_t)(0xdc00|(ucs&0x3ff));
    } else {
      *wcs++=(wchar_t)ucs;
    }
  }
}
// Heap-allocated table of registered code pages; add_codepage grows it with
// realloc and appends entries.
static struct CodePage *codepages;
// curcp: number of table entries in use (index of the next free slot).
// maxcp: number of entries the table currently has room for.
static int curcp,maxcp;
// NOTE(review): not referenced in the visible part of this file; presumably
// the table index of the default code page - confirm against its users.
static int default_cp;
static int add_codepage(const TCHAR *name,UINT cp,const TCHAR *alias1=NULL,
const TCHAR *alias2=NULL)
{
if (curcp>=maxcp) {
maxcp+=16;
codepages=(struct CodePage *)realloc(codepages,maxcp*sizeof(struct CodePage));
if (codepages==NULL)
ExitThread(0);
}
codepages[curcp].name=name;
codepages[curcp].alias1=alias1;
codepages[curcp].alias2=alias2;
codepages[curcp].codepage=cp;
codepages[curcp].length=CE_cp_length;
codepages[curcp].convert=CE_cp_convert;
codepages[curcp].table=NULL;
return curcp++;
}
// Display names and aliases for Windows code page numbers.
// The rows MUST stay sorted by ascending cp: get_mscp_num() finds a row by
// binary search on that field. Rows that list fewer than two aliases leave
// the remaining alias pointers NULL (aggregate initialization zero-fills).
struct {
UINT cp; // Windows code page number (search key)
const TCHAR *name; // descriptive name
const TCHAR *alias1,*alias2; // alternative names, NULL when absent
} ms_codepages[]={
{ 37, _T("IBM EBCDIC - U.S./Canada"), _T("IBM037"), _T("cp037") },
{ 437, _T("OEM - United States"), _T("IBM437"), _T("cp437") },
{ 500, _T("IBM EBCDIC - International"), _T("IBM500"), _T("cp500") },
{ 737, _T("OEM - Greek 437G") },
{ 775, _T("OEM - Baltic"), _T("IBM775"), _T("cp775") },
{ 850, _T("OEM - Multilingual Latin I"), _T("IBM850"), _T("cp850") },
{ 852, _T("OEM - Latin II"), _T("IBM852"), _T("cp852") },
{ 855, _T("OEM - Cyrillic"), _T("IBM855"), _T("cp855") },
{ 857, _T("OEM - Turkish"), _T("IBM857"), _T("cp857") },
{ 860, _T("OEM - Portuguese"), _T("IBM860"), _T("cp860") },
{ 861, _T("OEM - Icelandic"), _T("IBM861"), _T("cp861") },
{ 863, _T("OEM - Canadian French"), _T("IBM863"), _T("cp863") },
{ 865, _T("OEM - Nordic"), _T("IBM865"), _T("cp865") },
{ 866, _T("OEM - Russian"), _T("IBM866"), _T("cp866") },
{ 869, _T("OEM - Modern Greek"), _T("IBM869"), _T("cp869") },
{ 874, _T("ANSI/OEM - Thai") },
{ 875, _T("IBM EBCDIC - Modern Greek") },
{ 932, _T("ANSI/OEM - Japanese Shift-JIS"), _T("Shift_JIS") },
{ 936, _T("ANSI/OEM - Simplified Chinese GBK"), _T("GBK") },
{ 949, _T("ANSI/OEM - Korean") },
{ 950, _T("ANSI/OEM - Traditional Chinese Big5"), _T("Big5") },
{ 1026, _T("IBM EBCDIC - Turkish (Latin-5)"), _T("cp1026") },
{ 1250, _T("ANSI - Central Europe"), _T("windows-1250") },
{ 1251, _T("ANSI - Cyrillic"), _T("windows-1251") },
{ 1252, _T("ANSI - Latin I"), _T("windows-1252"), _T("ISO-8859-1") }, // XXX: the ISO-8859-1 alias is also claimed by 28591 below
{ 1253, _T("ANSI - Greek"), _T("windows-1253") },
{ 1254, _T("ANSI - Turkish"), _T("windows-1254") },
{ 1255, _T("ANSI - Hebrew"), _T("windows-1255") },
{ 1256, _T("ANSI - Arabic"), _T("windows-1256") },
{ 1257, _T("ANSI - Baltic"), _T("windows-1257") },
{ 1258, _T("ANSI/OEM - Viet Nam"), _T("windows-1258") },
{ 10000, _T("MAC - Roman") },
{ 10006, _T("MAC - Greek I") },
{ 10007, _T("MAC - Cyrillic") },
{ 10010, _T("MAC - Romania") },
{ 10017, _T("MAC - Ukraine") },
{ 10029, _T("MAC - Latin II") },
{ 10079, _T("MAC - Icelandic") },
{ 10081, _T("MAC - Turkish") },
{ 10082, _T("MAC - Croatia") },
{ 20127, _T("US-ASCII") },
{ 20261, _T("T.61") },
{ 20866, _T("Russian - KOI8"), _T("koi8-r") },
{ 21866, _T("Ukrainian - KOI8-U"), _T("koi8-u") },
{ 28591, _T("ISO 8859-1 Latin I"), _T("ISO-8859-1") },
{ 28592, _T("ISO 8859-2 Central Europe"), _T("ISO-8859-2") },
{ 28594, _T("ISO 8859-4 Baltic"), _T("ISO-8859-4") },
{ 28595, _T("ISO 8859-5 Cyrillic"), _T("ISO-8859-5") },
{ 28597, _T("ISO 8859-7 Greek"), _T("ISO-8859-7") },
{ 28599, _T("ISO 8859-9 Latin 5"), _T("ISO-8859-9") },
{ 28605, _T("ISO 8859-15 Latin 9"), _T("ISO-8859-15") },
{ 65000, _T("UTF-7") },
{ 65001, _T("UTF-8") },
};
// Number of rows in ms_codepages.
#define NUM_MSCP (sizeof(ms_codepages)/sizeof(ms_codepages[0]))
// Looks up code page number cp in ms_codepages.
// Returns the index of the matching row, or -1 if the number is not listed.
// Uses binary search, which relies on the table being sorted by cp.
static int get_mscp_num(UINT cp) {
  int lo=0;
  int hi=NUM_MSCP; // search window is [lo,hi)

  while (lo<hi) {
    int mid=lo+(hi-lo)/2;
    UINT key=ms_codepages[mid].cp;

    if (key==cp)
      return mid;
    if (key<cp)
      lo=mid+1; // target can only be to the right of mid
    else
      hi=mid; // target can only be to the left of mid
  }
  return -1;
}
static struct {
BYTE distmap[256];
wchar_t unimap[256];
UINT cp;
} builtin_encodings[]={
{{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -