📄 unicode.cpp
字号:
321, 0, 63, 36, 6, 90, 1026, 15, 248, 119, 3, 97, 0, 0, 1, 5, 1, 2, 0, 173,
553, 0, 1, 70, 0, 15, 51, 81, 80, 144, 13, 102, 32, 0, 9, 65, 227, 82, 26, 0,
65, 36, 117, 146, 0, 0, 39, 0, 77, 50, 24, 0, 0, 0, 1, 105, 6, 0, 50, 0, 0, 0,
0, 20, 0, 0, 29, 0, 0, 1, 0, 0, 27, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
0, 0, 0, 0, 28, 0, 20, 0, 0, 4, 0, 0, 16, 0, 0, 11, 11, 38, 212, 0, 15, 7, 1,
7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0, 7, 0, 0, 59, 0, 0, 24, 0, 0,
0, 0, 0, 27, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 185, 0, 5,
0, 0, 226, 0, 0, 162, 0, 19, 0, 0, 94, 8, 0, 1, 0, 268, 44, 0, 0, 0, 0, 15, 0,
0, 0, 24, 0, 0, 0, 0, 66, 0, 2, 0, 0, 162, 0, 0, 155, 0, 53, 40, 0, 19, 19, 0,
1, 0, 2, 18, 0, 0, 0, 0, 0, 0, 0, 0, 58, 0, 0, 0, 0, 42, 0, 0, 0, 0, 137, 0,
0, 88, 0, 0, 0, 0, 10, 0, 0, 1, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0,
0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 9, 0, 0, 33, 75, 14, 10, 117, 6, 9, 0, 157, 11, 167, 130, 9,
0, 12, 25, 57, 76, 0, 0, 109, 0, 16, 34, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7, 0, 1,
0, 27, 0, 22, 2, 0, 82, 0, 9, 129, 0, 0, 0, 104, 3, 0, 0, 0, 15, 13, 56, 0, 0,
0, 0, 0, 42, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 2, 6, 0, 0, 0, 0, 149,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 3, 46, 0, 26, 1, 0, 0, 0,
0, 2, 10, 0, 0, 1, 11, 43, 0, 0, 0, 0, 5, 0, 47, 0, 0, 0, 0, 2, 0, 0, 0, 0, 44,
14, 69, 10, 17, 23, 4, 19, 7, 80, 55, 57, 0, 0, 11, 63, 100, 0, 0, 21, 1, 6,
2, 39, 0, 0, 0, 0, 12, 9,
};
// Callback handed to EnumSystemCodePages(). `name` is the code page
// identifier as a decimal string. If it parses to a non-zero id and
// get_mscp_num() returns a non-negative index for it, the matching
// ms_codepages entry is registered through add_codepage().
// Always returns TRUE.
BOOL CALLBACK EnumCodePagesProc(LPTSTR name) {
  const UINT cpid=_tcstoul(name,NULL,10);
  if (cpid!=0) {
    const int slot=get_mscp_num(cpid);
    if (slot>=0) {
      add_codepage(ms_codepages[slot].name,cpid,
                   ms_codepages[slot].alias1,ms_codepages[slot].alias2);
    }
  }
  return TRUE;
}
// qsort() comparator: orders two CodePage records by ascending
// `codepage` member. Returns -1, 0 or 1.
static int _cdecl enc_cmp(const void *v1,const void *v2) {
  const struct CodePage *lhs=static_cast<const struct CodePage *>(v1);
  const struct CodePage *rhs=static_cast<const struct CodePage *>(v2);
  if (lhs->codepage<rhs->codepage)
    return -1;
  if (lhs->codepage>rhs->codepage)
    return 1;
  return 0;
}
// File-scope object whose only member is a constructor; constructing the
// single static instance runs InitUnicode::InitUnicode(), which fills in
// the code page table.
static struct InitUnicode {
InitUnicode();
} InitUnicode;
// Builds the code page table:
//  1. registers the installed system code pages (via EnumCodePagesProc)
//     and sorts the table by code page id;
//  2. attaches the built-in translation tables to entries that are
//     already present, and remembers which built-in encodings are missing;
//  3. adds the missing built-in encodings (and UTF-8 if absent) with
//     their own length/convert routines, then re-sorts;
//  4. picks the default code page.
InitUnicode::InitUnicode() {
  // fetch system codepages
  EnumSystemCodePages((CODEPAGE_ENUMPROC)EnumCodePagesProc,CP_INSTALLED);
  // the table must be sorted before Unicode::GetIntCodePage() can search it
  qsort(codepages,curcp,sizeof(struct CodePage),enc_cmp);

  // and add our own: bit n of `missing` is set when built-in encoding n
  // was not found among the system code pages
  DWORD missing=0;
  int n;
  for (n=0;n<NUM_BUILTIN_ENCODINGS;++n) {
    const int slot=Unicode::GetIntCodePage(builtin_encodings[n].cp);
    if (slot>=0)
      codepages[slot].table=builtin_encodings[n].unimap;
    else
      missing|=1<<n;
  }

  // must be evaluated before anything is appended: appended entries leave
  // the table unsorted until the final qsort()
  const bool utf8_missing=Unicode::GetIntCodePage(CP_UTF8)<0;

  for (n=0;n<NUM_BUILTIN_ENCODINGS;++n) {
    if (!(missing&(1<<n)))
      continue;
    const int ms=get_mscp_num(builtin_encodings[n].cp);
    if (ms<0)
      continue;
    const int added=add_codepage(ms_codepages[ms].name,ms_codepages[ms].cp,
                                 ms_codepages[ms].alias1,ms_codepages[ms].alias2);
    codepages[added].length=TB_cp_length;
    codepages[added].convert=TB_cp_convert;
    codepages[added].table=builtin_encodings[n].unimap;
  }

  if (utf8_missing) {
    const int added=add_codepage(_T("UTF-8"),CP_UTF8);
    codepages[added].length=UTF_cp_length;
    codepages[added].convert=UTF_cp_convert;
  }

  // restore ordering if anything was appended
  if (missing || utf8_missing)
    qsort(codepages,curcp,sizeof(struct CodePage),enc_cmp);

  default_cp=Unicode::GetIntCodePage(1251); // XXX hardcoded
}
// Returns the result of the `length` routine of the table entry at index
// `codepage` for the given multibyte string, or 0 when the index is
// outside [0, curcp).
int Unicode::WCLength(int codepage,const char *mbstr,int mblen) {
  if (codepage<0 || codepage>=curcp)
    return 0;
  return codepages[codepage].length(&codepages[codepage],mbstr,mblen);
}
// Converts `mbstr` into `wcstr` using the `convert` routine of the table
// entry at index `codepage`. Does nothing when the index is outside
// [0, curcp).
void Unicode::ToWC(int codepage,const char *mbstr,int mblen,
                   wchar_t *wcstr,int wclen)
{
  if (codepage<0 || codepage>=curcp)
    return;
  codepages[codepage].convert(&codepages[codepage],mbstr,mblen,wcstr,wclen);
}
// Number of entries currently held in the code page table.
int Unicode::GetNumCodePages()
{
  const int count=curcp;
  return count;
}
// Name of the table entry at index `num`, or NULL when `num` is outside
// [0, curcp).
const TCHAR *Unicode::GetCodePageName(int num) {
  if (num<0 || num>=curcp)
    return NULL;
  return codepages[num].name;
}
// Maps a Windows code page id to its index in the code page table.
//
// Binary search over codepages[0 .. curcp-1], which must be sorted by the
// `codepage` member (the initializer sorts it with enc_cmp).
//
// Returns the index of the matching entry, or -1 if `mscp` is not present
// (including when the table is empty).
int Unicode::GetIntCodePage(UINT mscp) {
  // Inclusive bounds: the last valid entry is curcp-1. Starting the upper
  // bound at curcp would let the probe read one element past the end.
  int lo=0,hi=curcp-1;
  while (lo<=hi) {
    int mid=lo+((hi-lo)>>1);
    if (mscp<codepages[mid].codepage)
      hi=mid-1;
    else if (mscp>codepages[mid].codepage)
      lo=mid+1;
    else
      return mid;
  }
  return -1;
}
// Guesses the Windows code page id of a byte string.
//
//  mbs    - text bytes to examine
//  mblen  - number of bytes available; at most the first 1024 are examined
//
// Returns CP_UTF8 when the text starts with a UTF-8 byte order mark,
// CP_1252 when the text is shorter than 3 bytes or no built-in encoding
// scores at least 5, and otherwise the `cp` of the best-scoring built-in
// encoding.
//
// Scoring: for each built-in encoding the bytes are mapped through its
// `distmap` to letter indices (0 = not a letter), adjacent letter pairs
// are counted in a NUMLET x NUMLET histogram, and the histogram is
// weighted by russian_distrib.
static UINT detect_encoding(const unsigned char *mbs,unsigned mblen) {
  unsigned i,j;
  int enc=0;
  int sv,msv=0;
  int hist[NUMLET*NUMLET];
  unsigned int prev;
  unsigned char *lettermap;
  if (mblen<3) /* detection needs at least a few letters :) */
    return CP_1252;
  // utf8 bom. Compare against integer constants: with a signed plain char
  // the literals '\xef' etc. are negative and can never equal an
  // unsigned char value.
  if (mbs[0]==0xEF && mbs[1]==0xBB && mbs[2]==0xBF)
    return CP_UTF8;
  if (mblen>1024) /* don't waste too much time */
    mblen=1024;
  for (i=0;i<NUM_BUILTIN_ENCODINGS;++i) {
    memset(hist,0,sizeof(int)*NUMLET*NUMLET);
    lettermap=builtin_encodings[i].distmap;
    // count adjacent letter pairs; a non-letter (index 0) breaks the pair
    for (j=prev=0;j<mblen;++j) {
      unsigned int next=lettermap[mbs[j]];
      if (next && prev)
        ++hist[prev*NUMLET+next];
      prev=next;
    }
    // weight the pair counts by the reference distribution
    for (j=sv=0;j<NUMLET*NUMLET;++j)
      sv+=hist[j]*russian_distrib[j];
    if (sv>msv) {
      enc=i;
      msv=sv;
    }
  }
  if (msv<5) /* no cyrillic letters found */
    return CP_1252;
  return builtin_encodings[enc].cp;
}
int Unicode::DetectCodePage(const char *mbs,int mblen) {
UINT cp=detect_encoding((const unsigned char *)mbs,mblen);
int lcp=GetIntCodePage(cp);
return lcp<0 ? GetIntCodePage(CP_1252) : lcp; // 1252 should always be present
}
// Windows code page id stored in the table entry at index `cp`;
// 1251 when `cp` is outside [0, curcp).
UINT Unicode::GetMSCodePage(int cp) {
  if (cp<0 || cp>=curcp)
    return 1251; // XXX hardcoded
  return codepages[cp].codepage;
}
// Linear search of the table for an entry whose name, alias1 or alias2
// compares equal to `name` under CmpI(). Returns the index of the first
// match, or -1 when nothing matches.
int Unicode::FindCodePage(const TCHAR *name) {
  for (int idx=0;idx<curcp;++idx) {
    if (!CmpI(name,codepages[idx].name))
      return idx;
    // aliases are optional, so skip the ones that are not set
    if (codepages[idx].alias1 && !CmpI(name,codepages[idx].alias1))
      return idx;
    if (codepages[idx].alias2 && !CmpI(name,codepages[idx].alias2))
      return idx;
  }
  return -1;
}
// Table index of the default code page, as chosen by the initializer.
int Unicode::DefaultCodePage()
{
  const int idx=default_cp;
  return idx;
}
// Translation table attached to the table entry at index `cp`.
// Returns NULL when `cp` is outside [0, curcp) or the entry has no table.
const wchar_t *Unicode::GetTable(int cp) {
  if (cp<0 || cp>=curcp)
    return NULL;
  if (!codepages[cp].table)
    return NULL;
  return codepages[cp].table;
}
// Asks WideCharToMultiByte() (CP_ACP, no output buffer) how many bytes
// the wide string needs, and returns that value.
int Unicode::MBLength(const wchar_t *wcstr,int wclen) {
  const int needed=WideCharToMultiByte(CP_ACP,0,wcstr,wclen,NULL,0,NULL,NULL);
  return needed;
}
// Converts the wide string into `mbstr` (capacity `mblen` bytes) with
// WideCharToMultiByte() using CP_ACP. The API's return value is ignored.
void Unicode::ToMB(const wchar_t *wcstr,int wclen,char *mbstr,int mblen)
{
  WideCharToMultiByte(CP_ACP,0,
                      wcstr,wclen,
                      mbstr,mblen,
                      NULL,NULL);
}
// Converts a multibyte string to wide characters using the table entry at
// index `codepage`: sizes the result with WCLength(), then fills it with
// ToWC().
Buffer<wchar_t> Unicode::ToWCbuf(int codepage,const char *mbstr,int mblen) {
  const int wide_len=WCLength(codepage,mbstr,mblen);
  Buffer<wchar_t> wide(wide_len);
  ToWC(codepage,mbstr,mblen,wide,wide_len);
  return wide;
}
// Converts a wide string to multibyte: sizes the result with MBLength(),
// then fills it with ToMB().
Buffer<char> Unicode::ToMBbuf(const wchar_t *wcstr,int wclen) {
  const int byte_len=MBLength(wcstr,wclen);
  Buffer<char> bytes(byte_len);
  ToMB(wcstr,wclen,bytes,byte_len);
  return bytes;
}
// Builds a CString from `wclen` wide characters starting at `wcstr`.
CString Unicode::ToCS(const wchar_t *wcstr,int wclen)
{
  return CString(wcstr,
                 wclen);
}
// Builds a wide-character Buffer from the CString's characters, sized to
// GetLength() (no terminating nul included).
Buffer<wchar_t> Unicode::ToWCbuf(const CString& str)
{
  return Buffer<wchar_t>(str,
                         str.GetLength());
}
// Same as ToWCbuf(const CString&) but one element longer: a CString is
// implicitly nul terminated, so extending the size by one picks up the
// terminator as well.
Buffer<wchar_t> Unicode::ToWCbufZ(const CString& str)
{
  return Buffer<wchar_t>(str,
                         str.GetLength()+1);
}
// Wide-character flavour of GetCodePageName(); simply forwards to it.
const wchar_t *Unicode::GetCodePageNameW(int num)
{
  const wchar_t *wide_name=GetCodePageName(num);
  return wide_name;
}
// Lower-cases `str` with LCMapString(LOCALE_USER_DEFAULT, LCMAP_LOWERCASE).
// The first call, made without an output buffer, yields the size used for
// the result; the second call performs the mapping.
Buffer<wchar_t> Unicode::Lower(const Buffer<wchar_t>& str) {
  const int needed=LCMapString(LOCALE_USER_DEFAULT,LCMAP_LOWERCASE,
                               str,str.size(),NULL,0);
  Buffer<wchar_t> lowered(needed);
  LCMapString(LOCALE_USER_DEFAULT,LCMAP_LOWERCASE,
              str,str.size(),lowered,lowered.size());
  return lowered;
}
// Produces a case-insensitive sort key for `str` under locale `lcid` via
// LCMapString(LCMAP_SORTKEY|NORM_IGNORECASE). The first call, made without
// an output buffer, yields the buffer size; the second call writes the key.
// The returned buffer is one byte shorter than that size so that the
// terminating 0 is not part of the key.
Buffer<char> Unicode::SortKey(LCID lcid,const wchar_t *str,int len) {
  const DWORD flags=LCMAP_SORTKEY|NORM_IGNORECASE;
  const int key_len=LCMapString(lcid,flags,str,len,NULL,0);
  Buffer<char> key(key_len);
  // the API is declared with a wide-char output pointer even though a
  // sort key is a byte string
  LCMapString(lcid,flags,str,len,(wchar_t*)(char*)key,key_len);
  if (key_len>0) {
    // don't include terminating 0
    key.setsize(key_len-1);
  }
  return key;
}
Buffer<char> Unicode::ToUtf8(const CString& cs) {
// determine length
int utflen;
const wchar_t *cp=cs;
int i;
int max=cs.GetLength();
for (i=utflen=0;i<max;++i) {
DWORD c;
if (cp[i]>=0xd800 && cp[i]<=0xdbff && i<max-1 && cp[i+1]>=0xdc00 && cp[i+1]<=0xdfff) {
c=((DWORD)(cp[i]-0xd800)<<10) + (cp[i+1]-0xdc00) + 0x10000;
++i;
} else
c=cp[i];
if (c<128)
++utflen;
else if (c<2048)
utflen+=2;
else if (c<65536)
utflen+=3;
else
utflen+=4;
}
Buffer<char> ret(utflen);
char *dp=ret;
for (i=0;i<max;++i) {
DWORD c;
if (cp[i]>=0xd800 && cp[i]<=0xdbff && i<max-1 && cp[i+1]>=0xdc00 && cp[i+1]<=0xdfff) {
c=((DWORD)(cp[i]-0xd800)<<10) + (cp[i+1]-0xdc00) + 0x10000;
++i;
} else
c=cp[i];
if (c<128)
*dp++=(char)c;
else if (c<2048) {
*dp++=(char)(0xc0 | (c>>6));
*dp++=(char)(0x80 | (c&0x3f));
} else if (c<65536) {
*dp++=(char)(0xe0 | (c>>12));
*dp++=(char)(0x80 | ((c>>6)&0x3f));
*dp++=(char)(0x80 | (c&0x3f));
} else {
*dp++=(char)(0xf0 | ((c>>18) & 0x07));
*dp++=(char)(0x80 | ((c>>12) & 0x3f));
*dp++=(char)(0x80 | ((c>>6) & 0x3f));
*dp++=(char)(0x80 | (c&0x3f));
}
}
return ret;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -