unicode.cpp

来自「俄罗斯人开发的大名鼎鼎的Pocket Pc 阅读器haaliread的源代码,v」· C++ 代码 · 共 880 行 · 第 1/3 页

CPP
880
字号
321, 0, 63, 36, 6, 90, 1026, 15, 248, 119, 3, 97, 0, 0, 1, 5, 1, 2, 0, 173,
553, 0, 1, 70, 0, 15, 51, 81, 80, 144, 13, 102, 32, 0, 9, 65, 227, 82, 26, 0,
65, 36, 117, 146, 0, 0, 39, 0, 77, 50, 24, 0, 0, 0, 1, 105, 6, 0, 50, 0, 0, 0,
0, 20, 0, 0, 29, 0, 0, 1, 0, 0, 27, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
0, 0, 0, 0, 28, 0, 20, 0, 0, 4, 0, 0, 16, 0, 0, 11, 11, 38, 212, 0, 15, 7, 1,
7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0, 7, 0, 0, 59, 0, 0, 24, 0, 0,
0, 0, 0, 27, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 185, 0, 5,
0, 0, 226, 0, 0, 162, 0, 19, 0, 0, 94, 8, 0, 1, 0, 268, 44, 0, 0, 0, 0, 15, 0,
0, 0, 24, 0, 0, 0, 0, 66, 0, 2, 0, 0, 162, 0, 0, 155, 0, 53, 40, 0, 19, 19, 0,
1, 0, 2, 18, 0, 0, 0, 0, 0, 0, 0, 0, 58, 0, 0, 0, 0, 42, 0, 0, 0, 0, 137, 0,
0, 88, 0, 0, 0, 0, 10, 0, 0, 1, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0,
0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 9, 0, 0, 33, 75, 14, 10, 117, 6, 9, 0, 157, 11, 167, 130, 9,
0, 12, 25, 57, 76, 0, 0, 109, 0, 16, 34, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7, 0, 1,
0, 27, 0, 22, 2, 0, 82, 0, 9, 129, 0, 0, 0, 104, 3, 0, 0, 0, 15, 13, 56, 0, 0,
0, 0, 0, 42, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 2, 6, 0, 0, 0, 0, 149,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 3, 46, 0, 26, 1, 0, 0, 0,
0, 2, 10, 0, 0, 1, 11, 43, 0, 0, 0, 0, 5, 0, 47, 0, 0, 0, 0, 2, 0, 0, 0, 0, 44,
14, 69, 10, 17, 23, 4, 19, 7, 80, 55, 57, 0, 0, 11, 63, 100, 0, 0, 21, 1, 6,
2, 39, 0, 0, 0, 0, 12, 9,
};

// Callback handed to EnumSystemCodePages: receives one codepage identifier
// as a decimal string.  If the number parses to a non-zero value and
// get_mscp_num() knows it, the codepage is registered through add_codepage()
// using the name and aliases stored in ms_codepages.
// Always returns TRUE so that the enumeration keeps going.
BOOL CALLBACK EnumCodePagesProc(LPTSTR name) {
  const UINT cpid=_tcstoul(name,NULL,10);
  if (cpid==0)			// unparsable or zero: nothing to register
    return TRUE;

  const int idx=get_mscp_num(cpid);
  if (idx<0)			// not present in the ms_codepages table
    return TRUE;

  add_codepage(ms_codepages[idx].name,cpid,
	       ms_codepages[idx].alias1,ms_codepages[idx].alias2);
  return TRUE;
}

// qsort() comparator for struct CodePage elements: orders entries by their
// `codepage` member in ascending order.
// Returns -1, 0 or 1 for less-than, equal and greater-than respectively.
static int    _cdecl enc_cmp(const void *v1,const void *v2) {
  const struct CodePage *lhs=static_cast<const struct CodePage *>(v1);
  const struct CodePage *rhs=static_cast<const struct CodePage *>(v2);

  if (lhs->codepage<rhs->codepage)
    return -1;
  if (lhs->codepage>rhs->codepage)
    return 1;
  return 0;
}

// Initializer object: the type has no data, only a constructor (defined
// below) that fills and sorts the codepage table.  Declaring one static
// instance makes that constructor run as part of this file's static
// initialization.  Note that the instance deliberately reuses the type's
// name, so after this declaration the plain identifier refers to the object.
static struct InitUnicode {
  InitUnicode();
} InitUnicode;

// Builds the global codepage table:
//   1. registers every installed system codepage and sorts the table,
//   2. attaches the built-in conversion tables to entries that already exist
//      and remembers (in a bitmask) which built-in encodings are missing,
//   3. appends the missing built-in encodings and, if needed, UTF-8,
//   4. re-sorts the table when anything was appended,
//   5. resolves the default codepage.
// The lookups in step 2 must happen before step 3 appends entries, because
// Unicode::GetIntCodePage relies on the table being sorted.
InitUnicode::InitUnicode() {
  // 1. system codepages, ordered by Windows codepage number
  EnumSystemCodePages((CODEPAGE_ENUMPROC)EnumCodePagesProc,CP_INSTALLED);
  qsort(codepages,curcp,sizeof(struct CodePage),enc_cmp);

  // 2. match built-in encodings against what the system provided
  DWORD	  missing=0;		// bit n set => builtin_encodings[n] not in table
  int	  n;
  for (n=0;n<NUM_BUILTIN_ENCODINGS;++n) {
    const int slot=Unicode::GetIntCodePage(builtin_encodings[n].cp);
    if (slot>=0)
      codepages[slot].table=builtin_encodings[n].unimap;
    else
      missing|=1<<n;
  }
  const bool utf8_absent=Unicode::GetIntCodePage(CP_UTF8)<0;

  // 3. append the built-in encodings the system did not supply; these use
  //    the table-based length/convert routines
  for (n=0;n<NUM_BUILTIN_ENCODINGS;++n) {
    if (!(missing&(1<<n)))
      continue;
    const int msidx=get_mscp_num(builtin_encodings[n].cp);
    if (msidx<0)
      continue;
    const int added=add_codepage(ms_codepages[msidx].name,ms_codepages[msidx].cp,
				 ms_codepages[msidx].alias1,ms_codepages[msidx].alias2);
    codepages[added].length=TB_cp_length;
    codepages[added].convert=TB_cp_convert;
    codepages[added].table=builtin_encodings[n].unimap;
  }
  if (utf8_absent) {
    const int added=add_codepage(_T("UTF-8"),CP_UTF8);
    codepages[added].length=UTF_cp_length;
    codepages[added].convert=UTF_cp_convert;
  }

  // 4. restore sorted order if the table grew
  if (missing || utf8_absent)
    qsort(codepages,curcp,sizeof(struct CodePage),enc_cmp);

  // 5. default codepage
  // NOTE(review): this is -1 if codepage 1251 ended up absent -- confirm
  // that 1251 is always among the built-in encodings.
  default_cp=Unicode::GetIntCodePage(1251); // XXX hardcoded
}

// Asks the given codepage's `length` routine for the size of the wide-char
// conversion of mbstr[0..mblen).
//   codepage - internal index into the codepage table (not a Windows cp number)
// Returns whatever the routine reports, or 0 when the index is out of range.
int   Unicode::WCLength(int codepage,const char *mbstr,int mblen) {
  if (codepage<0 || codepage>=curcp)
    return 0;
  return codepages[codepage].length(codepages+codepage,mbstr,mblen);
}

// Converts mbstr[0..mblen) into wcstr (capacity wclen) by delegating to the
// `convert` routine of the codepage at internal index `codepage`.
// Does nothing when the index is out of range.
void  Unicode::ToWC(int codepage,const char *mbstr,int mblen,
		    wchar_t *wcstr,int wclen)
{
  if (codepage<0 || codepage>=curcp)
    return;
  codepages[codepage].convert(codepages+codepage,mbstr,mblen,wcstr,wclen);
}

// Returns the number of entries currently in the codepage table; valid
// internal codepage indices are 0 .. result-1.
int   Unicode::GetNumCodePages()
{
  return curcp;
}

// Returns the name stored for the codepage at internal index `num`,
// or NULL when the index is out of range.
const TCHAR  *Unicode::GetCodePageName(int num) {
  if (num<0 || num>=curcp)
    return NULL;
  return codepages[num].name;
}

// Maps a Windows codepage number to its internal index in the codepage
// table using binary search.  The table must be sorted by `codepage`
// (see enc_cmp / the qsort calls in the initializer).
//   mscp - Windows codepage number to look up
// Returns the index of the matching entry, or -1 if there is none
// (including when the table is empty).
int   Unicode::GetIntCodePage(UINT mscp) {
  // Search the closed interval [lo, hi].  The upper bound must be the last
  // valid index (curcp-1); starting at curcp would let the probe read one
  // element past the populated part of the table.
  int	lo=0,hi=curcp-1;

  while (lo<=hi) {
    int	  mid=lo+((hi-lo)>>1);	// midpoint without lo+hi overflow
    if (mscp<codepages[mid].codepage)
      hi=mid-1;
    else if (mscp>codepages[mid].codepage)
      lo=mid+1;
    else
      return mid;
  }
  return -1;
}

// Guesses the Windows codepage of a byte buffer.
//   mbs   - bytes to examine
//   mblen - number of bytes available; at most the first 1024 are examined
// Returns:
//   CP_1252 when the buffer is shorter than 3 bytes or no candidate scores
//           at least 5,
//   CP_UTF8 when the buffer starts with the UTF-8 byte order mark,
//   otherwise the `cp` of the best-scoring entry of builtin_encodings.
// Scoring: for each built-in encoding, bytes are mapped through its
// `distmap` to letter classes (0 = not a letter); adjacent pairs of
// non-zero classes are counted in a NUMLET x NUMLET histogram, which is
// then weighted by russian_distrib and summed.
static UINT   detect_encoding(const unsigned char *mbs,unsigned mblen) {
  unsigned	i,j;
  int		enc=0;
  int		sv,msv=0;
  int		hist[NUMLET*NUMLET];
  unsigned int	prev;
  unsigned char	*lettermap;

  if (mblen<3) /* detection needs at least a few letters :) */
    return CP_1252;
  // utf8 bom.  Compare with integer constants: mbs[] is unsigned char, and a
  // literal such as '\xef' is a plain char that becomes negative where char
  // is signed, so the comparison could never succeed after promotion to int.
  if (mbs[0]==0xef && mbs[1]==0xbb && mbs[2]==0xbf)
    return CP_UTF8;
  if (mblen>1024) /* don't waste too much time */
    mblen=1024;
  for (i=0;i<NUM_BUILTIN_ENCODINGS;++i) {
    memset(hist,0,sizeof(hist));
    lettermap=builtin_encodings[i].distmap;
    // count adjacent letter pairs under this encoding's byte->letter mapping
    for (j=prev=0;j<mblen;++j) {
      unsigned int next=lettermap[mbs[j]];
      if (next && prev)
        ++hist[prev*NUMLET+next];
      prev=next;
    }
    // weight the pair counts by the reference distribution
    for (j=sv=0;j<NUMLET*NUMLET;++j)
      sv+=hist[j]*russian_distrib[j];
    if (sv>msv) {	// strictly greater: ties keep the earlier encoding
      enc=i;
      msv=sv;
    }
  }
  if (msv<5) /* no cyrillic letters found */
    return CP_1252;
  return builtin_encodings[enc].cp;
}

int   Unicode::DetectCodePage(const char *mbs,int mblen) {
  UINT	  cp=detect_encoding((const unsigned char *)mbs,mblen);
  int	  lcp=GetIntCodePage(cp);

  return lcp<0 ? GetIntCodePage(CP_1252) : lcp; // 1252 should always be present
}

// Inverse of GetIntCodePage: returns the Windows codepage number stored for
// internal index `cp`.  Out-of-range indices yield 1251.
UINT  Unicode::GetMSCodePage(int cp) {
  if (cp<0 || cp>=curcp)
    return 1251; // XXX hardcoded
  return codepages[cp].codepage;
}

// Linear search of the codepage table by name.  An entry matches when CmpI()
// reports equality (returns zero) between `name` and the entry's primary
// name, or either of its aliases when that alias is set.
// Returns the internal index of the first match, or -1 if nothing matches.
int   Unicode::FindCodePage(const TCHAR *name) {
  for (int idx=0;idx<curcp;++idx) {
    if (!CmpI(name,codepages[idx].name))
      return idx;
    if (codepages[idx].alias1 && !CmpI(name,codepages[idx].alias1))
      return idx;
    if (codepages[idx].alias2 && !CmpI(name,codepages[idx].alias2))
      return idx;
  }
  return -1;
}

// Returns the internal index of the default codepage, as stored in
// default_cp by the static initializer.
int   Unicode::DefaultCodePage()
{
  return default_cp;
}

// Returns the conversion table attached to the codepage at internal index
// `cp`, or NULL when the index is out of range or the entry has no table.
const wchar_t *Unicode::GetTable(int cp) {
  if (cp<0 || cp>=curcp)
    return NULL;
  if (!codepages[cp].table)
    return NULL;
  return codepages[cp].table;
}

// Returns the value WideCharToMultiByte reports for converting
// wcstr[0..wclen) with CP_ACP when called with no output buffer (NULL, 0),
// i.e. a size query rather than an actual conversion.
int	Unicode::MBLength(const wchar_t *wcstr,int wclen) {
  const int needed=WideCharToMultiByte(CP_ACP,0,wcstr,wclen,NULL,0,NULL,NULL);
  return needed;
}

// Converts wcstr[0..wclen) to multibyte text with CP_ACP, writing into
// mbstr (capacity mblen).  The API's return value is not checked.
void	Unicode::ToMB(const wchar_t *wcstr,int wclen,char *mbstr,int mblen)
{
  (void)WideCharToMultiByte(CP_ACP,0,wcstr,wclen,mbstr,mblen,NULL,NULL);
}

// Convenience wrapper: sizes a wide-char buffer with WCLength(), fills it
// with ToWC() and returns it.
//   codepage - internal codepage index
Buffer<wchar_t>	  Unicode::ToWCbuf(int codepage,const char *mbstr,int mblen) {
  const int	  wcount=WCLength(codepage,mbstr,mblen);
  Buffer<wchar_t> wide(wcount);

  ToWC(codepage,mbstr,mblen,wide,wcount);
  return wide;
}

// Convenience wrapper: sizes a byte buffer with MBLength(), fills it with
// ToMB() and returns it.
Buffer<char>	  Unicode::ToMBbuf(const wchar_t *wcstr,int wclen) {
  const int	bcount=MBLength(wcstr,wclen);
  Buffer<char>	bytes(bcount);

  ToMB(wcstr,wclen,bytes,bcount);
  return bytes;
}

// Builds a CString from the first wclen characters of wcstr.
CString		  Unicode::ToCS(const wchar_t *wcstr,int wclen) {
  CString text(wcstr,wclen);
  return text;
}

// Copies the characters of `str` into a Buffer sized to GetLength(), i.e.
// without the terminating nul.
Buffer<wchar_t>	  Unicode::ToWCbuf(const CString& str) {
  const int nchars=str.GetLength();
  return Buffer<wchar_t>(str,nchars);
}

// Same as ToWCbuf(const CString&) but the buffer also holds the terminating
// nul: a CString's storage is implicitly nul terminated, so asking for one
// extra character is enough.
Buffer<wchar_t>	  Unicode::ToWCbufZ(const CString& str) {
  const int nchars_with_nul=str.GetLength()+1;
  return Buffer<wchar_t>(str,nchars_with_nul);
}

// Wide-character flavour of GetCodePageName(): simply forwards to it.
// NOTE(review): returning its const TCHAR* as const wchar_t* presumably
// relies on a Unicode build where TCHAR is wchar_t -- confirm.
const wchar_t	  *Unicode::GetCodePageNameW(int num)
{
  return GetCodePageName(num);
}

// Returns a lower-cased copy of `str` produced by LCMapString with
// LCMAP_LOWERCASE under LOCALE_USER_DEFAULT.  The first call (no output
// buffer) obtains the required size, the second one does the mapping.
Buffer<wchar_t> Unicode::Lower(const Buffer<wchar_t>& str) {
  const int	    needed=LCMapString(LOCALE_USER_DEFAULT,LCMAP_LOWERCASE,
				       str,str.size(),NULL,0);
  Buffer<wchar_t>   lowered(needed);

  LCMapString(LOCALE_USER_DEFAULT,LCMAP_LOWERCASE,
	      str,str.size(),lowered,lowered.size());
  return lowered;
}

// Produces a case-insensitive sort key for str[0..len) in locale `lcid`
// via LCMapString(LCMAP_SORTKEY|NORM_IGNORECASE).  The key is returned as a
// byte buffer; when a non-empty result was reported the buffer is shrunk by
// one so the terminating 0 is not part of it.
Buffer<char>   Unicode::SortKey(LCID lcid,const wchar_t *str,int len) {
  const DWORD	flags=LCMAP_SORTKEY|NORM_IGNORECASE;
  const int	needed=LCMapString(lcid,flags,str,len,NULL,0);	// size query
  Buffer<char>	key(needed);

  // the API is declared with a wide-char destination, hence the cast
  LCMapString(lcid,flags,str,len,(wchar_t*)(char*)key,needed);
  if (needed>0) // don't include terminating 0
    key.setsize(needed-1);
  return key;
}

Buffer<char>  Unicode::ToUtf8(const CString& cs) {
  // determine length
  int		utflen;
  const wchar_t	*cp=cs;
  int		i;
  int		max=cs.GetLength();

  for (i=utflen=0;i<max;++i) {
    DWORD c;
    if (cp[i]>=0xd800 && cp[i]<=0xdbff && i<max-1 && cp[i+1]>=0xdc00 && cp[i+1]<=0xdfff) {
      c=((DWORD)(cp[i]-0xd800)<<10) + (cp[i+1]-0xdc00) + 0x10000;
      ++i;
    } else
      c=cp[i];

    if (c<128)
      ++utflen;
    else if (c<2048)
      utflen+=2;
    else if (c<65536)
      utflen+=3;
    else
      utflen+=4;
  }

  Buffer<char>	ret(utflen);
  char		*dp=ret;

  for (i=0;i<max;++i) {
    DWORD c;
    if (cp[i]>=0xd800 && cp[i]<=0xdbff && i<max-1 && cp[i+1]>=0xdc00 && cp[i+1]<=0xdfff) {
      c=((DWORD)(cp[i]-0xd800)<<10) + (cp[i+1]-0xdc00) + 0x10000;
      ++i;
    } else
      c=cp[i];
    if (c<128)
      *dp++=(char)c;
    else if (c<2048) {
      *dp++=(char)(0xc0 | (c>>6));
      *dp++=(char)(0x80 | (c&0x3f));
    } else if (c<65536) {
      *dp++=(char)(0xe0 | (c>>12));
      *dp++=(char)(0x80 | ((c>>6)&0x3f));
      *dp++=(char)(0x80 | (c&0x3f));
    } else {
      *dp++=(char)(0xf0 | ((c>>18) & 0x07));
      *dp++=(char)(0x80 | ((c>>12) & 0x3f));
      *dp++=(char)(0x80 | ((c>>6) & 0x3f));
      *dp++=(char)(0x80 | (c&0x3f));
    }
  }

  return ret;
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?