📄 unicode.cpp

📁 俄罗斯人开发的大名鼎鼎的Pocket Pc 阅读器haaliread的源代码,visual c
💻 CPP
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
321, 0, 63, 36, 6, 90, 1026, 15, 248, 119, 3, 97, 0, 0, 1, 5, 1, 2, 0, 173,
553, 0, 1, 70, 0, 15, 51, 81, 80, 144, 13, 102, 32, 0, 9, 65, 227, 82, 26, 0,
65, 36, 117, 146, 0, 0, 39, 0, 77, 50, 24, 0, 0, 0, 1, 105, 6, 0, 50, 0, 0, 0,
0, 20, 0, 0, 29, 0, 0, 1, 0, 0, 27, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
0, 0, 0, 0, 28, 0, 20, 0, 0, 4, 0, 0, 16, 0, 0, 11, 11, 38, 212, 0, 15, 7, 1,
7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0, 7, 0, 0, 59, 0, 0, 24, 0, 0,
0, 0, 0, 27, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 185, 0, 5,
0, 0, 226, 0, 0, 162, 0, 19, 0, 0, 94, 8, 0, 1, 0, 268, 44, 0, 0, 0, 0, 15, 0,
0, 0, 24, 0, 0, 0, 0, 66, 0, 2, 0, 0, 162, 0, 0, 155, 0, 53, 40, 0, 19, 19, 0,
1, 0, 2, 18, 0, 0, 0, 0, 0, 0, 0, 0, 58, 0, 0, 0, 0, 42, 0, 0, 0, 0, 137, 0,
0, 88, 0, 0, 0, 0, 10, 0, 0, 1, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0,
0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 9, 0, 0, 33, 75, 14, 10, 117, 6, 9, 0, 157, 11, 167, 130, 9,
0, 12, 25, 57, 76, 0, 0, 109, 0, 16, 34, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7, 0, 1,
0, 27, 0, 22, 2, 0, 82, 0, 9, 129, 0, 0, 0, 104, 3, 0, 0, 0, 15, 13, 56, 0, 0,
0, 0, 0, 42, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 2, 6, 0, 0, 0, 0, 149,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 3, 46, 0, 26, 1, 0, 0, 0,
0, 2, 10, 0, 0, 1, 11, 43, 0, 0, 0, 0, 5, 0, 47, 0, 0, 0, 0, 2, 0, 0, 0, 0, 44,
14, 69, 10, 17, 23, 4, 19, 7, 80, 55, 57, 0, 0, 11, 63, 100, 0, 0, 21, 1, 6,
2, 39, 0, 0, 0, 0, 12, 9,
};

// Callback handed to EnumSystemCodePages(): receives a code page identifier
// as a decimal string and registers it through add_codepage() when
// get_mscp_num() reports an entry for it in ms_codepages.
// Always returns TRUE.
BOOL CALLBACK EnumCodePagesProc(LPTSTR name) {
  const UINT	cpid=_tcstoul(name,NULL,10);
  if (cpid==0)	// nothing usable was parsed, do not even look it up
    return TRUE;

  const int	idx=get_mscp_num(cpid);
  if (idx>=0)
    add_codepage(ms_codepages[idx].name,cpid,ms_codepages[idx].alias1,
	ms_codepages[idx].alias2);
  return TRUE;
}

// qsort() comparator: orders CodePage entries by ascending codepage value.
static int    _cdecl enc_cmp(const void *v1,const void *v2) {
  const struct CodePage	*lhs=static_cast<const struct CodePage *>(v1);
  const struct CodePage	*rhs=static_cast<const struct CodePage *>(v2);

  if (lhs->codepage<rhs->codepage)
    return -1;
  if (lhs->codepage>rhs->codepage)
    return 1;
  return 0;
}

// Helper type with a single file-scope instance; the only member is the
// constructor, so constructing the static object below is what runs the
// setup code. Note that the object deliberately shares its name with the type.
static struct InitUnicode {
  InitUnicode();
} InitUnicode;

// Fills the codepages table: first with the code pages enumerated from the
// system, then with the built-in encodings and UTF-8 where a lookup shows
// they are not in the table yet. Finally resolves default_cp.
InitUnicode::InitUnicode() {
  // fetch system codepages and sort them by codepage value (enc_cmp)
  EnumSystemCodePages((CODEPAGE_ENUMPROC)EnumCodePagesProc,CP_INSTALLED);
  qsort(codepages,curcp,sizeof(struct CodePage),enc_cmp);

  // give our conversion tables to the built-in encodings that are already
  // present and remember, one bit per encoding, the ones that are not
  DWORD	  missing=0;
  int	  n;
  for (n=0;n<NUM_BUILTIN_ENCODINGS;++n) {
    const int found=Unicode::GetIntCodePage(builtin_encodings[n].cp);
    if (found>=0)
      codepages[found].table=builtin_encodings[n].unimap;
    else
      missing|=1<<n;
  }

  // looked up here, before any of the entries below are added
  const bool utf8_missing=Unicode::GetIntCodePage(CP_UTF8)<0;

  // add the missing built-in encodings with the table-based handlers
  for (n=0;n<NUM_BUILTIN_ENCODINGS;++n) {
    if (!(missing&(1<<n)))
      continue;
    const int ms=get_mscp_num(builtin_encodings[n].cp);
    if (ms<0)
      continue;
    const int slot=add_codepage(ms_codepages[ms].name,ms_codepages[ms].cp,
				ms_codepages[ms].alias1,ms_codepages[ms].alias2);
    codepages[slot].length=TB_cp_length;
    codepages[slot].convert=TB_cp_convert;
    codepages[slot].table=builtin_encodings[n].unimap;
  }

  if (utf8_missing) {
    const int slot=add_codepage(_T("UTF-8"),CP_UTF8);
    codepages[slot].length=UTF_cp_length;
    codepages[slot].convert=UTF_cp_convert;
  }

  // sort again if any built-in encoding was flagged missing or UTF-8 was added
  if (missing || utf8_missing)
    qsort(codepages,curcp,sizeof(struct CodePage),enc_cmp);

  default_cp=Unicode::GetIntCodePage(1251); // XXX hardcoded
}

// Forwards to the length handler of table entry `codepage` and returns its
// result; returns 0 when `codepage` is not a valid index into the table.
int   Unicode::WCLength(int codepage,const char *mbstr,int mblen) {
  if (codepage<0 || codepage>=curcp)
    return 0;
  return codepages[codepage].length(&codepages[codepage],mbstr,mblen);
}

// Forwards to the convert handler of table entry `codepage`, passing the
// source (mbstr, mblen) and destination (wcstr, wclen) through unchanged.
// Does nothing when `codepage` is not a valid index into the table.
void  Unicode::ToWC(int codepage,const char *mbstr,int mblen,
		    wchar_t *wcstr,int wclen)
{
  if (codepage<0 || codepage>=curcp)
    return;
  codepages[codepage].convert(&codepages[codepage],mbstr,mblen,wcstr,wclen);
}

// Returns the number of entries currently in the code page table.
int   Unicode::GetNumCodePages() {
  const int count=curcp;
  return count;
}

// Returns the name stored in table entry `num`, or NULL when `num` is not
// a valid index into the table.
const TCHAR  *Unicode::GetCodePageName(int num) {
  if (num<0 || num>=curcp)
    return NULL;
  return codepages[num].name;
}

// Maps a code page identifier to its index in the codepages table using a
// binary search, so the table must be sorted by codepage value (see enc_cmp).
// Returns the index, or -1 when the identifier is not in the table.
int   Unicode::GetIntCodePage(UINT mscp) {
  // Both bounds are inclusive. The upper bound has to start at the last
  // valid entry: starting at curcp lets the search probe codepages[curcp],
  // which is past the end, and possibly return that index.
  int	lo=0,hi=curcp-1;

  while (lo<=hi) {
    int	  mid=lo+((hi-lo)>>1);
    if (mscp<codepages[mid].codepage)
      hi=mid-1;
    else if (mscp>codepages[mid].codepage)
      lo=mid+1;
    else
      return mid;
  }
  return -1;
}

// Guesses the encoding of a text sample and returns its code page identifier.
//   mbs   - sample bytes
//   mblen - number of bytes available; at most the first 1024 are examined
// Returns CP_UTF8 when the sample starts with a UTF-8 byte order mark,
// otherwise the built-in encoding whose letter-pair histogram scores best
// against russian_distrib, or CP_1252 when the sample is shorter than three
// bytes or the best score is below 5.
static UINT   detect_encoding(const unsigned char *mbs,unsigned mblen) {
  unsigned	i,j;
  int		enc=0;
  int		sv,msv=0;
  int		hist[NUMLET*NUMLET];
  unsigned int	prev;
  const unsigned char	*lettermap;

  if (mblen<3) /* detection needs at least a few letters :) */
    return CP_1252;
  // utf8 bom; compare against integer constants, because a char literal
  // such as '\xef' is negative where plain char is signed and then never
  // equals an unsigned char value
  if (mbs[0]==0xef && mbs[1]==0xbb && mbs[2]==0xbf)
    return CP_UTF8;
  if (mblen>1024) /* don't waste too much time */
    mblen=1024;
  for (i=0;i<NUM_BUILTIN_ENCODINGS;++i) {
    // count adjacent pairs of bytes that both map to a non-zero letter class
    memset(hist,0,sizeof(int)*NUMLET*NUMLET);
    lettermap=builtin_encodings[i].distmap;
    for (j=prev=0;j<mblen;++j) {
      unsigned int next=lettermap[mbs[j]];
      if (next && prev)
        ++hist[prev*NUMLET+next];
      prev=next;
    }
    // score: histogram weighted by the reference distribution
    for (j=sv=0;j<NUMLET*NUMLET;++j)
      sv+=hist[j]*russian_distrib[j];
    if (sv>msv) { // strictly greater: the first encoding wins a tie
      enc=i;
      msv=sv;
    }
  }
  if (msv<5) /* no cyrillic letters found */
    return CP_1252;
  return builtin_encodings[enc].cp;
}

int   Unicode::DetectCodePage(const char *mbs,int mblen) {
  UINT	  cp=detect_encoding((const unsigned char *)mbs,mblen);
  int	  lcp=GetIntCodePage(cp);

  return lcp<0 ? GetIntCodePage(CP_1252) : lcp; // 1252 should always be present
}

// Returns the code page identifier stored in table entry `cp`, or 1251
// when `cp` is not a valid index into the table.
UINT  Unicode::GetMSCodePage(int cp) {
  if (cp<0 || cp>=curcp)
    return 1251; // XXX hardcoded
  return codepages[cp].codepage;
}

// Linear search for a table entry whose name, alias1 or alias2 compares
// equal to `name` according to CmpI() (aliases are only compared when set).
// Returns the index of the first such entry, or -1 when there is none.
int   Unicode::FindCodePage(const TCHAR *name) {
  for (int idx=0;idx<curcp;++idx) {
    if (!CmpI(name,codepages[idx].name))
      return idx;
    if (codepages[idx].alias1 && !CmpI(name,codepages[idx].alias1))
      return idx;
    if (codepages[idx].alias2 && !CmpI(name,codepages[idx].alias2))
      return idx;
  }
  return -1;
}

// Returns the value of default_cp, which the InitUnicode constructor sets
// to the result of GetIntCodePage(1251).
int   Unicode::DefaultCodePage() {
  const int index=default_cp;
  return index;
}

// Returns the table pointer stored in entry `cp`, or NULL when `cp` is not
// a valid index or the entry has no table.
const wchar_t *Unicode::GetTable(int cp) {
  if (cp<0 || cp>=curcp)
    return NULL;
  // an entry without a table holds a null pointer, so the stored value is
  // already the right answer in both cases
  return codepages[cp].table;
}

// Returns what WideCharToMultiByte() reports for converting `wclen` wide
// characters at `wcstr` to CP_ACP when called with no output buffer.
int	Unicode::MBLength(const wchar_t *wcstr,int wclen) {
  const int needed=WideCharToMultiByte(CP_ACP,0,wcstr,wclen,NULL,0,NULL,NULL);
  return needed;
}

// Converts `wclen` wide characters at `wcstr` to CP_ACP into the buffer
// `mbstr` of `mblen` bytes. The API's return value is discarded.
void	Unicode::ToMB(const wchar_t *wcstr,int wclen,char *mbstr,int mblen) {
  (void)WideCharToMultiByte(CP_ACP,0,wcstr,wclen,mbstr,mblen,NULL,NULL);
}

// Converts `mblen` bytes at `mbstr` from table entry `codepage` into a
// newly created buffer sized by WCLength() and returns that buffer.
Buffer<wchar_t>	  Unicode::ToWCbuf(int codepage,const char *mbstr,int mblen) {
  const int	  wclen=WCLength(codepage,mbstr,mblen);
  Buffer<wchar_t> wide(wclen);

  ToWC(codepage,mbstr,mblen,wide,wclen);
  return wide;
}

// Converts `wclen` wide characters at `wcstr` with ToMB() into a newly
// created buffer sized by MBLength() and returns that buffer.
Buffer<char>	  Unicode::ToMBbuf(const wchar_t *wcstr,int wclen) {
  const int	mblen=MBLength(wcstr,wclen);
  Buffer<char>	narrow(mblen);

  ToMB(wcstr,wclen,narrow,mblen);
  return narrow;
}

// Builds a CString from the `wclen` wide characters at `wcstr`.
CString		  Unicode::ToCS(const wchar_t *wcstr,int wclen) {
  CString text(wcstr,wclen);
  return text;
}

// Builds a wide character buffer from the string, constructed with a length
// of str.GetLength().
Buffer<wchar_t>	  Unicode::ToWCbuf(const CString& str) {
  const int count=str.GetLength();
  return Buffer<wchar_t>(str,count);
}

// Same as ToWCbuf(const CString&), but the buffer length is one element
// larger: a CString is implicitly nul terminated, so raising the size by
// one also takes in the terminator.
Buffer<wchar_t>	  Unicode::ToWCbufZ(const CString& str) {
  const int count=str.GetLength()+1;
  return Buffer<wchar_t>(str,count);
}

// Wide character variant of GetCodePageName(): returns its result as is,
// which means NULL when `num` is not a valid index.
const wchar_t	  *Unicode::GetCodePageNameW(int num) {
  const wchar_t *name=GetCodePageName(num);
  return name;
}

// Returns a lower-cased copy of `str`, mapped by LCMapString() with
// LCMAP_LOWERCASE for LOCALE_USER_DEFAULT.
Buffer<wchar_t> Unicode::Lower(const Buffer<wchar_t>& str) {
  // first call: no output buffer, only ask for the required size
  const int	    needed=LCMapString(LOCALE_USER_DEFAULT,LCMAP_LOWERCASE,
				       str,str.size(),NULL,0);
  Buffer<wchar_t>   lowered(needed);

  // second call: do the actual mapping into the new buffer
  LCMapString(LOCALE_USER_DEFAULT,LCMAP_LOWERCASE,
	      str,str.size(),lowered,lowered.size());
  return lowered;
}

// Returns the sort key LCMapString() generates for the `len` characters at
// `str` in locale `lcid`, using LCMAP_SORTKEY|NORM_IGNORECASE. The returned
// buffer is shortened by one so the terminating 0 is not part of it.
Buffer<char>   Unicode::SortKey(LCID lcid,const wchar_t *str,int len) {
  // first call: no output buffer, only ask for the required size
  const int	keylen=LCMapString(lcid,LCMAP_SORTKEY|NORM_IGNORECASE,
				   str,len,NULL,0);
  Buffer<char>	key(keylen);

  // the API takes a wide character pointer even though a sort key is a
  // sequence of bytes, hence the cast
  LCMapString(lcid,LCMAP_SORTKEY|NORM_IGNORECASE,
	      str,len,(wchar_t*)(char*)key,keylen);
  if (keylen>0) // don't include terminating 0
    key.setsize(keylen-1);
  return key;
}

Buffer<char>  Unicode::ToUtf8(const CString& cs) {
  // determine length
  int		utflen;
  const wchar_t	*cp=cs;
  int		i;
  int		max=cs.GetLength();

  for (i=utflen=0;i<max;++i) {
    DWORD c;
    if (cp[i]>=0xd800 && cp[i]<=0xdbff && i<max-1 && cp[i+1]>=0xdc00 && cp[i+1]<=0xdfff) {
      c=((DWORD)(cp[i]-0xd800)<<10) + (cp[i+1]-0xdc00) + 0x10000;
      ++i;
    } else
      c=cp[i];

    if (c<128)
      ++utflen;
    else if (c<2048)
      utflen+=2;
    else if (c<65536)
      utflen+=3;
    else
      utflen+=4;
  }

  Buffer<char>	ret(utflen);
  char		*dp=ret;

  for (i=0;i<max;++i) {
    DWORD c;
    if (cp[i]>=0xd800 && cp[i]<=0xdbff && i<max-1 && cp[i+1]>=0xdc00 && cp[i+1]<=0xdfff) {
      c=((DWORD)(cp[i]-0xd800)<<10) + (cp[i+1]-0xdc00) + 0x10000;
      ++i;
    } else
      c=cp[i];
    if (c<128)
      *dp++=(char)c;
    else if (c<2048) {
      *dp++=(char)(0xc0 | (c>>6));
      *dp++=(char)(0x80 | (c&0x3f));
    } else if (c<65536) {
      *dp++=(char)(0xe0 | (c>>12));
      *dp++=(char)(0x80 | ((c>>6)&0x3f));
      *dp++=(char)(0x80 | (c&0x3f));
    } else {
      *dp++=(char)(0xf0 | ((c>>18) & 0x07));
      *dp++=(char)(0x80 | ((c>>12) & 0x3f));
      *dp++=(char)(0x80 | ((c>>6) & 0x3f));
      *dp++=(char)(0x80 | (c&0x3f));
    }
  }

  return ret;
}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -