⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 unicode.cpp

📁 俄罗斯人开发的大名鼎鼎的Pocket Pc 阅读器haaliread的源代码,visual c
💻 CPP
📖 第 1 页 / 共 3 页
字号:
/*
 * Copyright (c) 2001,2002,2003 Mike Matsnev.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    Mike Matsnev.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * 
 * $Id: Unicode.cpp,v 1.13.2.5 2003/12/17 12:19:58 mike Exp $
 * 
 */

#include <afx.h>

#include "ptr.h"
#include "Unicode.h"

#ifndef	CP_UTF8
#define CP_UTF8	65001
#endif

#ifndef CP_1252
#define	CP_1252 1252
#endif

// Case-insensitive string compare in the user's default locale.
// Per the Win32 documentation CompareString returns 1 (less than),
// 2 (equal) or 3 (greater than), so subtracting 2 gives a strcmp-style
// result: negative, zero or positive.
// NOTE(review): CompareString returns 0 on failure, which becomes -2 here
// and looks like "less than" -- confirm callers never pass invalid strings.
#define	CmpI(s1,s2) \
    (::CompareString(LOCALE_USER_DEFAULT,NORM_IGNORECASE, \
    (s1),-1,(s2),-1)-2)

// One entry of the code page table: the names an encoding is known by and
// the pair of handlers used to convert text from it to wide characters.
struct CodePage {
  // primary name of the encoding
  const TCHAR	*name;
  // optional alternative names (add_codepage defaults both to NULL)
  const TCHAR	*alias1;
  const TCHAR	*alias2;
  // code page number; the CE_* handlers pass it to MultiByteToWideChar
  UINT		codepage;
  // returns the number of wide characters needed for mblen bytes at mbs
  int		(*length)(struct CodePage *cp,const char *mbs,int mblen);
  // converts mblen bytes at mbs into at most wclen wide characters at wcs
  void		(*convert)(struct CodePage *cp,const char *mbs,int mblen,
			   wchar_t *wcs,int wclen);
  // byte -> wide character map indexed by byte value, read by TB_cp_convert;
  // add_codepage sets it to NULL
  const wchar_t	*table;
};

static int    CE_cp_length(struct CodePage *cp,const char *mbs,int mblen) {
  return MultiByteToWideChar(cp->codepage,0,mbs,mblen,0,NULL);
}

static void   CE_cp_convert(struct CodePage *cp,const char *mbs,int mblen,
			    wchar_t *wcs,int wclen)
{
  MultiByteToWideChar(cp->codepage,0,mbs,mblen,wcs,wclen);
}

// Length handler for table-driven encodings: TB_cp_convert writes one wide
// character per input byte, so the required length is the byte count.
static int    TB_cp_length(struct CodePage *cp,const char *mbs,int mblen) {
  const int   wide_chars=mblen;

  // neither the code page nor the text influences the result
  (void)cp;
  (void)mbs;
  return wide_chars;
}

// Convert handler for table-driven encodings: each input byte is replaced by
// the wide character stored at that byte's index in cp->table.  Stops after
// mblen input bytes or wclen output characters, whichever is smaller.
static void   TB_cp_convert(struct CodePage *cp,const char *mbs,int mblen,
			    wchar_t *wcs,int wclen)
{
  const int   count=min(mblen,wclen);
  int	      i;

  for (i=0;i<count;++i) {
    // go through unsigned char so bytes >= 0x80 index the upper table half
    const unsigned char byte=(unsigned char)mbs[i];
    wcs[i]=cp->table[byte];
  }
}

// Length handler paired with WS_cp_convert, which widens every byte to one
// wide character: the required length is simply the byte count.
static int    WS_cp_length(struct CodePage *cp,const char *mbs,int mblen) {
  const int   wide_chars=mblen;

  // neither the code page nor the text influences the result
  (void)cp;
  (void)mbs;
  return wide_chars;
}

// Convert handler that widens bytes without any mapping: every input byte,
// taken as an unsigned value 0..255, becomes the wide character with the
// same numeric value.  Stops after mblen input bytes or wclen output
// characters, whichever is smaller.
static void   WS_cp_convert(struct CodePage *cp,const char *mbs,int mblen,
			    wchar_t *wcs,int wclen)
{
  const int   count=min(mblen,wclen);
  int	      i;

  for (i=0;i<count;++i) {
    // go through unsigned char so bytes >= 0x80 are not sign-extended
    const unsigned char byte=(unsigned char)mbs[i];
    wcs[i]=byte;
  }
}

// Decodes the UTF-8 sequence that starts at mb[0]; avail (>0) is the number
// of bytes that may be read from mb.
//
// Returns
//   >0  the sequence is well formed: that many bytes belong to it and the
//       decoded code point is stored in *ucs
//    0  the lead byte announces more bytes than are available (truncated)
//   -1  mb[0] cannot start a sequence (stray continuation byte 0x80..0xBF or
//       a byte above 0xF4), or one of the following bytes is not a
//       continuation byte
//
// The sequence type is chosen from the lead byte only, and no byte beyond
// mb[avail-1] is ever read.  Overlong forms and encoded surrogates are not
// rejected.
static int    utf8_decode_one(const unsigned char *mb,int avail,
			      unsigned long *ucs)
{
  const unsigned char c=mb[0];
  unsigned long	      value;
  int		      need;	// number of continuation bytes expected
  int		      i;

  if (c<0x80) { // ascii
    *ucs=c;
    return 1;
  }
  if (c<0xc0) // continuation byte without a lead byte
    return -1;
  if (c<0xe0) { // 2-byte seq
    need=1;
    value=c&0x1f;
  } else if (c<0xf0) { // 3-byte seq
    need=2;
    value=c&0x0f;
  } else if (c<=0xf4) { // 4-byte seq
    need=3;
    value=c&0x07;
  } else // 0xf5..0xff never occur in UTF-8
    return -1;

  if (avail-1<need) // input ends in the middle of the sequence
    return 0;

  // after 0xf4 only 0x80..0x8f keeps the code point within U+10FFFF
  if (c==0xf4 && (mb[1]&0xf0)!=0x80)
    return -1;

  for (i=1;i<=need;++i) {
    if ((mb[i]&0xc0)!=0x80)
      return -1;
    value=(value<<6)|(unsigned long)(mb[i]&0x3f);
  }
  *ucs=value;
  return need+1;
}

// Length handler for UTF-8: returns the number of wide characters that
// UTF_cp_convert produces for the mblen bytes at mbs (one per well-formed
// sequence).  A byte that does not start a well-formed sequence is skipped
// on its own; a sequence cut off by the end of the input ends the scan.
static int    UTF_cp_length(struct CodePage *cp,const char *mbs,int mblen) {
  const unsigned char *mb=(const unsigned char *)mbs;
  int		      len=0;

  (void)cp; // UTF-8 needs no per-code-page data

  while (mblen>0) {
    unsigned long ucs=0;
    const int	  used=utf8_decode_one(mb,mblen,&ucs);

    if (used==0) // truncated
      break;
    if (used<0) { // invalid: drop this byte and resynchronise on the next
      ++mb;
      --mblen;
      continue;
    }
    ++len;
    mb+=used;
    mblen-=used;
  }
  return len;
}

// Convert handler for UTF-8: decodes the mblen bytes at mbs into at most
// wclen wide characters at wcs, writing exactly one wide character per
// well-formed sequence.  Invalid bytes are skipped and a truncated final
// sequence ends the conversion, exactly as in UTF_cp_length.  When wchar_t
// is too narrow for a code point above U+FFFF, U+FFFD (replacement
// character) is written instead of a truncated value.
static void   UTF_cp_convert(struct CodePage *cp,const char *mbs,int mblen,
			    wchar_t *wcs,int wclen)
{
  const unsigned char *mb=(const unsigned char *)mbs;
  wchar_t	      *wce=wcs+wclen;

  (void)cp; // UTF-8 needs no per-code-page data

  while (mblen>0 && wcs<wce) {
    unsigned long ucs=0;
    const int	  used=utf8_decode_one(mb,mblen,&ucs);

    if (used==0) // truncated
      break;
    if (used<0) { // invalid: drop this byte and resynchronise on the next
      ++mb;
      --mblen;
      continue;
    }
    if (sizeof(wchar_t)<4 && ucs>0xffff)
      ucs=0xfffd;
    *wcs++=(wchar_t)ucs;
    mb+=used;
    mblen-=used;
  }
}

// Table of registered code pages: a heap array that add_codepage grows with
// realloc and appends to.
static struct CodePage	  *codepages;
// curcp: number of entries in use, i.e. the index the next add_codepage call
// fills; maxcp: number of entries currently allocated.
static int		  curcp,maxcp;
// NOTE(review): not referenced in the visible part of the file; presumably
// the index in codepages of the default encoding -- confirm against users.
static int		  default_cp;

static int    add_codepage(const TCHAR *name,UINT cp,const TCHAR *alias1=NULL,
			   const TCHAR *alias2=NULL)
{
  if (curcp>=maxcp) {
    maxcp+=16;
    codepages=(struct CodePage *)realloc(codepages,maxcp*sizeof(struct CodePage));
    if (codepages==NULL)
      ExitThread(0);
  }
  codepages[curcp].name=name;
  codepages[curcp].alias1=alias1;
  codepages[curcp].alias2=alias2;
  codepages[curcp].codepage=cp;
  codepages[curcp].length=CE_cp_length;
  codepages[curcp].convert=CE_cp_convert;
  codepages[curcp].table=NULL;
  return curcp++;
}

// Names and aliases for Windows code page numbers.
// The entries MUST stay sorted by ascending cp: get_mscp_num finds entries
// with a binary search over this array.  Entries that list fewer than two
// aliases leave the remaining alias pointers NULL (zero-initialised).
// Note that the alias "ISO-8859-1" appears on both 1252 and 28591 (see XXX).
struct {
  UINT	      cp;
  const TCHAR *name;
  const TCHAR *alias1,*alias2;
} ms_codepages[]={
  { 37, _T("IBM EBCDIC - U.S./Canada"), _T("IBM037"), _T("cp037") },
  { 437, _T("OEM - United States"), _T("IBM437"), _T("cp437") },
  { 500, _T("IBM EBCDIC - International"), _T("IBM500"), _T("cp500") },
  { 737, _T("OEM - Greek 437G") },
  { 775, _T("OEM - Baltic"), _T("IBM775"), _T("cp775") },
  { 850, _T("OEM - Multilingual Latin I"), _T("IBM850"), _T("cp850") },
  { 852, _T("OEM - Latin II"), _T("IBM852"), _T("cp852") },
  { 855, _T("OEM - Cyrillic"), _T("IBM855"), _T("cp855") },
  { 857, _T("OEM - Turkish"), _T("IBM857"), _T("cp857") },
  { 860, _T("OEM - Portuguese"), _T("IBM860"), _T("cp860") },
  { 861, _T("OEM - Icelandic"), _T("IBM861"), _T("cp861") },
  { 863, _T("OEM - Canadian French"), _T("IBM863"), _T("cp863") },
  { 865, _T("OEM - Nordic"), _T("IBM865"), _T("cp865") },
  { 866, _T("OEM - Russian"), _T("IBM866"), _T("cp866") },
  { 869, _T("OEM - Modern Greek"), _T("IBM869"), _T("cp869") },
  { 874, _T("ANSI/OEM - Thai") },
  { 875, _T("IBM EBCDIC - Modern Greek") },
  { 932, _T("ANSI/OEM - Japanese Shift-JIS"), _T("Shift_JIS") },
  { 936, _T("ANSI/OEM - Simplified Chinese GBK"), _T("GBK") },
  { 949, _T("ANSI/OEM - Korean") },
  { 950, _T("ANSI/OEM - Traditional Chinese Big5"), _T("Big5") },
  { 1026, _T("IBM EBCDIC - Turkish (Latin-5)"), _T("cp1026") },
  { 1250, _T("ANSI - Central Europe"), _T("windows-1250") },
  { 1251, _T("ANSI - Cyrillic"), _T("windows-1251") },
  { 1252, _T("ANSI - Latin I"), _T("windows-1252"), _T("ISO-8859-1") }, // XXX
  { 1253, _T("ANSI - Greek"), _T("windows-1253") },
  { 1254, _T("ANSI - Turkish"), _T("windows-1254") },
  { 1255, _T("ANSI - Hebrew"), _T("windows-1255") },
  { 1256, _T("ANSI - Arabic"), _T("windows-1256") },
  { 1257, _T("ANSI - Baltic"), _T("windows-1257") },
  { 1258, _T("ANSI/OEM - Viet Nam"), _T("windows-1258") },
  { 10000, _T("MAC - Roman") },
  { 10006, _T("MAC - Greek I") },
  { 10007, _T("MAC - Cyrillic") },
  { 10010, _T("MAC - Romania") },
  { 10017, _T("MAC - Ukraine") },
  { 10029, _T("MAC - Latin II") },
  { 10079, _T("MAC - Icelandic") },
  { 10081, _T("MAC - Turkish") },
  { 10082, _T("MAC - Croatia") },
  { 20127, _T("US-ASCII") },
  { 20261, _T("T.61") },
  { 20866, _T("Russian - KOI8"), _T("koi8-r") },
  { 21866, _T("Ukrainian - KOI8-U"), _T("koi8-u") },
  { 28591, _T("ISO 8859-1 Latin I"), _T("ISO-8859-1") },
  { 28592, _T("ISO 8859-2 Central Europe"), _T("ISO-8859-2") },
  { 28594, _T("ISO 8859-4 Baltic"), _T("ISO-8859-4") },
  { 28595, _T("ISO 8859-5 Cyrillic"), _T("ISO-8859-5") },
  { 28597, _T("ISO 8859-7 Greek"), _T("ISO-8859-7") },
  { 28599, _T("ISO 8859-9 Latin 5"), _T("ISO-8859-9") },
  { 28605, _T("ISO 8859-15 Latin 9"), _T("ISO-8859-15") },
  { 65000, _T("UTF-7") },
  { 65001, _T("UTF-8") },
};
// number of entries in ms_codepages
#define	NUM_MSCP    (sizeof(ms_codepages)/sizeof(ms_codepages[0]))

// Finds Windows code page number cp in ms_codepages by binary search (the
// array is ordered by ascending cp).  Returns the index of the matching
// entry, or -1 when cp is not listed.
static int  get_mscp_num(UINT cp) {
  int	  lo=0;
  int	  hi=NUM_MSCP-1;

  while (lo<=hi) {
    const int	mid=(lo+hi)>>1;
    const UINT	probe=ms_codepages[mid].cp;

    if (cp==probe)
      return mid;
    if (cp<probe)
      hi=mid-1;
    else
      lo=mid+1;
  }
  return -1;
}

static struct {
  BYTE	  distmap[256];
  wchar_t unimap[256];
  UINT	  cp;
} builtin_encodings[]={
{{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -