📄 utf8.cpp

📁 utf8编码
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
#include "utf8.h"
#include "stdlib.h" 
#include <stdio.h>
#include <wchar.h>
#include <exception>
#include <assert.h>
#include <string.h>  
#if defined(WIN32)	 
#include <windows.h>
#pragma warning(disable : 4244)		
#endif


namespace dnc{
	// Exception type named for UTF-8 conversion failures. It carries no
	// state; what() always yields the same fixed string.
	class UTF8ConvertError : public std::exception
	{
	public:
		// Returns a static, NUL-terminated description of the error.
		virtual castr what() const throw()
		{
			return "UTF8ConvertError";
		}
	};
	// Exception describing a character value that could not be handled
	// (XCharToUTF8 throws it for values it cannot encode). The offending
	// value is kept and also formatted into the message returned by what().
	class XCharError : public std::exception
	{
	public:
		// Records ch and pre-formats the message so what() never has to
		// allocate or format.
		XCharError(xchar ch)
			: m_ch(ch)
		{
			::sprintf(m_msg,"XCharError:%i",ch);
		}

		// Returns the message built in the constructor; the pointer stays
		// valid for as long as this exception object lives.
		virtual castr what() const throw()
		{
			return m_msg;
		}

	private:
		char m_msg[100];	// formatted "XCharError:<value>" text
		xchar m_ch;		// the rejected character value
	};
	class UTF8FormatError : public std::exception{
	public:
		UTF8FormatError(unsigned long pos,unsigned long  len,byte byte,unsigned long  index){
			m_len = len;
			m_pos = pos;
			m_byte= byte;
			m_index=index;
		}
		UTF8FormatError(unsigned long index){
			m_len = 0;
			m_pos = 0;
			m_byte= 0;
			m_index=index;
		}
	public:
		virtual const char* what() const throw(){
			return "UTF8FormatError";
		}
	private:
		unsigned long  m_len;
		unsigned long  m_pos;
		byte   m_byte;
		unsigned long  m_index;
	};
	
	// Encodes a single character value as UTF-8.
	//
	//   ch   - the character value to encode
	//   utf8 - output buffer; receives 1 to 6 bytes, no terminator is written,
	//          so the caller must provide room for up to 6 bytes
	//
	// Returns the number of bytes written.
	// Throws XCharError (before writing anything) when ch is larger than
	// 0x7FFFFFFF.
	DNC_DECLARE unsigned int XCharToUTF8(xchar ch,astr utf8){
		// Values up to 127 are stored verbatim in one byte.
		if(ch <= 127){
			*utf8 = (char)ch;
			return 1;
		}

		// Pick the sequence length from the magnitude of the value.
		unsigned int encodedBytes;
		if (ch < 0x800){
			encodedBytes = 2;
		}else if (ch < 0x10000){
			encodedBytes = 3;
		}else if (ch < 0x200000){
			encodedBytes = 4;
		}else if (ch < 0x4000000){
			encodedBytes = 5;
		}else if (ch <= 0x7FFFFFFF){
			encodedBytes = 6;
		}else{
			throw XCharError(ch);
		}

		// Fill the continuation bytes from the last one back to the second,
		// taking six payload bits each time (10xxxxxx).
		uchar *dest = (ustr)utf8;
		for(unsigned int slot = encodedBytes - 1; slot > 0; --slot){
			dest[slot] = byte((ch | 0x80UL) & 0xBFUL);
			ch >>= 6;
		}

		// Whatever bits remain go into the first byte together with the
		// length marker for this sequence size.
		dest[0] = byte(ch | gFirstByteMark[encodedBytes]);
		return encodedBytes;
	}
	// Decodes the UTF-8 sequence that starts at utf8 into a single
	// character value.
	//
	//   utf8 - pointer to the first byte of the sequence; every byte the
	//          first byte announces is read, so the buffer must contain them
	//          (a terminating 0 byte stops decoding with an exception because
	//          it is not a continuation byte)
	//   ch   - receives the decoded value
	//
	// Returns the number of bytes consumed (1 to 6).
	// Throws UTF8FormatError when the first byte is a stray continuation
	// byte (0x80-0xBF) or when an expected continuation byte is missing.
	//
	// Fixes relative to the previous version:
	//   * the single-byte path returned the byte value instead of the
	//     consumed length and left ch unassigned;
	//   * a stray continuation byte used as first byte was decoded as if it
	//     started a two-byte sequence;
	//   * values >= 0x10000 had 0x10000 subtracted (a leftover of surrogate
	//     pair encoding that was never completed), so they did not round-trip
	//     with XCharToUTF8.
	DNC_DECLARE unsigned int UTF8ToXChar(custr utf8,xchar &ch){
		const unsigned char *srcPtr = (const unsigned char*)utf8;

		// Single-byte (ASCII) character.
		if (*srcPtr <= 127){
			ch = *srcPtr;
			return 1;
		}

		const unsigned int trailingBytes = gUTFBytes[*srcPtr];
		if (trailingBytes == 0){
			// 0x80-0xBF can only appear inside a sequence, never start one.
			throw UTF8FormatError(0,0,*srcPtr,0xffffffff);
		}

		// Accumulate the raw bytes, six bits per step; the marker bits are
		// removed afterwards in one subtraction.
		xchar tmpVal = *srcPtr++;
		for(unsigned int i=1; i<=trailingBytes; i++){
			if((*srcPtr & 0xC0) != 0x80){
				throw UTF8FormatError(i,trailingBytes,*srcPtr,0xffffffff);
			}
			tmpVal <<= 6;
			tmpVal += *srcPtr++;
		}

		// Strip the first-byte marker and the 10xxxxxx prefixes.
		tmpVal -= gUTFOffsets[trailingBytes];

		ch = tmpVal;
		return trailingBytes+1;
	}
	// Returns the character value encoded by the UTF-8 sequence that starts
	// at str.
	//
	//   str - pointer to the first byte of the sequence; every byte the
	//         first byte announces is read
	//
	// Throws UTF8FormatError when the first byte is a stray continuation
	// byte (0x80-0xBF) or when an expected continuation byte is missing.
	//
	// Fixes relative to the previous version:
	//   * a stray continuation byte used as first byte was decoded as if it
	//     started a two-byte sequence;
	//   * values >= 0x10000 had 0x10000 subtracted (a leftover of surrogate
	//     pair encoding that was never completed), so the result was not the
	//     value that XCharToUTF8 would encode back to the same bytes;
	//   * the input pointer is no longer cast to a non-const pointer.
	DNC_DECLARE xchar utf8_value(custr str){
		const unsigned char *srcPtr = (const unsigned char*)str;

		// Single-byte (ASCII) character.
		if (*srcPtr <= 127){
			return *srcPtr;
		}

		const unsigned int trailingBytes = gUTFBytes[*srcPtr];
		if (trailingBytes == 0){
			// 0x80-0xBF can only appear inside a sequence, never start one.
			throw UTF8FormatError(0,0,*srcPtr,0xffffffff);
		}

		// Accumulate the raw bytes, six bits per step; the marker bits are
		// removed afterwards in one subtraction.
		xchar tmpVal = *srcPtr++;
		for(unsigned int i=1; i<=trailingBytes; i++){
			if((*srcPtr & 0xC0) != 0x80){
				throw UTF8FormatError(i,trailingBytes,*srcPtr,0xffffffff);
			}
			tmpVal <<= 6;
			tmpVal += *srcPtr++;
		}

		// Strip the first-byte marker and the 10xxxxxx prefixes.
		tmpVal -= gUTFOffsets[trailingBytes];
		return tmpVal;
	}

	// Byte-wise comparison of two NUL-terminated UTF-8 strings.
	//
	//   str1, str2 - strings to compare
	//   count      - maximum number of BYTES to compare; pass (size_t)-1 to
	//                compare the whole strings
	//
	// Returns a value < 0, == 0 or > 0 when str1 sorts before, equal to or
	// after str2. Only the sign is meaningful.
	//
	// The previous version compared min(len1,len2,count) bytes and then, on
	// a tie, ordered by the TOTAL string lengths. When count cut the
	// comparison short this gave inconsistent answers (("abcX","abcYY",3)
	// was -1 while ("abcX","abcY",3) was 0). It also narrowed the lengths
	// to unsigned int. strncmp provides the intended bounded comparison.
	DNC_DECLARE int utf8_strcmp(castr str1,castr str2,size_t count){
		if(count == (size_t)-1)
			return ::strcmp((const char*)str1,(const char*)str2);
		return ::strncmp((const char*)str1,(const char*)str2,count);
	}
	// Measures a NUL-terminated UTF-8 string.
	//
	//   str     - string to measure; must not be NULL (asserted)
	//   size    - receives the number of characters, i.e. the number of
	//             scanned bytes that start a sequence
	//   rawSize - receives the number of bytes scanned
	//   count   - upper limit on the number of bytes to scan
	//
	// Scanning stops at the terminating 0 byte or after count bytes,
	// whichever comes first. If count cuts a multi-byte sequence, that
	// sequence is still counted once its first byte has been scanned.
	//
	// The previous version only counted bytes < 0x80 or >= 0xE0, so the
	// first bytes of two-byte sequences (0xC0-0xDF) were never counted.
	DNC_DECLARE void utf8_strlen(castr str,unsigned int &size,unsigned int &rawSize,unsigned int count){
		assert(str != NULL);
		rawSize = 0;
		size = 0;
		custr p=(custr)str;
		for(;*p!=0 && rawSize<count;p++){
			// Every byte except a continuation byte (10xxxxxx) starts a
			// new character.
			if((*p & 0xC0) != 0x80) size++;
			rawSize++;
		}
	}


	// ---------------------------------------------------------------------------
	//  Local static data
	//
	//  gUTFBytes
	//      A list of counts of trailing bytes for each initial byte in the input.
	//
	//  gUTFByteIndicator
	//      For a UTF8 sequence of n bytes, n>=2, the first byte of the
	//      sequence must contain n 1's followed by precisely 1 0 with the
	//      rest of the byte containing arbitrary bits.  This array stores
	//      the required bit pattern for validity checking.
	//  gUTFByteIndicatorTest
	//      When bitwise and'd with the observed value, if the observed
	//      value is correct then a result matching gUTFByteIndicator will
	//      be produced.
	//
	//  gUTFOffsets
	//      A list of values to offset each result char type, according to how
	//      many source bytes went into making it.
	//
	//  gFirstByteMark
	//      A list of values to mask onto the first byte of an encoded sequence,
	//      indexed by the number of bytes used to create the sequence.
	// ---------------------------------------------------------------------------
	// Number of trailing bytes that follow each possible first byte,
	// indexed by the first byte's value. One row per 16 values.
	cuchar gUTFBytes[256] =
	{
		// 0x00 - 0x7F: single-byte characters
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		// 0x80 - 0xBF: continuation bytes (not valid as a first byte)
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		// 0xC0 - 0xDF: one trailing byte
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		// 0xE0 - 0xEF: two trailing bytes
		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
		// 0xF0 - 0xF7: three, 0xF8 - 0xFB: four, 0xFC - 0xFF: five
		3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
	};

	// Bit pattern the first byte must show once masked with the matching
	// gUTFByteIndicatorTest entry (see the table description above).
	static cuchar gUTFByteIndicator[6] =
	{
		0x00,
		0xC0,
		0xE0,
		0xF0,
		0xF8,
		0xFC
	};
	// Masks to AND with an observed first byte; a valid byte then equals
	// the gUTFByteIndicator entry at the same position.
	static cuchar gUTFByteIndicatorTest[6] =
	{
		0x80,
		0xE0,
		0xF0,
		0xF8,
		0xFC,
		0xFE
	};

	// Value to subtract from the accumulated raw bytes of a sequence,
	// indexed by its number of trailing bytes. Each entry is the sum of the
	// shifted first-byte marker and the shifted 0x80 prefixes of the
	// continuation bytes (the last entry is that sum truncated to 32 bits).
	const unsigned long gUTFOffsets[6] =
	{
		0x00000000UL,	// no trailing byte
		0x00003080UL,	// 1 trailing byte
		0x000E2080UL,	// 2 trailing bytes
		0x03C82080UL,	// 3 trailing bytes
		0xFA082080UL,	// 4 trailing bytes
		0x82082080UL	// 5 trailing bytes
	};

	// Marker bits OR-ed into the first byte of an encoded sequence,
	// indexed by the total number of bytes in the sequence (0 is unused).
	cuchar gFirstByteMark[7] =
	{
		0x00,	// (unused)
		0x00,	// 1 byte
		0xC0,	// 2 bytes
		0xE0,	// 3 bytes
		0xF0,	// 4 bytes
		0xF8,	// 5 bytes
		0xFC	// 6 bytes
	};

	// Converts a multibyte ("ANSI") string to wide characters.
	//
	//   srcData   - source multibyte string
	//   srcCount  - number of source bytes; only used by the MSVC build.
	//               The mbstowcs build ignores it and reads srcData up to
	//               its terminating NUL.
	//   destData  - destination wide-character buffer
	//   destCount - capacity of destData in wide characters
	//
	// Returns the value reported by the underlying conversion routine, or
	// (size_t)-1 on failure.
	//
	// The mbstowcs result used to be squeezed through int and unsigned int,
	// which turned the (size_t)-1 error value into 0xFFFFFFFF wherever
	// size_t is wider than int; it is now returned unchanged so both builds
	// report failure the same way.
	DNC_DECLARE size_t ANSIToUNICODE(castr srcData,size_t srcCount, wstr destData,size_t destCount){
#if defined(_MSC_VER)
		const int converted = ::MultiByteToWideChar(CP_ACP,0,srcData,(int)srcCount,destData,(int)destCount);
		// MultiByteToWideChar signals failure with 0; map it to (size_t)-1.
		if(converted == 0) return (size_t)-1;
		return (size_t)converted;
#else
		(void)srcCount;	// mbstowcs has no source-length parameter
		return ::mbstowcs(destData, srcData,destCount);
#endif
	}
	// Converts a wide-character string to a multibyte ("ANSI") string.
	//
	//   srcData   - source wide-character string
	//   srcCount  - number of source characters; only used by the MSVC
	//               build. The wcstombs build ignores it and reads srcData
	//               up to its terminating NUL.
	//   destData  - destination byte buffer
	//   destCount - capacity of destData in bytes
	//
	// Returns the value reported by the underlying conversion routine.
	// NOTE(review): the failure values differ per build - the MSVC build
	// passes through WideCharToMultiByte's 0, while the wcstombs build
	// returns (size_t)-1. The MSVC behaviour is deliberately left as it was;
	// confirm what callers expect before unifying.
	//
	// The wcstombs result used to be squeezed through int and unsigned int,
	// which turned the (size_t)-1 error value into 0xFFFFFFFF wherever
	// size_t is wider than int; it is now returned unchanged.
	DNC_DECLARE size_t UNICODEToANSI(cwstr srcData,size_t srcCount,astr destData,size_t destCount){
#if defined(_MSC_VER)
		const int ret = ::WideCharToMultiByte(CP_ACP,0,srcData,(int)srcCount,destData,(int)destCount, NULL, NULL );
		return (unsigned int)ret;
#else
		(void)srcCount;	// wcstombs has no source-length parameter
		return ::wcstombs(destData,srcData,destCount);
#endif
	}

	// ---------------------------------------------------------------------------
	//  XMLUTF8Transcoder: Implementation of the transcoder API
	// ---------------------------------------------------------------------------
	DNC_DECLARE size_t ANSIToUTF8(castr srcData,size_t srcCount,ustr destData,size_t destCount){
		wchar_t *wstr = (wchar_t*)malloc(srcCount*2);
		size_t len = 0;
		int ret = 0;
		try{
#if defined(WIN32)
12 下一页
💿 文件大小 8 K
👤 上传用户 zcflion
📂 所属分类 VC书籍
📄 代码行数 588 行
💻 语言类型 C++
🏷️ 相关标签

#utf8 #编码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -