tdbconvertencode.cpp

来自「以wxWidget撰寫的簡繁體中文轉換程式已在Linux上編譯過」· C++ 代码 · 共 741 行 · 第 1/2 页
CPP
741 行
/* 
 * Copyright (C) 2006, Dung-Bang Tsai
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * ( If you want to use this library commercially, feel free to
 *	  contact me: for a fee I can sell you the code under a non-GPL
 *		license, so that you can use it in your product without
 *		publishing your source code.  				)
 *
 * Authors:
 *	Tsai, Dung-Bang <dbtsai@gmail.com>
 *			
*			2006/03/05		at NCKU physics
 */

#include "TDBConvertEncode.h"
#include "TDBConvertEngine.h"
#include "tables/big5_to_unicode_table.h"
//#include "hex_tools.c"

/*
 * Heuristic check for binary (non-text) data.
 *
 * Scans the first test_n_bytes bytes of src, or the whole buffer when
 * test_n_bytes is 0 or not smaller than src_n_bytes.
 *
 * Returns true as soon as a byte in 0x00..0x07 or 0x0E..0x10 is found,
 * false otherwise (including for an empty buffer).
 */
bool TDBdetect_code_page::IsBinaryData()
{
	// Number of bytes to inspect.
	size_t n_bytes = src_n_bytes;
	if( test_n_bytes != 0 && test_n_bytes < src_n_bytes )
		n_bytes = test_n_bytes;

	// The index has the same type as the bound, so it cannot wrap around
	// before reaching n_bytes on very large buffers.
	for( size_t i = 0; i < n_bytes; ++i )
	{
		// 0x00..0x07 and 0x0E..0x10 are treated as "never in text".
		// Fix me, prove me wrong.  (0x00 is already covered by this test.)
		if( src[i] <= 0x10 && (src[i] <= 7 || src[i] >= 0x0E) )
			return true;
	}
	return false;
}


/*
 * Check whether the inspected part of the buffer is plain 7-bit ASCII text.
 *
 * Scans the first test_n_bytes bytes of src, or the whole buffer when
 * test_n_bytes is 0 or not smaller than src_n_bytes.
 *
 * Returns false as soon as a byte in 0x00..0x07 or 0x0E..0x10, or a byte
 * with the high bit set, is found; true otherwise (including for an empty
 * buffer).
 */
bool TDBdetect_code_page::IsASCII()
{
	// Number of bytes to inspect.
	size_t n_bytes = src_n_bytes;
	if( test_n_bytes != 0 && test_n_bytes < src_n_bytes )
		n_bytes = test_n_bytes;

	// size_t index matches the bound; `register` is dropped because the
	// storage class no longer exists in C++17.
	for( size_t i = 0; i < n_bytes; ++i )
	{
		// Control bytes that are treated as "never in text".
		// Fix me, prove me wrong.  (0x00 is already covered by this test.)
		if( src[i] <= 0x10 && (src[i] <= 7 || src[i] >= 0x0E) )
			return false;
		// High bit set: not 7-bit ASCII.
		if( src[i] & 0x80 )
			return false;
	}
	return true;
}

/*
 * Identify the Unicode encoding from the byte-order mark at the start of
 * the buffer.
 *
 * Returns UTF32BE_CODE, UTF32LE_CODE, UTF16BE_CODE, UTF16LE_CODE or
 * UTF8_CODE for the first BOM that matches, OTHER_CODE when none does.
 */
int TDBdetect_code_page::UnicodeBOM_test()
{
	// The 4-byte BOMs must be tried before the 2-byte ones: the UTF-32LE
	// BOM (FF FE 00 00) starts with the UTF-16LE BOM (FF FE), so testing
	// UTF-16LE first would misreport a UTF-32LE file as UTF-16LE.
	int detected = OTHER_CODE;

	if( IsUTF32BE_BOM_test() )
		detected = UTF32BE_CODE;
	else if( IsUTF32LE_BOM_test() )
		detected = UTF32LE_CODE;
	else if( IsUTF16BE_BOM_test() )
		detected = UTF16BE_CODE;
	else if( IsUTF16LE_BOM_test() )
		detected = UTF16LE_CODE;
	else if( IsUTF8_BOM_test() )
		detected = UTF8_CODE;

	return detected;
}

/*
 * Returns true when the buffer holds at least 3 bytes and starts with the
 * UTF-8 byte-order mark EF BB BF.
 */
bool TDBdetect_code_page::IsUTF8_BOM_test()
{
	// Too short to hold the 3-byte mark.
	if( src_n_bytes < 3 )
		return false;

	return src[0] == 0xEF && src[1] == 0xBB && src[2] == 0xBF;
}

/*
 * Returns true when the buffer holds at least 2 bytes and starts with the
 * UTF-16 little-endian byte-order mark FF FE.
 * Note that a UTF-32LE BOM (FF FE 00 00) also satisfies this test.
 */
bool TDBdetect_code_page::IsUTF16LE_BOM_test()
{
	// Too short to hold the 2-byte mark.
	if( src_n_bytes < 2 )
		return false;

	return src[0] == 0xFF && src[1] == 0xFE;
}

/*
 * Returns true when the buffer holds at least 2 bytes and starts with the
 * UTF-16 big-endian byte-order mark FE FF.
 */
bool TDBdetect_code_page::IsUTF16BE_BOM_test()
{
	// Too short to hold the 2-byte mark.
	if( src_n_bytes < 2 )
		return false;

	return src[0] == 0xFE && src[1] == 0xFF;
}

/*
 * Returns true when the buffer holds at least 4 bytes and starts with the
 * UTF-32 little-endian byte-order mark FF FE 00 00.
 */
bool TDBdetect_code_page::IsUTF32LE_BOM_test()
{
	// Too short to hold the 4-byte mark.
	if( src_n_bytes < 4 )
		return false;

	return src[0] == 0xFF && src[1] == 0xFE
		&& src[2] == 0x00 && src[3] == 0x00;
}

/*
 * Returns true when the buffer holds at least 4 bytes and starts with the
 * UTF-32 big-endian byte-order mark 00 00 FE FF.
 */
bool TDBdetect_code_page::IsUTF32BE_BOM_test()
{
	// Too short to hold the 4-byte mark.
	if( src_n_bytes < 4 )
		return false;

	return src[0] == 0x00 && src[1] == 0x00
		&& src[2] == 0xFE && src[3] == 0xFF;
}

/*
 * Decide whether the inspected part of the buffer looks like UTF-8 text
 * that contains at least one multi-byte sequence.
 *
 * Scans the first test_n_bytes bytes of src (the whole buffer when
 * test_n_bytes is 0 or not smaller than src_n_bytes), starting after the
 * UTF-8 BOM if one is present.
 *
 * Returns false when
 *   - there is nothing to scan after the optional BOM,
 *   - a forbidden control byte (0x00..0x07, 0x0E..0x10) is found,
 *   - a byte sequence is not a structurally valid 2-, 3- or 4-byte UTF-8
 *     sequence (unless it sits at the cut end of a truncated sample),
 *   - the data is pure ASCII (no multi-byte sequence was seen).
 * Returns true otherwise.
 */
bool  TDBdetect_code_page::UTF8_signature_test()
{
	// Skip the BOM, then perform the signature test.
	size_t i = IsUTF8_BOM_test() ? 3 : 0;

	// When only the first test_n_bytes bytes are examined, the last character
	// may be cut in half; an error within the last few bytes is then ignored.
	bool last_word_ignore = false;
	bool if_ascii = true;
	const size_t ignore_n_bytes = 6; // Fix me

	size_t n_bytes = src_n_bytes;	// In the loop, test n bytes
	if( test_n_bytes != 0 && test_n_bytes < src_n_bytes )
	{
		n_bytes = test_n_bytes;
		last_word_ignore = true;
	}
	if( n_bytes == 0 || i >= n_bytes )
		return false;

	while( i < n_bytes )
	{
		const unsigned int lead = src[i];

		if( !(lead & 0x80) )	// ascii, 128 code points (1 byte)
		{
			// 0E 0F 10 and 00~07 are forbidden; usually not used in text.
			// Fix me, if I'm wrong.
			if( lead <= 0x10 && (lead <= 7 || lead >= 0x0E) )
				return false;
			++i;
			continue;
		}

		// Derive the sequence length from the lead byte's full prefix:
		//   110xxxxx -> 2 bytes, 1110xxxx -> 3 bytes, 11110xxx -> 4 bytes.
		// A stray continuation byte (10xxxxxx) or 11111xxx is not a lead byte.
		size_t seq_len = 0;
		if( (lead & 0xE0) == 0xC0 )
			seq_len = 2;
		else if( (lead & 0xF0) == 0xE0 )
			seq_len = 3;
		else if( (lead & 0xF8) == 0xF0 )
			seq_len = 4;

		// The whole sequence must lie inside the inspected range and every
		// trailing byte must be a continuation byte 10xxxxxx.
		bool well_formed = (seq_len != 0) && (i + seq_len <= n_bytes);
		for( size_t k = 1; well_formed && k < seq_len; ++k )
			well_formed = ( (src[i+k] & 0xC0) == 0x80 );

		if( well_formed )
		{
			i += seq_len;
			if_ascii = false;
		}
		// Ignore an error in the last (possibly cut) word of a truncated sample.
		else if( last_word_ignore && ( (i + ignore_n_bytes) > n_bytes ) )
			break;
		else
			return false;
	}

	// Pure ASCII is not reported as UTF-8.
	return !if_ascii;
}

/*
 * Decide whether the inspected part of the buffer looks like UTF-16
 * little-endian text.
 *
 * Scans 16-bit units in the first test_n_bytes bytes of src (the whole
 * buffer when test_n_bytes is 0 or not smaller than src_n_bytes), starting
 * after the UTF-16LE BOM if one is present and it is not actually the
 * beginning of a UTF-32LE BOM.  A trailing odd byte is not examined.
 *
 * Returns false when
 *   - fewer than one complete unit is available,
 *   - a unit is a forbidden control code (0x00..0x07, 0x0E..0x10),
 *   - surrogates are not properly paired (a high surrogate must be followed
 *     immediately by a low surrogate; a low surrogate must follow a high one),
 *   - no unit was a tab/newline-class control (0x09..0x0D) or a printable
 *     ASCII character (0x20..0x7E).
 * Returns true otherwise.
 */
bool  TDBdetect_code_page::UTF16LE_signature_test()
{
	size_t i = ( IsUTF16LE_BOM_test() && !IsUTF32LE_BOM_test() ) ? 2 : 0;

	size_t n_bytes = src_n_bytes;
	if( test_n_bytes != 0 && test_n_bytes < src_n_bytes )
		n_bytes = test_n_bytes;
	if( n_bytes == 0 || i+1 >= n_bytes )
		return false;

	bool if_ok = false;
	bool highsurrogate = false;	// previous unit was a high surrogate

	for( ; i + 1 < n_bytes; i += 2 )
	{
		// Little endian: low byte first.
		const unsigned int ucs2_test =
			(static_cast<unsigned int>(src[i+1]) << 8) | src[i];
		const bool is_low_surrogate = ( ucs2_test >= 0xDC00 && ucs2_test <= 0xDFFF );

		// A high surrogate must be followed immediately by a low surrogate;
		// anything else (ASCII, BMP character, another high surrogate) is
		// ill-formed UTF-16.
		if( highsurrogate && !is_low_surrogate )
			return false;

		if( ucs2_test <= 0xFF )
		{
			if( ucs2_test <= 7 || (ucs2_test <= 0x10 && ucs2_test >= 0x0e) ) // Fix me, if I'm wrong.
				return false;
			// Fix me, if I'm wrong.  (If there is no ascii, it may return false)
			if( (ucs2_test >= 9 && ucs2_test <= 0x0D) || (ucs2_test >= 0x20 && ucs2_test <= 0x7E) )
				if_ok = true;
		}
		// Surrogate pairing, see the Unicode surrogates rules.
		else if( ucs2_test >= 0xD800 && ucs2_test <= 0xDBFF )
		{
			highsurrogate = true;
		}
		else if( is_low_surrogate )
		{
			if( !highsurrogate )
				return false;
			highsurrogate = false;
		}
	}
	return if_ok;
}

/*
 * Decide whether the inspected part of the buffer looks like UTF-16
 * big-endian text.
 *
 * Scans 16-bit units in the first test_n_bytes bytes of src (the whole
 * buffer when test_n_bytes is 0 or not smaller than src_n_bytes), starting
 * after the UTF-16BE BOM if one is present.  A trailing odd byte is not
 * examined.
 *
 * Returns false when
 *   - fewer than one complete unit is available,
 *   - a unit is a forbidden control code (0x00..0x07, 0x0E..0x10),
 *   - surrogates are not properly paired (a high surrogate must be followed
 *     immediately by a low surrogate; a low surrogate must follow a high one),
 *   - no unit was a tab/newline-class control (0x09..0x0D) or a printable
 *     ASCII character (0x20..0x7E).
 * Returns true otherwise.
 */
bool  TDBdetect_code_page::UTF16BE_signature_test()
{
	size_t i = IsUTF16BE_BOM_test() ? 2 : 0;

	size_t n_bytes = src_n_bytes;
	if( test_n_bytes != 0 && test_n_bytes < src_n_bytes )
		n_bytes = test_n_bytes;
	if( n_bytes == 0 || i+1 >= n_bytes )
		return false;

	bool if_ok = false;
	bool highsurrogate = false;	// previous unit was a high surrogate

	for( ; i + 1 < n_bytes; i += 2 )
	{
		// Big endian: high byte first.
		const unsigned int ucs2_test =
			(static_cast<unsigned int>(src[i]) << 8) | src[i+1];
		const bool is_low_surrogate = ( ucs2_test >= 0xDC00 && ucs2_test <= 0xDFFF );

		// A high surrogate must be followed immediately by a low surrogate;
		// anything else (ASCII, BMP character, another high surrogate) is
		// ill-formed UTF-16.
		if( highsurrogate && !is_low_surrogate )
			return false;

		if( ucs2_test <= 0xFF )
		{
			if( ucs2_test <= 7 || (ucs2_test <= 0x10 && ucs2_test >= 0x0e) ) // Fix me, if I'm wrong.
				return false;
			// Fix me, if I'm wrong.  (If there is no ascii, it may return false)
			if( (ucs2_test >= 9 && ucs2_test <= 0x0D) || (ucs2_test >= 0x20 && ucs2_test <= 0x7E) )
				if_ok = true;
		}
		// Surrogate pairing, see the Unicode surrogates rules.
		else if( ucs2_test >= 0xD800 && ucs2_test <= 0xDBFF )
		{
			highsurrogate = true;
		}
		else if( is_low_surrogate )
		{
			if( !highsurrogate )
				return false;
			highsurrogate = false;
		}
	}
	return if_ok;
}

/*
 * Decide whether the inspected part of the buffer looks like UTF-32
 * little-endian text.
 *
 * Scans 32-bit units in the first test_n_bytes bytes of src (the whole
 * buffer when test_n_bytes is 0 or not smaller than src_n_bytes), starting
 * after the UTF-32LE BOM if one is present.  Trailing bytes that do not
 * form a complete unit are not examined.
 *
 * Returns false when fewer than one complete unit is available, when a unit
 * is a forbidden control code (0x00..0x07, 0x0E..0x10), or when a unit is
 * above U+10FFFF, the last Unicode code point.  Returns true otherwise.
 */
bool  TDBdetect_code_page::UTF32LE_signature_test()
{
	size_t i = IsUTF32LE_BOM_test() ? 4 : 0;

	size_t n_bytes = src_n_bytes;
	if( test_n_bytes != 0 && test_n_bytes < src_n_bytes )
		n_bytes = test_n_bytes;
	if( n_bytes == 0 || i+3 >= n_bytes )
		return false;

	for( ; i + 3 < n_bytes; i += 4 )
	{
		// Little endian: least significant byte first.  The bytes are widened
		// to unsigned before shifting so bit 31 is never shifted into a
		// signed int.
		const unsigned int ucs4_test =
			  (static_cast<unsigned int>(src[i+3]) << 24)
			| (static_cast<unsigned int>(src[i+2]) << 16)
			| (static_cast<unsigned int>(src[i+1]) << 8)
			|  static_cast<unsigned int>(src[i]);

		if( ucs4_test <= 7 || (ucs4_test <= 0x10 && ucs4_test >= 0x0e) ) // Fix me, if I'm wrong.
			return false;
		// U+10FFFF itself is a valid code point; only larger values are not.
		if( ucs4_test > 0x0010FFFF )
			return false;
	}
	return true;
}

/*
 * Decide whether the inspected part of the buffer looks like UTF-32
 * big-endian text.
 *
 * Scans 32-bit units in the first test_n_bytes bytes of src (the whole
 * buffer when test_n_bytes is 0 or not smaller than src_n_bytes), starting
 * after the UTF-32BE BOM if one is present.  Trailing bytes that do not
 * form a complete unit are not examined.
 *
 * Returns false when fewer than one complete unit is available, when a unit
 * is a forbidden control code (0x00..0x07, 0x0E..0x10), or when a unit is
 * above U+10FFFF, the last Unicode code point.  Returns true otherwise.
 */
bool  TDBdetect_code_page::UTF32BE_signature_test()
{
	size_t i = IsUTF32BE_BOM_test() ? 4 : 0;

	size_t n_bytes = src_n_bytes;
	if( test_n_bytes != 0 && test_n_bytes < src_n_bytes )
		n_bytes = test_n_bytes;
	if( n_bytes == 0 || i+3 >= n_bytes )
		return false;

	for( ; i + 3 < n_bytes; i += 4 )
	{
		// Big endian: most significant byte first.  The bytes are widened to
		// unsigned before shifting so bit 31 is never shifted into a signed
		// int.
		const unsigned int ucs4_test =
			  (static_cast<unsigned int>(src[i])   << 24)
			| (static_cast<unsigned int>(src[i+1]) << 16)
			| (static_cast<unsigned int>(src[i+2]) << 8)
			|  static_cast<unsigned int>(src[i+3]);

		if( ucs4_test <= 7 || (ucs4_test <= 0x10 && ucs4_test >= 0x0e) ) // Fix me, if I'm wrong.
			return false;
		// U+10FFFF itself is a valid code point; only larger values are not.
		if( ucs4_test > 0x0010FFFF )
			return false;
	}
	return true;
}

/*
 * Identify the Unicode encoding by content, running the signature tests in
 * the order UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE.
 *
 * Returns the code of the first test that succeeds, OTHER_CODE when none
 * does.
 */
int TDBdetect_code_page::Unicode_signature_test()
{
	int detected = OTHER_CODE;

	if( UTF8_signature_test() )
		detected = UTF8_CODE;
	else if( UTF16LE_signature_test() )
		detected = UTF16LE_CODE;
	else if( UTF16BE_signature_test() )
		detected = UTF16BE_CODE;
	else if( UTF32LE_signature_test() )
		detected = UTF32LE_CODE;
	else if( UTF32BE_signature_test() )
		detected = UTF32BE_CODE;

	return detected;
}


int TDBdetect_code_page::Chinese_code_test()
{
	double error_limt=0.05;  // Give 5% error capility.
	size_t n_bytes;	// In the loop, test n bytes
	if( test_n_bytes==0)  {n_bytes = src_n_bytes;}
	else if(test_n_bytes < src_n_bytes){n_bytes = test_n_bytes; }
	else {n_bytes = src_n_bytes;}
	
	unsigned short int test_char = 0;
	int big5 = 0; int gbk = 0; int total_words=0; // total words are all two bytes world.
	int big5_error = 0; int gbk_error = 0; // Boundary test
	unsigned register int i = 0;
	
	while(i+1 < n_bytes)
	{
		if( src[i] & 0x80 )  // Non ascii mode
		{  
			test_char = (src[i] << 8) + src[i+1];
			switch(test_char)
tdbconvertencode.cpp - 源码说明

本页面展示了「以wxWidget撰寫的簡繁體中文轉換程式已在Linux上編譯過」中的 tdbconvertencode.cpp 源码文件，采用 C++ 编程语言编写，共 741 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与wxWidget相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?