📄 tdbconvertencode.cpp
字号:
/*
* Copyright (C) 2006, Dung-Bang Tsai
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*
* ( If you want to use this library for commercial use,
* feel free to contact me, just cost some money, I could sell
* you the code without GPL license, so you could use this code
* for your product without public your source code. )
*
* Authors:
* Tsai, Dung-Bang <dbtsai@gmail.com>
*
* 2006/03/05 at NCKU physics
*/
#include "TDBConvertEncode.h"
#include "TDBConvertEngine.h"
#include "tables/big5_to_unicode_table.h"
//#include "hex_tools.c"
/**
 * Heuristically decide whether the buffer looks like binary data.
 *
 * Inspects the first test_n_bytes bytes of src, or all src_n_bytes bytes when
 * test_n_bytes is 0 or is not smaller than src_n_bytes.
 *
 * @return true as soon as a byte in 0x00-0x07 or 0x0E-0x10 is found;
 *         false if no such byte exists in the inspected range (this includes
 *         an empty buffer).
 */
bool TDBdetect_code_page::IsBinaryData()
{
    // Inspect the whole buffer unless a smaller, non-zero window was requested.
    const size_t n_bytes =
        (test_n_bytes != 0 && test_n_bytes < src_n_bytes) ? test_n_bytes : src_n_bytes;

    // The index is a size_t so it cannot wrap around before reaching n_bytes
    // on platforms where size_t is wider than unsigned int.
    for (size_t i = 0; i < n_bytes; ++i)
    {
        // Control bytes 0x00-0x07 and 0x0E-0x10 are taken as a sign of binary
        // content (author's heuristic: "Fix me, prove me wrong").
        // 0x00 is part of the first range, so it needs no separate test.
        if (src[i] <= 0x10 && (src[i] <= 7 || src[i] >= 0x0E))
        {
            return true;
        }
    }
    return false;
}
/**
 * Decide whether the inspected part of the buffer is plain 7-bit ASCII text.
 *
 * Inspects the first test_n_bytes bytes of src, or all src_n_bytes bytes when
 * test_n_bytes is 0 or is not smaller than src_n_bytes.
 *
 * @return false as soon as a byte in 0x00-0x07 or 0x0E-0x10 is found, or a
 *         byte with the high bit (0x80) set; true otherwise (this includes an
 *         empty buffer).
 */
bool TDBdetect_code_page::IsASCII()
{
    // Inspect the whole buffer unless a smaller, non-zero window was requested.
    const size_t n_bytes =
        (test_n_bytes != 0 && test_n_bytes < src_n_bytes) ? test_n_bytes : src_n_bytes;

    // Plain size_t index: `register` is not allowed in C++17, and an
    // unsigned int counter could wrap before reaching a large n_bytes.
    for (size_t i = 0; i < n_bytes; ++i)
    {
        // Same control-byte heuristic as the binary-data check; 0x00 is
        // already inside the 0x00-0x07 range, so no separate test is needed.
        if (src[i] <= 0x10 && (src[i] <= 7 || src[i] >= 0x0E))
        {
            return false;
        }
        // Any byte with the top bit set is outside 7-bit ASCII.
        if (src[i] & 0x80)
        {
            return false;
        }
    }
    return true;
}
/**
 * Identify the Unicode encoding announced by a byte-order mark at the start
 * of the buffer.
 *
 * @return UTF32BE_CODE, UTF32LE_CODE, UTF16BE_CODE, UTF16LE_CODE or UTF8_CODE
 *         for the first BOM test that succeeds, in that order; OTHER_CODE when
 *         none of them matches.
 */
int TDBdetect_code_page::UnicodeBOM_test()
{
    // The 4-byte BOMs must be tried before the 2-byte ones: the UTF-16LE BOM
    // (FF FE) is a prefix of the UTF-32LE BOM (FF FE 00 00), so testing
    // UTF-16LE first would misreport a UTF-32LE file as UTF-16LE.
    int detected = OTHER_CODE;

    if (IsUTF32BE_BOM_test())
    {
        detected = UTF32BE_CODE;
    }
    else if (IsUTF32LE_BOM_test())
    {
        detected = UTF32LE_CODE;
    }
    else if (IsUTF16BE_BOM_test())
    {
        detected = UTF16BE_CODE;
    }
    else if (IsUTF16LE_BOM_test())
    {
        detected = UTF16LE_CODE;
    }
    else if (IsUTF8_BOM_test())
    {
        detected = UTF8_CODE;
    }

    return detected;
}
/**
 * Check whether the buffer begins with the UTF-8 byte-order mark (EF BB BF).
 *
 * @return true only when at least three bytes are available and they are
 *         exactly EF BB BF.
 */
bool TDBdetect_code_page::IsUTF8_BOM_test()
{
    // Short-circuit evaluation ensures src[] is not read when the buffer
    // holds fewer than three bytes.
    return src_n_bytes >= 3
        && src[0] == 0xEF
        && src[1] == 0xBB
        && src[2] == 0xBF;
}
/**
 * Check whether the buffer begins with the UTF-16 little-endian byte-order
 * mark (FF FE).
 *
 * Note that a UTF-32LE BOM (FF FE 00 00) also satisfies this test.
 *
 * @return true only when at least two bytes are available and they are FF FE.
 */
bool TDBdetect_code_page::IsUTF16LE_BOM_test()
{
    // Short-circuit evaluation ensures src[] is not read when the buffer
    // holds fewer than two bytes.
    return src_n_bytes >= 2
        && src[0] == 0xFF
        && src[1] == 0xFE;
}
/**
 * Check whether the buffer begins with the UTF-16 big-endian byte-order
 * mark (FE FF).
 *
 * @return true only when at least two bytes are available and they are FE FF.
 */
bool TDBdetect_code_page::IsUTF16BE_BOM_test()
{
    // Short-circuit evaluation ensures src[] is not read when the buffer
    // holds fewer than two bytes.
    return src_n_bytes >= 2
        && src[0] == 0xFE
        && src[1] == 0xFF;
}
/**
 * Check whether the buffer begins with the UTF-32 little-endian byte-order
 * mark (FF FE 00 00).
 *
 * @return true only when at least four bytes are available and they are
 *         FF FE 00 00.
 */
bool TDBdetect_code_page::IsUTF32LE_BOM_test()
{
    // Short-circuit evaluation ensures src[] is not read when the buffer
    // holds fewer than four bytes.
    return src_n_bytes >= 4
        && src[0] == 0xFF
        && src[1] == 0xFE
        && src[2] == 0x00
        && src[3] == 0x00;
}
/**
 * Check whether the buffer begins with the UTF-32 big-endian byte-order
 * mark (00 00 FE FF).
 *
 * @return true only when at least four bytes are available and they are
 *         00 00 FE FF.
 */
bool TDBdetect_code_page::IsUTF32BE_BOM_test()
{
    // Short-circuit evaluation ensures src[] is not read when the buffer
    // holds fewer than four bytes.
    return src_n_bytes >= 4
        && src[0] == 0x00
        && src[1] == 0x00
        && src[2] == 0xFE
        && src[3] == 0xFF;
}
/**
 * Test whether the buffer content is structurally valid UTF-8 that contains
 * at least one multi-byte sequence.
 *
 * A leading UTF-8 BOM, if present, is skipped. The first test_n_bytes bytes
 * of src are inspected, or all src_n_bytes bytes when test_n_bytes is 0 or is
 * not smaller than src_n_bytes. When only a prefix of the buffer is inspected,
 * a malformed sequence that starts within the last few bytes of the prefix is
 * tolerated, because the window may have cut a character in half.
 *
 * Sequences are recognised by their bit patterns only (lead byte plus the
 * right number of 10xxxxxx continuation bytes); overlong forms and code
 * points above U+10FFFF are not rejected.
 *
 * @return true if every inspected sequence is well formed and at least one
 *         2-, 3- or 4-byte sequence was seen; false for pure ASCII input,
 *         for an empty range, for input containing a byte in 0x00-0x07 or
 *         0x0E-0x10, and for malformed input.
 */
bool TDBdetect_code_page::UTF8_signature_test()
{
    // Skip the BOM so it is not counted as the "non-ASCII evidence".
    size_t i = IsUTF8_BOM_test() ? 3 : 0;

    // Sequences starting closer than this to the end of a truncated window
    // are forgiven (author's "Fix me" value).
    const size_t ignore_n_bytes = 6;

    // If only a prefix is tested the last character may be cut, so an error
    // in the final word is ignored in that case.
    bool last_word_ignore = false;
    size_t n_bytes = src_n_bytes;
    if (test_n_bytes != 0 && test_n_bytes < src_n_bytes)
    {
        n_bytes = test_n_bytes;
        last_word_ignore = true;
    }

    // Nothing left to inspect after the optional BOM (covers n_bytes == 0).
    if (i >= n_bytes)
    {
        return false;
    }

    bool only_ascii = true;
    while (i < n_bytes)
    {
        if (!(src[i] & 0x80))
        {
            // Single-byte (ASCII) character. 0x00-0x07 and 0x0E-0x10 are
            // treated as forbidden in text (author's heuristic).
            if (src[i] <= 0x10 && (src[i] <= 7 || src[i] >= 0x0E))
            {
                return false;
            }
            ++i;
        }
        // 2-byte sequence: 110xxxxx 10xxxxxx.
        // The full patterns are compared (not just the bits that must be 0),
        // so a stray continuation byte is not accepted as a lead byte and an
        // ASCII byte is not accepted as a continuation byte.
        else if ((i + 1 < n_bytes)
                 && (src[i]     & 0xE0) == 0xC0
                 && (src[i + 1] & 0xC0) == 0x80)
        {
            i += 2;
            only_ascii = false;
        }
        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
        else if ((i + 2 < n_bytes)
                 && (src[i]     & 0xF0) == 0xE0
                 && (src[i + 1] & 0xC0) == 0x80
                 && (src[i + 2] & 0xC0) == 0x80)
        {
            i += 3;
            only_ascii = false;
        }
        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
        else if ((i + 3 < n_bytes)
                 && (src[i]     & 0xF8) == 0xF0
                 && (src[i + 1] & 0xC0) == 0x80
                 && (src[i + 2] & 0xC0) == 0x80
                 && (src[i + 3] & 0xC0) == 0x80)
        {
            i += 4;
            only_ascii = false;
        }
        // Malformed data near the end of a truncated window: stop scanning
        // and judge by what has been seen so far.
        else if (last_word_ignore && ((i + ignore_n_bytes) > n_bytes))
        {
            break;
        }
        else
        {
            return false;
        }
    }

    // Pure ASCII is deliberately not reported as UTF-8.
    return !only_ascii;
}
/**
 * Test whether the buffer content looks like UTF-16 little-endian text.
 *
 * A leading UTF-16LE BOM is skipped unless the buffer actually starts with
 * the UTF-32LE BOM. The first test_n_bytes bytes of src are inspected, or all
 * src_n_bytes bytes when test_n_bytes is 0 or is not smaller than
 * src_n_bytes. A trailing odd byte is ignored.
 *
 * @return false if fewer than two bytes remain after the optional BOM, if a
 *         code unit in 0x0000-0x0007 or 0x000E-0x0010 is found, or if
 *         surrogates are not correctly paired (a high surrogate must be
 *         immediately followed by a low surrogate, and a low surrogate must
 *         follow a high one). Otherwise returns true only if at least one
 *         code unit in 0x0009-0x000D or 0x0020-0x007E was seen, so text
 *         without any such character is reported as false.
 */
bool TDBdetect_code_page::UTF16LE_signature_test()
{
    // FF FE is also the start of the UTF-32LE BOM; only skip it when it is
    // really a UTF-16LE BOM.
    size_t i = (IsUTF16LE_BOM_test() && !IsUTF32LE_BOM_test()) ? 2 : 0;

    // Inspect the whole buffer unless a smaller, non-zero window was requested.
    const size_t n_bytes =
        (test_n_bytes != 0 && test_n_bytes < src_n_bytes) ? test_n_bytes : src_n_bytes;

    // Need at least one complete 16-bit code unit (covers n_bytes == 0).
    if (i + 1 >= n_bytes)
    {
        return false;
    }

    bool if_ok = false;          // set once an ordinary ASCII code unit is seen
    bool highsurrogate = false;  // true while a high surrogate awaits its partner

    for (; i + 1 < n_bytes; i += 2)
    {
        // Little-endian: low byte first.
        const unsigned short ucs2_test =
            static_cast<unsigned short>((src[i + 1] << 8) + src[i]);

        const bool is_low_surrogate = (ucs2_test >= 0xDC00 && ucs2_test <= 0xDFFF);

        // A pending high surrogate must be followed by a low surrogate, and a
        // low surrogate is only legal right after a high surrogate.
        if (highsurrogate != is_low_surrogate)
        {
            return false;
        }

        if (is_low_surrogate)
        {
            highsurrogate = false;  // pair completed
        }
        else if (ucs2_test >= 0xD800 && ucs2_test <= 0xDBFF)
        {
            highsurrogate = true;   // wait for the low half
        }
        else if (ucs2_test <= 0xFF)
        {
            // Control code units that should not occur in text
            // (author's heuristic: "Fix me, if I'm wrong").
            if (ucs2_test <= 7 || (ucs2_test <= 0x10 && ucs2_test >= 0x0e))
            {
                return false;
            }
            // Evidence of real text: 0x09-0x0D or printable ASCII.
            // Without any such code unit the function returns false.
            if ((ucs2_test >= 9 && ucs2_test <= 0x0D) || (ucs2_test >= 0x20 && ucs2_test <= 0x7E))
            {
                if_ok = true;
            }
        }
    }

    // A high surrogate left pending at the very end is tolerated, since the
    // inspected window may have cut the pair.
    return if_ok;
}
/**
 * Test whether the buffer content looks like UTF-16 big-endian text.
 *
 * A leading UTF-16BE BOM, if present, is skipped. The first test_n_bytes
 * bytes of src are inspected, or all src_n_bytes bytes when test_n_bytes is 0
 * or is not smaller than src_n_bytes. A trailing odd byte is ignored.
 *
 * @return false if fewer than two bytes remain after the optional BOM, if a
 *         code unit in 0x0000-0x0007 or 0x000E-0x0010 is found, or if
 *         surrogates are not correctly paired (a high surrogate must be
 *         immediately followed by a low surrogate, and a low surrogate must
 *         follow a high one). Otherwise returns true only if at least one
 *         code unit in 0x0009-0x000D or 0x0020-0x007E was seen, so text
 *         without any such character is reported as false.
 */
bool TDBdetect_code_page::UTF16BE_signature_test()
{
    size_t i = IsUTF16BE_BOM_test() ? 2 : 0;

    // Inspect the whole buffer unless a smaller, non-zero window was requested.
    const size_t n_bytes =
        (test_n_bytes != 0 && test_n_bytes < src_n_bytes) ? test_n_bytes : src_n_bytes;

    // Need at least one complete 16-bit code unit (covers n_bytes == 0).
    if (i + 1 >= n_bytes)
    {
        return false;
    }

    bool if_ok = false;          // set once an ordinary ASCII code unit is seen
    bool highsurrogate = false;  // true while a high surrogate awaits its partner

    for (; i + 1 < n_bytes; i += 2)
    {
        // Big-endian: high byte first.
        const unsigned short ucs2_test =
            static_cast<unsigned short>((src[i] << 8) + src[i + 1]);

        const bool is_low_surrogate = (ucs2_test >= 0xDC00 && ucs2_test <= 0xDFFF);

        // A pending high surrogate must be followed by a low surrogate, and a
        // low surrogate is only legal right after a high surrogate.
        if (highsurrogate != is_low_surrogate)
        {
            return false;
        }

        if (is_low_surrogate)
        {
            highsurrogate = false;  // pair completed
        }
        else if (ucs2_test >= 0xD800 && ucs2_test <= 0xDBFF)
        {
            highsurrogate = true;   // wait for the low half
        }
        else if (ucs2_test <= 0xFF)
        {
            // Control code units that should not occur in text
            // (author's heuristic: "Fix me, if I'm wrong").
            if (ucs2_test <= 7 || (ucs2_test <= 0x10 && ucs2_test >= 0x0e))
            {
                return false;
            }
            // Evidence of real text: 0x09-0x0D or printable ASCII.
            // Without any such code unit the function returns false.
            if ((ucs2_test >= 9 && ucs2_test <= 0x0D) || (ucs2_test >= 0x20 && ucs2_test <= 0x7E))
            {
                if_ok = true;
            }
        }
    }

    // A high surrogate left pending at the very end is tolerated, since the
    // inspected window may have cut the pair.
    return if_ok;
}
/**
 * Test whether the buffer content looks like UTF-32 little-endian text.
 *
 * A leading UTF-32LE BOM, if present, is skipped. The first test_n_bytes
 * bytes of src are inspected, or all src_n_bytes bytes when test_n_bytes is 0
 * or is not smaller than src_n_bytes. Trailing bytes that do not form a
 * complete 4-byte unit are ignored.
 *
 * @return false if fewer than four bytes remain after the optional BOM, if a
 *         value in 0x00-0x07 or 0x0E-0x10 is found, or if a value is at or
 *         above 0x10FFFF; true otherwise.
 */
bool TDBdetect_code_page::UTF32LE_signature_test()
{
    size_t i = IsUTF32LE_BOM_test() ? 4 : 0;

    // Inspect the whole buffer unless a smaller, non-zero window was requested.
    const size_t n_bytes =
        (test_n_bytes != 0 && test_n_bytes < src_n_bytes) ? test_n_bytes : src_n_bytes;

    // Need at least one complete 32-bit unit (covers n_bytes == 0).
    if (i + 3 >= n_bytes)
    {
        return false;
    }

    for (; i + 3 < n_bytes; i += 4)
    {
        // Little-endian: least significant byte first. Each byte is widened
        // to unsigned int before shifting so that a top byte >= 0x80 is not
        // shifted into the sign bit of a promoted int.
        // NOTE(review): assumes src holds unsigned byte values, as the BOM
        // comparisons against 0xFF/0xFE in this class already require.
        const unsigned int ucs4_test =
              (static_cast<unsigned int>(src[i + 3]) << 24)
            | (static_cast<unsigned int>(src[i + 2]) << 16)
            | (static_cast<unsigned int>(src[i + 1]) << 8)
            |  static_cast<unsigned int>(src[i]);

        // Control values that should not occur in text
        // (author's heuristic: "Fix me, if I'm wrong").
        if (ucs4_test <= 7 || (ucs4_test <= 0x10 && ucs4_test >= 0x0e))
        {
            return false;
        }
        // Outside the Unicode range.
        // NOTE(review): this also rejects 0x10FFFF itself - confirm intended.
        if (ucs4_test >= 0x0010FFFF)
        {
            return false;
        }
    }
    return true;
}
/**
 * Test whether the buffer content looks like UTF-32 big-endian text.
 *
 * A leading UTF-32BE BOM, if present, is skipped. The first test_n_bytes
 * bytes of src are inspected, or all src_n_bytes bytes when test_n_bytes is 0
 * or is not smaller than src_n_bytes. Trailing bytes that do not form a
 * complete 4-byte unit are ignored.
 *
 * @return false if fewer than four bytes remain after the optional BOM, if a
 *         value in 0x00-0x07 or 0x0E-0x10 is found, or if a value is at or
 *         above 0x10FFFF; true otherwise.
 */
bool TDBdetect_code_page::UTF32BE_signature_test()
{
    size_t i = IsUTF32BE_BOM_test() ? 4 : 0;

    // Inspect the whole buffer unless a smaller, non-zero window was requested.
    const size_t n_bytes =
        (test_n_bytes != 0 && test_n_bytes < src_n_bytes) ? test_n_bytes : src_n_bytes;

    // Need at least one complete 32-bit unit (covers n_bytes == 0).
    if (i + 3 >= n_bytes)
    {
        return false;
    }

    for (; i + 3 < n_bytes; i += 4)
    {
        // Big-endian: most significant byte first. Each byte is widened to
        // unsigned int before shifting so that a top byte >= 0x80 is not
        // shifted into the sign bit of a promoted int.
        // NOTE(review): assumes src holds unsigned byte values, as the BOM
        // comparisons against 0xFF/0xFE in this class already require.
        const unsigned int ucs4_test =
              (static_cast<unsigned int>(src[i])     << 24)
            | (static_cast<unsigned int>(src[i + 1]) << 16)
            | (static_cast<unsigned int>(src[i + 2]) << 8)
            |  static_cast<unsigned int>(src[i + 3]);

        // Control values that should not occur in text
        // (author's heuristic: "Fix me, if I'm wrong").
        if (ucs4_test <= 7 || (ucs4_test <= 0x10 && ucs4_test >= 0x0e))
        {
            return false;
        }
        // Outside the Unicode range.
        // NOTE(review): this also rejects 0x10FFFF itself - confirm intended.
        if (ucs4_test >= 0x0010FFFF)
        {
            return false;
        }
    }
    return true;
}
/**
 * Identify a Unicode encoding from the content of the buffer by running the
 * individual signature tests.
 *
 * @return UTF8_CODE, UTF16LE_CODE, UTF16BE_CODE, UTF32LE_CODE or UTF32BE_CODE
 *         for the first signature test that succeeds, in that order;
 *         OTHER_CODE when none of them succeeds.
 */
int TDBdetect_code_page::Unicode_signature_test()
{
    int detected = OTHER_CODE;

    // The tests run in this fixed order and stop at the first success.
    if (UTF8_signature_test())
    {
        detected = UTF8_CODE;
    }
    else if (UTF16LE_signature_test())
    {
        detected = UTF16LE_CODE;
    }
    else if (UTF16BE_signature_test())
    {
        detected = UTF16BE_CODE;
    }
    else if (UTF32LE_signature_test())
    {
        detected = UTF32LE_CODE;
    }
    else if (UTF32BE_signature_test())
    {
        detected = UTF32BE_CODE;
    }

    return detected;
}
int TDBdetect_code_page::Chinese_code_test()
{
double error_limt=0.05; // Give 5% error capility.
size_t n_bytes; // In the loop, test n bytes
if( test_n_bytes==0) {n_bytes = src_n_bytes;}
else if(test_n_bytes < src_n_bytes){n_bytes = test_n_bytes; }
else {n_bytes = src_n_bytes;}
unsigned short int test_char = 0;
int big5 = 0; int gbk = 0; int total_words=0; // total words are all two bytes world.
int big5_error = 0; int gbk_error = 0; // Boundary test
unsigned register int i = 0;
while(i+1 < n_bytes)
{
if( src[i] & 0x80 ) // Non ascii mode
{
test_char = (src[i] << 8) + src[i+1];
switch(test_char)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -