📄 tdbconvertencode.cpp
字号:
{
// The followings char are often used in BIG5
case 0xa143: case 0xa148: case 0xa149: // 。?!
case 0xa14b: case 0xa158: case 0xa15e: // …—)
case 0xa160: case 0xa162: case 0xa164: // ︶}︸
case 0xa16a: case 0xa16c: case 0xa16e: // 】︼》
case 0xa170: case 0xa172: case 0xa176: // ︾〉」
case 0xa17a: case 0xa1a2: case 0xa1a4: // 』﹜﹞
case 0xa1a6: case 0xa1a8: case 0xa1aa: // ’”〞
case 0xa1ac: case 0xaaba: case 0xa7da: // ′的我
++big5;break;
// The followings char are often used in GBK
case 0xa1a3: case 0xa1ad: case 0xa1af: // 。…’
case 0xa1b1: case 0xa1b5: case 0xa1b7: // ”〉》
case 0xa1b9: case 0xa1bb: case 0xa1bf: // 」〉》
case 0xa1e4: case 0xa3a1: case 0xa3ac: // ′!,
case 0xa3a9: case 0xa3bf: case 0xa3fd: // )?}
case 0xa6e1: case 0xa6e7: case 0xa6ef: // ︶︾︼
case 0xa6f1: case 0xa895: case 0xa979: // ︸〞﹜
case 0xa97b: case 0xb5c4: case 0xced2: // ﹞的我
++gbk; break;
}
// 因為有造字區的存在,所導致超出邊界也不能說他絕對不是某某編碼,
// 但是實務上一篇文章在造字區的字一定少很多,所以就用比重來測試。
// Fisrt Big5 test
//Normally, Big5's first byte should be between A1~FE
// and second byte should be between 40~7E or A1~FE
// 假如有在上面區域,則測試是否有在使用者定義區,把造字區的字算成錯誤。
if( (0xA1<=src[i]&&src[i]<=0xFE) && //測試是否在big5邊界
( (0x40<=src[i+1]&&src[i+1]<=0x7E) || (0xA1<=src[i+1]&&src[i+1]<=0xFE) ))
{
if( 0xA140 <=test_char && test_char <= 0xA3BF || test_char == 0xA3E1){} //標點符號等等
else if( 0xA440 <=test_char && test_char <= 0xC67E ){} //第一級常用漢字
else if( 0xC940 <=test_char && test_char <= 0xF9FE ){}//第二級次常用漢字
else ++big5_error ;
}
else{ big5_error+=3; }// Very serious problem, so the weight is hight.
// Second, GBK test,測試是否在gbk邊界
// GBK在 firt byte A1~F7時,第二個byte一定不能在40~7E,這是很有用的判斷。
bool test_a = !((src[i] >= 0xA1 && src[i] <= 0xF7) && (src[i+1] >= 0x40 && src[i+1]<= 0x7E) );
bool test_b = (0x81<=src[i]&&src[i]<=0xFE) && ( (0x40<=src[i+1]&&src[i+1]<=0x7E) || (0x80<=src[i+1]&&src[i+1]<=0xFE) );
bool test_c = !((src[i] >= 0xF8 && src[i]<= 0xF9) && ( (src[i+1] >= 0x40 && src[i+1]<= 0x7E) || (src[i+1] >= 0xA1 &&src[i+1] <= 0xFE )));
bool test_d = 1;//!(src[i] >= 0xA8 && src[i] <= 0xAF) && ((src[i+1]>= 0x40 && src[i+1] <= 0x7E ) || (src[i+1] >= 0xA1 &&src[i+1] <= 0xFE ));
if( test_a && test_b && test_c)//&& test_d)
{
if( 0xA1A1 <=test_char && test_char <= 0xA9FE){} //標點符號等和非漢字區
else if( 0xB0A1 <=test_char && test_char <= 0xF7FE ){} //第一級漢字
else if( 0x8140 <=test_char && test_char <= 0xA0FE ){} //第二級漢字
else if( 0xAA40 <=test_char && test_char <= 0xFEA0 ){} //第三級漢字
else if( 0xA840 <=test_char && test_char <= 0xA9A0 ){} //第四級漢字
else ++gbk_error ;
}
else{ gbk_error+=3;}
++i; ++total_words;
}
++i ;
}
//cout << endl<< " total_words: "<<total_words<<endl<<
// "big5: "<<big5<< " big5_error: "<<big5_error<<
// " big5_error_percent: "<<(double) big5_error/total_words <<endl;
//cout<<"gbk: "<<gbk<< " gbk_error: "<<gbk_error<<
// " gbk_error_percent: "<<(double) gbk_error/total_words <<endl;
if(total_words==0)
return ASCII_CODE;
if( (big5>gbk) && (big5_error<gbk_error) && (
(double) big5_error/total_words < error_limt ) )
return Big5_CODE;
else if( (big5<gbk) && (big5_error>gbk_error) && (
(double) gbk_error/total_words < error_limt ) )
return GBK_CODE;
else if( big5 == gbk) // Not offten occor,故用比較嚴謹的判斷錯誤上限
{
if( (big5_error<gbk_error) && (
(double) big5_error/total_words < error_limt/10 ) )
return Big5_CODE;
if( (big5_error>gbk_error) && (
(double) gbk_error/total_words < error_limt/10 ) )
return GBK_CODE;
}
else // 好吧,如果這樣子還猜不出來答案..XD,我放棄了。
return 0;
}
// Writes one code point as four bytes, most significant byte first.
//   src    : code point to serialise
//   outbuf : destination buffer; outbuf[0..3] are written on success
// Returns 4 on success, or 0 (nothing written) when src is above 0x10FFFF.
inline size_t TDBdetect_code_page::a_UCS4toUTF32BE(const unsigned int& src, unsigned char *outbuf)
{
    if (src > 0x10FFFF)
        return 0;

    // Byte k carries bits [8*(3-k), 8*(3-k)+7] of the code point.
    for (int k = 0; k < 4; ++k)
        outbuf[k] = static_cast<unsigned char>((src >> (8 * (3 - k))) & 0xFF);

    return 4;
}
// Writes one code point as four bytes, least significant byte first.
//   src    : code point to serialise
//   outbuf : destination buffer; outbuf[0..3] are written on success
// Returns 4 on success, or 0 (nothing written) when src is above 0x10FFFF.
inline size_t TDBdetect_code_page::a_UCS4toUTF32LE(const unsigned int& src, unsigned char *outbuf)
{
    if (src > 0x10FFFF)
        return 0;

    // Byte k carries bits [8*k, 8*k+7] of the code point.
    for (int k = 0; k < 4; ++k)
        outbuf[k] = static_cast<unsigned char>((src >> (8 * k)) & 0xFF);

    return 4;
}
// Encodes one code point as big-endian UTF-16.
//   src    : code point to encode
//   outbuf : destination buffer; 2 or 4 bytes are written on success
// Returns 2 for values below 0x10000, 4 for values in 0x10000..0x10FFFF
// (written as a surrogate pair), and 0 (nothing written) for larger values.
inline size_t TDBdetect_code_page::a_UCS4toUTF16BE(const unsigned int& src, unsigned char *outbuf)
{
    if (src >= 0x110000)
        return 0;

    if (src <= 0xFFFF)
    {
        // Single 16-bit unit, high byte first.
        outbuf[0] = static_cast<unsigned char>(src >> 8);
        outbuf[1] = static_cast<unsigned char>(src & 0xFF);
        return 2;
    }

    // Surrogate pair: the 20-bit offset is split into two 10-bit halves.
    const unsigned int offset = src - 0x10000;
    const unsigned int lead   = 0xD800 + (offset >> 10);
    const unsigned int trail  = 0xDC00 + (offset & 0x3FF);

    outbuf[0] = static_cast<unsigned char>(lead >> 8);
    outbuf[1] = static_cast<unsigned char>(lead & 0xFF);
    outbuf[2] = static_cast<unsigned char>(trail >> 8);
    outbuf[3] = static_cast<unsigned char>(trail & 0xFF);
    return 4;
}
// Encodes one code point as little-endian UTF-16.
//   src    : code point to encode
//   outbuf : destination buffer; 2 or 4 bytes are written on success
// Returns 2 for values below 0x10000, 4 for values in 0x10000..0x10FFFF
// (written as a surrogate pair), and 0 (nothing written) for larger values.
inline size_t TDBdetect_code_page::a_UCS4toUTF16LE(const unsigned int& src, unsigned char *outbuf)
{
    if (src >= 0x110000)
        return 0;

    if (src <= 0xFFFF)
    {
        // Single 16-bit unit, low byte first.
        outbuf[0] = static_cast<unsigned char>(src & 0xFF);
        outbuf[1] = static_cast<unsigned char>(src >> 8);
        return 2;
    }

    // Surrogate pair: the 20-bit offset is split into two 10-bit halves.
    const unsigned int offset = src - 0x10000;
    const unsigned int lead   = 0xD800 + (offset >> 10);
    const unsigned int trail  = 0xDC00 + (offset & 0x3FF);

    outbuf[0] = static_cast<unsigned char>(lead & 0xFF);
    outbuf[1] = static_cast<unsigned char>(lead >> 8);
    outbuf[2] = static_cast<unsigned char>(trail & 0xFF);
    outbuf[3] = static_cast<unsigned char>(trail >> 8);
    return 4;
}
// Encodes one code point as UTF-8.
//   src    : code point to encode
//   outbuf : destination buffer; 1 to 4 bytes are written on success
// Returns the number of bytes written, or 0 (nothing written) when src is
// 0x110000 or above.
inline size_t TDBdetect_code_page::a_UCS4toUTF8(const unsigned int& src, unsigned char *outbuf)
{
    if (src < 0x80)
    {
        outbuf[0] = static_cast<unsigned char>(src);
        return 1;
    }

    // Choose the sequence length and the marker bits of the first byte.
    size_t       count;
    unsigned int marker;
    if (src < 0x800)          { count = 2; marker = 0xC0; }
    else if (src < 0x10000)   { count = 3; marker = 0xE0; }
    else if (src < 0x110000)  { count = 4; marker = 0xF0; }
    else                      return 0;

    // Fill the trailing bytes from the end, six payload bits at a time;
    // whatever is left over goes into the first byte.
    unsigned int remaining = src;
    for (size_t k = count - 1; k > 0; --k)
    {
        outbuf[k] = static_cast<unsigned char>(0x80 | (remaining & 0x3F));
        remaining >>= 6;
    }
    outbuf[0] = static_cast<unsigned char>(marker | remaining);
    return count;
}
// Decodes one UTF-8 sequence starting at src[0].
//   src    : input bytes
//   outbuf : receives the decoded value; written only when the result is > 0
//   len    : number of readable bytes at src
// Returns
//   1..4 : number of bytes consumed, outbuf holds the decoded value
//   0    : src[0] is not a valid lead byte, or one of the following bytes is
//          not a continuation byte (10xxxxxx)
//   -N   : the sequence needs N bytes but only len (< N) are available
//          (-1 when len is 0)
//
// The lead byte and every continuation byte are matched against their full
// bit pattern. Testing single bits only (as this routine used to do) lets a
// stray continuation byte act as a lead byte and lets a plain ASCII byte be
// swallowed as a continuation byte, e.g. C3 31 would decode as U+00F1.
// Overlong forms and surrogate values are still passed through unchanged.
inline int TDBdetect_code_page::a_UTF8toUCS4(const unsigned char* src, unsigned int& outbuf, size_t len)
{
    if(len < 1) return -1;

    const unsigned char lead = src[0];
    if( (lead & 0x80) == 0 )            // 0xxxxxxx : single byte
    {
        outbuf = lead;
        return 1;
    }

    int          need;                  // total length of the sequence
    unsigned int value;                 // payload bits collected so far
    if( (lead & 0xE0) == 0xC0 )         // 110xxxxx
    {
        need = 2; value = lead & 0x1F;
    }
    else if( (lead & 0xF0) == 0xE0 )    // 1110xxxx
    {
        need = 3; value = lead & 0x0F;
    }
    else if( (lead & 0xF8) == 0xF0 )    // 11110xxx
    {
        need = 4; value = lead & 0x07;
    }
    else
    {
        return 0;                       // continuation byte or 0xF8..0xFF as lead
    }

    if( len < static_cast<size_t>(need) ) return -need;

    for(int k = 1; k < need; ++k)
    {
        if( (src[k] & 0xC0) != 0x80 ) return 0;   // not 10xxxxxx
        value = (value << 6) | (src[k] & 0x3F);
    }

    outbuf = value;
    return need;
}
// Reads one big-endian 32-bit value from src.
//   src    : input bytes
//   outbuf : receives the assembled value on success
//   len    : number of readable bytes at src
// Returns 4 on success, or -4 (outbuf untouched) when fewer than four bytes
// are available.
inline int TDBdetect_code_page::a_UTF32BEtoUCS4(const unsigned char* src, unsigned int& outbuf, size_t len)
{
    if (len < 4)
        return -4;

    // src[0] is the most significant byte.
    unsigned int value = 0;
    for (int k = 0; k < 4; ++k)
        value = (value << 8) | src[k];

    outbuf = value;
    return 4;
}
// Reads one little-endian 32-bit value from src.
//   src    : input bytes
//   outbuf : receives the assembled value on success
//   len    : number of readable bytes at src
// Returns 4 on success, or -4 (outbuf untouched) when fewer than four bytes
// are available.
inline int TDBdetect_code_page::a_UTF32LEtoUCS4(const unsigned char* src, unsigned int& outbuf, size_t len)
{
    if (len < 4)
        return -4;

    // src[3] is the most significant byte.
    unsigned int value = 0;
    for (int k = 3; k >= 0; --k)
        value = (value << 8) | src[k];

    outbuf = value;
    return 4;
}
// Decodes one character from big-endian UTF-16.
//   src    : input bytes
//   outbuf : receives the decoded value when the result is > 0
//   len    : number of readable bytes at src
// Returns
//   2  : one 16-bit unit consumed (also used when a high surrogate is not
//        followed by a low surrogate; the unit is then passed through as is)
//   4  : a surrogate pair was combined into one code point
//   -2 : fewer than two bytes available
//   -4 : a high surrogate was read but fewer than four bytes are available
inline int TDBdetect_code_page::a_UTF16BEtoUCS4(const unsigned char* src, unsigned int& outbuf, size_t len)
{
    if (len < 2)
        return -2;

    const unsigned int first = (static_cast<unsigned int>(src[0]) << 8) | src[1];
    const bool first_is_high_surrogate = (first >= 0xD800 && first <= 0xDBFF);
    if (!first_is_high_surrogate)
    {
        outbuf = first;
        return 2;
    }

    if (len < 4)
        return -4;

    const unsigned int second = (static_cast<unsigned int>(src[2]) << 8) | src[3];
    if (second < 0xDC00 || second > 0xDFFF)
    {
        // Unpaired high surrogate: hand back the single unit.
        outbuf = first;
        return 2;
    }

    outbuf = 0x10000 + ((first - 0xD800) << 10) + (second - 0xDC00);
    return 4;
}
// Decodes one character from little-endian UTF-16.
//   src    : input bytes
//   outbuf : receives the decoded value when the result is > 0
//   len    : number of readable bytes at src
// Returns
//   2  : one 16-bit unit consumed (also used when a high surrogate is not
//        followed by a low surrogate; the unit is then passed through as is)
//   4  : a surrogate pair was combined into one code point
//   -2 : fewer than two bytes available
//   -4 : a high surrogate was read but fewer than four bytes are available
inline int TDBdetect_code_page::a_UTF16LEtoUCS4(const unsigned char* src, unsigned int& outbuf, size_t len)
{
    if (len < 2)
        return -2;

    const unsigned int first = (static_cast<unsigned int>(src[1]) << 8) | src[0];
    const bool first_is_high_surrogate = (first >= 0xD800 && first <= 0xDBFF);
    if (!first_is_high_surrogate)
    {
        outbuf = first;
        return 2;
    }

    if (len < 4)
        return -4;

    const unsigned int second = (static_cast<unsigned int>(src[3]) << 8) | src[2];
    if (second < 0xDC00 || second > 0xDFFF)
    {
        // Unpaired high surrogate: hand back the single unit.
        outbuf = first;
        return 2;
    }

    outbuf = 0x10000 + ((first - 0xD800) << 10) + (second - 0xDC00);
    return 4;
}
// Converts one Big5 character through big5_standard_to_unicode.
//   src    : input bytes
//   outbuf : receives the lookup result when the return value is > 0
//   len    : number of readable bytes at src
// Returns
//   1  : src[0] has its top bit clear and was looked up on its own
//   2  : src[0] and src[1] were looked up as one 16-bit code (src[0] high)
//   -1 : no input
//   -2 : src[0] has its top bit set but only one byte is available
inline int TDBdetect_code_page::a_Big5toUCS4(const unsigned char* src, unsigned int& outbuf, size_t len)
{
    if (len < 1)
        return -1;

    const bool top_bit_set = (src[0] & 0x80) != 0;
    if (!top_bit_set)
    {
        outbuf = big5_standard_to_unicode( src[0] );
        return 1;
    }

    if (len < 2)
        return -2;

    outbuf = big5_standard_to_unicode( (src[0] << 8) + src[1] );
    return 2;
}
// Converts one Big5-UAO character through big5_uao_to_unicode.
//   src    : input bytes
//   outbuf : receives the lookup result when the return value is > 0
//   len    : number of readable bytes at src
// Returns
//   1  : src[0] has its top bit clear and was looked up on its own
//   2  : src[0] and src[1] were looked up as one 16-bit code (src[0] high)
//   -1 : no input
//   -2 : src[0] has its top bit set but only one byte is available
//
// Both paths use big5_uao_to_unicode. Sending the two-byte codes to
// big5_standard_to_unicode instead would make this converter indistinguishable
// from a_Big5toUCS4 for every character whose first byte has the top bit set.
inline int TDBdetect_code_page::a_Big5UAOtoUCS4(const unsigned char* src, unsigned int& outbuf, size_t len)
{
    if(len < 1) return -1;

    if( ! ( src[0] & 0x80 ) )
    {
        outbuf = big5_uao_to_unicode( src[0] );
        return 1;
    }

    if(len < 2) return -2;

    outbuf = big5_uao_to_unicode( (src[0] << 8) + src[1] );
    return 2;
}
// Converts one Big5-2003 character through big5_2003_to_unicode.
//   src    : input bytes
//   outbuf : receives the lookup result when the return value is > 0
//   len    : number of readable bytes at src
// Returns
//   1  : src[0] has its top bit clear and was looked up on its own
//   2  : src[0] and src[1] were looked up as one 16-bit code (src[0] high)
//   -1 : no input
//   -2 : src[0] has its top bit set but only one byte is available
//
// Both paths use big5_2003_to_unicode. Sending the two-byte codes to
// big5_standard_to_unicode instead would make this converter indistinguishable
// from a_Big5toUCS4 for every character whose first byte has the top bit set.
inline int TDBdetect_code_page::a_Big52003toUCS4(const unsigned char* src, unsigned int& outbuf, size_t len)
{
    if(len < 1) return -1;

    if( ! ( src[0] & 0x80 ) )
    {
        outbuf = big5_2003_to_unicode( src[0] );
        return 1;
    }

    if(len < 2) return -2;

    outbuf = big5_2003_to_unicode( (src[0] << 8) + src[1] );
    return 2;
}
// Converts one Big5-HKSCS character through big5_hkscs_to_unicode.
//   src    : input bytes
//   outbuf : receives the lookup result when the return value is > 0
//   len    : number of readable bytes at src
// Returns
//   1  : src[0] has its top bit clear and was looked up on its own
//   2  : src[0] and src[1] were looked up as one 16-bit code (src[0] high)
//   -1 : no input
//   -2 : src[0] has its top bit set but only one byte is available
//
// Both paths use big5_hkscs_to_unicode. Sending the two-byte codes to
// big5_standard_to_unicode instead would make this converter indistinguishable
// from a_Big5toUCS4 for every character whose first byte has the top bit set.
inline int TDBdetect_code_page::a_Big5HKSCStoUCS4(const unsigned char* src, unsigned int& outbuf, size_t len)
{
    if(len < 1) return -1;

    if( ! ( src[0] & 0x80 ) )
    {
        outbuf = big5_hkscs_to_unicode( src[0] );
        return 1;
    }

    if(len < 2) return -2;

    outbuf = big5_hkscs_to_unicode( (src[0] << 8) + src[1] );
    return 2;
}
// Decodes the bytes in src (src_n_bytes of them) with the converter selected
// by code_page and appends every decoded value to *output_string.
//   code_page : one of the *_CODE constants handled in the switch below
// Returns the number of input bytes consumed. That is 0 for an unsupported
// code_page, and less than src_n_bytes when the input ends in the middle of
// a multi-byte sequence.
//
// Per-character handling, driven by the converter's return value:
//   > 0 : append the decoded value ('?' if the converter produced 0) and
//         advance by that many bytes
//   = 0 : undecodable byte, skip exactly one byte
//   < 0 : not enough bytes left for the sequence, stop here
size_t TDBdetect_code_page::ConvertFromSrctoResultString(int code_page)
{
    int (*pfun)(const unsigned char*, unsigned int&, size_t) = NULL;
    switch(code_page)
    {
        case UTF8_CODE:
        case ASCII_CODE:      pfun = &TDBdetect_code_page::a_UTF8toUCS4;      break;
        case UTF32BE_CODE:    pfun = &TDBdetect_code_page::a_UTF32BEtoUCS4;   break;
        case UTF32LE_CODE:    pfun = &TDBdetect_code_page::a_UTF32LEtoUCS4;   break;
        case UTF16BE_CODE:    pfun = &TDBdetect_code_page::a_UTF16BEtoUCS4;   break;
        case UTF16LE_CODE:    pfun = &TDBdetect_code_page::a_UTF16LEtoUCS4;   break;
        case Big5_CODE:       pfun = &TDBdetect_code_page::a_Big5toUCS4;      break;
        case Big5_HKSCS_CODE: pfun = &TDBdetect_code_page::a_Big5HKSCStoUCS4; break;
        case Big5_UAO_CODE:   pfun = &TDBdetect_code_page::a_Big5UAOtoUCS4;   break;
        case Big5_2003_CODE:  pfun = &TDBdetect_code_page::a_Big52003toUCS4;  break;
        // TODO (carried over from the original "Fix me, if = 0 case" note):
        // presumably refers to code_page == 0; any value not listed above
        // converts nothing.
        default: return 0;
    }

    // Plain locals: the 'register' storage class was removed in C++17, and the
    // converters return int, so the result is kept as int rather than narrowed.
    unsigned int temp = 0;
    size_t i = 0;
    while(i < src_n_bytes)
    {
        const int return_flag = pfun(src + i, temp, src_n_bytes - i);

        if(return_flag < 0)      // input ends inside a multi-byte sequence
            return i;

        if(return_flag == 0)     // undecodable byte: skip it
        {
            ++i;
            continue;
        }

        // A decoded value of 0 is replaced by '?'.
        const unsigned int unknow = '?';
        output_string->push_back(temp != 0 ? temp : unknow);
        i += return_flag;
    }
    return i;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -