/* File: utf8fix.c */
/*
AutoConvert, a Chinese HZ/GB/Big5 encodings auto-converter
Copyright (C) 1999 于广辉 Yu Guanghui <ygh@debian.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or any
later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
*/
/*
* author: Yu Guanghui <ygh@debian.org>
* Network Center
* Dalian Univ. of Tech.
* 1999.5
*/
/*
*Author: Ha Shao <hashao@china.com>
*Date: 2000.08.30
*/
/* Judge Encoding based on character frequency.
* We will use 2 hash tables generated from GNU gperf
*/
// Platform settings
#include <OggOs.h>
// This file is for non PLUGIN_SYSTEM only
#if !defined(PLUGIN_SYSTEM)
//#include "zhstatis.h"
#include "Utf8Fix.h"
/////////////////////////////////////////////////
// Locals
//
// not used, #define GBTOPPER 3.500849 /* Top GB frequency. */
// not used, #define B5TOPPER 3.803567 /* Top Big5 frequency. */
// not used, #define MAX_MODULE 16
// not used, #define MAX_BUFFER 8192
/*
 * One row of a character-frequency table (the element type of the
 * gperf-generated word lists in this file).
 */
struct charHz
{
    /* Key string; in the visible GB table each key is a single two-byte character. */
    char *name;
    /* Frequency figure attached to the key (presumably a percentage, as the
     * field name suggests -- confirm against the table generator input). */
    double percent;
};
/////////////////////////////////////////////////
// Local Function Prototypes
//
/*
 * Hash and lookup helpers for the character-frequency tables.
 * gbHash()/inGB() are the gperf-generated pair for the GB table.
 * NOTE(review): b5Hash()/inBig5() are presumably the matching pair for the
 * Big5 table; their definitions are not visible in this part of the file.
 */
static unsigned int b5Hash ( register const char *str, register unsigned int len );
static unsigned int gbHash ( register const char *str, register unsigned int len );
static const struct charHz* inBig5( register const char *str, register unsigned int len );
static const struct charHz* inGB( register const char *str, register unsigned int len );
/* Fallback classifier that j_code() calls when its quick character count ties. */
static int j_code3( const char * buff, int count );
/* ANSI-C code produced by gperf version 2.7.1 (19981006 egcs) */
/* Command-line: gperf -L ANSI-C -I -t -H gbHash -N inGB -C -E -o -n -c -D gbpercent.txt */
/****************************************************
400 Most frequently used GB chars.
****************************************************/
/* maximum key range = 991, duplicates = 5 */
/////////////////////////////////////////////////
// Global Functions
//
/*
 * EXPORTED is the linkage decoration placed in front of jcode():
 * it expands to nothing when OGGPLAYPLUGIN is defined and to EXPORT_C
 * otherwise.
 * NOTE(review): EXPORT_C is not defined in this file; presumably it is
 * supplied by OggOs.h -- confirm.
 */
#ifdef OGGPLAYPLUGIN
#define EXPORTED
#else
#define EXPORTED EXPORT_C
#endif
/*
 * Convenience entry point: classify a whole string.
 *
 * buff - text to classify; its length is taken with _ogg_strlen(), so it
 *        must be a valid terminated string.
 *
 * Returns whatever j_code() returns for the full length of buff.
 */
EXPORTED int jcode(const char* buff)
{
    int length = _ogg_strlen(buff);

    return j_code(buff, length);
}
/*
 * Guess which Chinese encoding a byte buffer uses.
 *
 * The first pass walks the first `count` bytes and counts two very common
 * characters, "de" (possessive particle) and "wo" (I/me), in each encoding:
 *   GB   : B5 C4 ("de"), CE D2 ("wo")
 *   Big5 : AA BA ("de"), A7 DA ("wo")
 * Any byte with the high bit set is treated as the lead byte of a two-byte
 * character, so the scan advances two bytes at a time over such bytes.
 *
 * buff  - text to classify. It must be a terminated string as well, because
 *         the HZ check below searches it with _ogg_strstr() and does not
 *         take `count` into account.
 * count - number of bytes of buff examined by the counting pass. A lead byte
 *         that is the last byte of this range is ignored rather than paired
 *         with the byte beyond the range.
 *
 * Returns GB_CODE when more GB pairs than Big5 pairs were seen and BIG5_CODE
 * in the opposite case. When the counts are equal (both zero, or the same
 * non-zero number) it returns HZ_CODE if both "~{" and "~}" occur in buff,
 * and otherwise the result of j_code3(buff, count).
 */
int j_code(const char* buff, int count)
{
    const unsigned char *bytes = (const unsigned char *)buff;
    char hz_start[] = "~{";
    char hz_end[] = "~}";
    int c_gb = 0;
    int c_big5 = 0;
    int i = 0;

    while (i < count) {
        unsigned char lead = bytes[i];
        unsigned char trail;

        if (!(lead & 0x80)) {
            /* Plain 7-bit byte: single-byte character. */
            i++;
            continue;
        }
        if (i + 1 >= count) {
            /* Dangling lead byte at the end of the range: there is no trail
             * byte inside [0, count), so do not read past the range. */
            break;
        }
        trail = bytes[i + 1];

        if ((lead == 0xB5 && trail == 0xC4) || (lead == 0xCE && trail == 0xD2)) {
            c_gb++;
        } else if ((lead == 0xAA && trail == 0xBA) || (lead == 0xA7 && trail == 0xDA)) {
            c_big5++;
        }
        /* Matched or not, a high-bit byte consumes a two-byte character. */
        i += 2;
    }

    if (c_gb > c_big5) {
        return GB_CODE;
    }
    if (c_gb < c_big5) {
        return BIG5_CODE;
    }

    /* Tie: the counts are equal, which includes the case where neither
     * character was found at all. */
    if (_ogg_strstr(buff, hz_start) != NULL && _ogg_strstr(buff, hz_end) != NULL) {
        return HZ_CODE;
    }

    /* The two marker characters did not decide it, so fall back to the
     * test against the 400-character frequency table. */
    return j_code3(buff, count);
}
/////////////////////////////////////////////////
// Local Functions
//
/*
 * Hash function for the GB frequency table (gperf output).
 *
 * The hash is the sum of the association values of the first and the last
 * byte of the key; the key length itself is not mixed in.
 *
 * str - key bytes.
 * len - number of bytes in str; must be at least 1, since str[len - 1] is read.
 *
 * The table below is laid out 16 entries per row, so each row corresponds to
 * one high nibble of the byte value (0x00-0x0F, 0x10-0x1F, ...).
 * NOTE(review): 991 is one more than the MAX_HASH_VALUE of 990 used by inGB(),
 * so it presumably marks bytes that occur in no keyword -- confirm against
 * the gperf documentation.
 */
#ifdef __GNUC__
__inline
#endif
static unsigned int
gbHash (register const char *str, register unsigned int len)
{
    static const unsigned short assoc[] =
    {
        /* 0x00 */ 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991,
        /* 0x10 */ 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991,
        /* 0x20 */ 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991,
        /* 0x30 */ 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991,
        /* 0x40 */ 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991,
        /* 0x50 */ 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991,
        /* 0x60 */ 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991,
        /* 0x70 */ 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991,
        /* 0x80 */ 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991,
        /* 0x90 */ 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991, 991,
        /* 0xA0 */ 991,  91, 425, 320, 148,  70, 510,  85, 203, 160,   6,  18, 480, 223, 113, 345,
        /* 0xB0 */  28, 120, 320, 390,  75,  20, 275, 380, 230,  50, 433,  15, 145,   5, 455, 460,
        /* 0xC0 */ 163, 180, 338,  40,   8, 508, 510,  45, 325,  85,  30, 140, 105, 305,   3, 225,
        /* 0xD0 */ 100, 403, 113, 480,  25, 205,   0, 250, 393, 371,  63, 260, 268, 991,  10, 470,
        /* 0xE0 */ 405, 268, 348,  76, 285, 131, 408, 398, 991, 125, 330, 126, 388, 503,  51, 148,
        /* 0xF0 */ 350,  46,   1, 166,  15,  16, 485, 106, 458,  41, 103, 221,  21,  61, 500, 991
    };
    /* Index through unsigned char so bytes >= 0x80 never become negative indices. */
    const unsigned char first_byte = (unsigned char)str[0];
    const unsigned char last_byte = (unsigned char)str[len - 1];

    return assoc[last_byte] + assoc[first_byte];
}
#ifdef __GNUC__
__inline
#endif
static const struct charHz *
inGB (register const char *str, register unsigned int len)
{
enum
{
TOTAL_KEYWORDS = 400,
MIN_WORD_LENGTH = 2,
MAX_WORD_LENGTH = 2,
MIN_HASH_VALUE = 0,
MAX_HASH_VALUE = 990
};
static const struct charHz wordlist[] =
{
{"种", 0.151748},
{"治", 0.062792},
{"知", 0.176508},
{"为", 0.442767},
{"文", 0.214763},
{"无", 0.213959},
{"只", 0.229438},
{"或", 0.085280},
{"位", 0.128747},
{"交", 0.073778},
{"华", 0.094296},
{"将", 0.168043},
{"到", 0.514231},
{"近", 0.063487},
{"的", 3.500849},
{"手", 0.211674},
{"元", 0.071841},
{"问", 0.140707},
{"式", 0.057680},
{"但", 0.167690},
{"怎", 0.072121},
{"话", 0.146944},
{"术", 0.070839},
{"务", 0.081974},
{"实", 0.156769},
{"今", 0.083052},
{"那", 0.382771},
{"没", 0.244614},
{"基", 0.057849},
{"毛", 0.075090},
{"得", 0.396983},
{"命", 0.093383},
{"众", 0.058726},
{"还", 0.233198},
{"活", 0.092784},
{"公", 0.188655},
{"倒", 0.054738},
{"内", 0.111181},
{"前", 0.200232},
{"是", 1.345411},
{"打", 0.129442},
{"次", 0.121678},
{"未", 0.054546},
{"使", 0.118288},
{"第", 0.123170},
{"你", 0.436374},
{"们", 0.495768},
{"民", 0.174189},
{"在", 0.964866},
{"山", 0.103285},
{"数", 0.091243},
{"传", 0.063780},
{"点", 0.152685},
{"中", 0.717652},
{"甚", 0.058755},
{"色", 0.074190},
{"叫", 0.089328},
{"主", 0.204736},
{"心", 0.273937},
{"西", 0.117537},
{"教", 0.117812},
{"过", 0.327524},
{"之", 0.327879},
{"因", 0.127394},
{"么", 0.276714},
{"我", 1.014949},
{"机", 0.137512},
{"要", 0.370943},
{"直", 0.075193},
{"条", 0.064573},
{"太", 0.103034},
{"性", 0.095783},
{"声", 0.151640},
{"一", 1.703619},
{"市", 0.101242},
{"神", 0.091301},
{"业", 0.134539},
{"五", 0.103181},
{"被", 0.126292},
{"深", 0.060073},
{"以", 0.349432},
{"当", 0.222406},
{"爱", 0.073591},
{"十", 0.211422},
{"员", 0.097189},
{"明", 0.152480},
{"四", 0.123866},
{"时", 0.412130},
{"物", 0.078221},
{"国", 0.624527},
{"半", 0.061338},
{"书", 0.098381},
{"钱", 0.055097},
{"说", 0.508134},
{"它", 0.075883},
{"极", 0.055787},
{"运", 0.061514},
{"听", 0.115919},
{"八", 0.075698},
{"情", 0.143753},
{"请", 0.073937},
{"及", 0.083070},
{"写", 0.056349},
{"清", 0.078525},
{"代", 0.104545},
{"至", 0.079151},
{"林", 0.074843},
{"所", 0.197660},
{"道", 0.421135},
{"两", 0.176988},
{"联", 0.064201},
{"生", 0.343557},
{"记", 0.077728},
{"小", 0.293018},
{"世", 0.114808},
{"留", 0.063376},
{"便", 0.128778},
{"工", 0.149985},
{"行", 0.203868},
{"她", 0.305242},
{"美", 0.152936},
{"建", 0.074596},
{"战", 0.076361},
{"求", 0.059639},
{"李", 0.057653},
{"此", 0.158364},
{"算", 0.073128},
{"花", 0.065122},
{"即", 0.059687},
{"六", 0.070030},
{"达", 0.061845},
{"现", 0.169370},
{"向", 0.143764},
{"江", 0.058127},
{"指", 0.068890},
{"令", 0.055467},
{"南", 0.074475},
{"收", 0.055267},
{"体", 0.100845},
{"来", 0.642527},
{"北", 0.113407},
{"大", 0.670268},
{"义", 0.072306},
{"系", 0.085791},
{"原", 0.089937},
{"字", 0.074014},
{"更", 0.087664},
{"水", 0.102945},
{"展", 0.063943},
{"转", 0.059482},
{"家", 0.324818},
{"些", 0.166013},
{"名", 0.142495},
{"正", 0.143041},
{"周", 0.062711},
{"该", 0.063875},
{"果", 0.084231},
{"结", 0.070334},
{"自", 0.340950},
{"能", 0.253537},
{"白", 0.089789},
{"资", 0.066471},
{"会", 0.342850},
{"思", 0.068379},
{"流", 0.067758},
{"武", 0.061514},
{"技", 0.058356},
{"东", 0.121345},
{"亲", 0.084066},
{"受", 0.084736},
{"对", 0.311761},
{"远", 0.066691},
{"最", 0.134213},
{"死", 0.089264},
{"识", 0.056331},
{"万", 0.101912},
{"台", 0.078054},
{"上", 0.636581},
{"里", 0.300629},
{"轻", 0.058334},
{"干", 0.076271},
{"儿", 0.146378},
{"管", 0.067145},
{"了", 1.283668},
{"王", 0.084392},
{"报", 0.094190},
{"步", 0.060622},
{"任", 0.079391},
{"感", 0.088464},
{"跟", 0.065369},
{"不", 1.307755},
{"各", 0.080068},
{"年", 0.402590},
{"然", 0.251950},
{"住", 0.097438},
{"全", 0.151277},
{"等", 0.136630},
{"往", 0.064102},
{"安", 0.090869},
{"什", 0.130740},
{"想", 0.217789},
{"解", 0.081322},
{"金", 0.088088},
{"作", 0.224425},
{"德", 0.055379},
{"化", 0.107695},
{"他", 0.826706},
{"月", 0.152597},
{"让", 0.073580},
{"见", 0.196038},
{"事", 0.229908},
{"共", 0.077287},
{"提", 0.088749},
{"几", 0.135949},
{"革", 0.060397},
{"分", 0.168552},
{"夫", 0.067138},
{"利", 0.084762},
{"方", 0.202303},
{"三", 0.185462},
{"老", 0.182608},
{"持", 0.053845},
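/*
 * NOTE: the text that stood here was the code-viewer page's keyboard-shortcut
 * help, not part of the program:
 *   copy code: Ctrl + C; search code: Ctrl + F; full screen: F11;
 *   toggle theme: Ctrl + Shift + D; show shortcuts: ?;
 *   increase font size: Ctrl + =; decrease font size: Ctrl + -
 * The source listing is truncated at this point.
 */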