charsets.c

来自「一个很有名的浏览器」· C语言代码 · 共 875 行 · 第 1/2 页
875 行
/* Charsets convertor *//* $Id: charsets.c,v 1.108.4.1 2005/04/05 21:08:41 jonas Exp $ */#ifdef HAVE_CONFIG_H#include "config.h"#endif#if HAVE_LANGINFO_CODESET#include <langinfo.h>#endif#include <ctype.h>#include <stdlib.h>#include "elinks.h"#include "document/options.h"#include "intl/charsets.h"#include "util/conv.h"#include "util/error.h"#include "util/fastfind.h"#include "util/memory.h"#include "util/string.h"#include "util/types.h"/* Fix namespace clash on MacOS. */#define table table_elinksstruct table_entry {	unsigned char c;	unicode_val u;};struct codepage_desc {	unsigned char *name;	unsigned char **aliases;	struct table_entry *table;};#include "codepage.inc"#include "uni_7b.inc"#include "entity.inc"static char strings[256][2] = {	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",	"\230", "\231", "\232", "\233", 
"\234", "\235", "\236", "\237",	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",};static voidfree_translation_table(struct conv_table *p){	int i;	for (i = 0; i < 256; i++)		if (p[i].t)			free_translation_table(p[i].u.tbl);	mem_free(p);}static unsigned char *no_str = "*";static voidnew_translation_table(struct conv_table *p){	int i;	for (i = 0; i < 256; i++)		if (p[i].t)			free_translation_table(p[i].u.tbl);	for (i = 0; i < 128; i++) {		p[i].t = 0;		p[i].u.str = strings[i];	}	for (; i < 256; i++) {		p[i].t = 0;	       	p[i].u.str = no_str;	}}#define BIN_SEARCH(table, entry, entries, key, result)					\{											\	long _s = 0, _e = (entries) - 1;						\											\	while (_s <= _e || !((result) = -1)) {						\		long _m = (_s + _e) / 2;						\											\		if ((table)[_m].entry == (key)) {					\			(result) = _m;							\			break;								\		}									\		if ((table)[_m].entry > (key)) _e = _m - 1;				\		if ((table)[_m].entry < (key)) _s = _m + 1;				\	}										\}											\static const unicode_val strange_chars[32] = {0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 
0x0000,};#define SYSTEM_CHARSET_FLAG 128unsigned char *u2cp_(unicode_val u, int to, int no_nbsp_hack){	int j;	int s;	if (u < 128) return strings[u];	/* To mark non breaking spaces, we use a special char NBSP_CHAR. */	if (u == 0xa0) return no_nbsp_hack ? " " : NBSP_CHAR_STRING;	if (u == 0xad) return "";	if (u < 0xa0) {		unicode_val strange = strange_chars[u - 0x80];		if (!strange) return NULL;		return u2cp_(strange, to, no_nbsp_hack);	}	to &= ~SYSTEM_CHARSET_FLAG;	for (j = 0; codepages[to].table[j].c; j++)		if (codepages[to].table[j].u == u)			return strings[codepages[to].table[j].c];	BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);	if (s != -1) return unicode_7b[s].s;	return no_str;}static unsigned char utf_buffer[7];static unsigned char *encode_utf_8(unicode_val u){	memset(utf_buffer, 0, 7);	if (u < 0x80)		utf_buffer[0] = u;	else if (u < 0x800)		utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),		utf_buffer[1] = 0x80 | (u & 0x3f);	else if (u < 0x10000)		utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),		utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),		utf_buffer[2] = 0x80 | (u & 0x3f);	else if (u < 0x200000)		utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),		utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),		utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),		utf_buffer[3] = 0x80 | (u & 0x3f);	else if (u < 0x4000000)		utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),		utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),		utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),		utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),		utf_buffer[4] = 0x80 | (u & 0x3f);	else	utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),		utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),		utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),		utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),		utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),		utf_buffer[5] = 0x80 | (u & 0x3f);	return utf_buffer;}/* This slow and ugly code is used by the terminal utf_8_io */unsigned char *cp2utf_8(int from, int c){	int j;	from &= ~SYSTEM_CHARSET_FLAG;	if (codepages[from].table == table_utf_8 || c < 128)		return strings[c];	
for (j = 0; codepages[from].table[j].c; j++)		if (codepages[from].table[j].c == c)			return encode_utf_8(codepages[from].table[j].u);	return encode_utf_8(UCS_NO_CHAR);}static voidadd_utf_8(struct conv_table *ct, unicode_val u, unsigned char *str){	unsigned char *p = encode_utf_8(u);	while (p[1]) {		if (ct[*p].t) ct = ct[*p].u.tbl;		else {			struct conv_table *nct;			assertm(ct[*p].u.str == no_str, "bad utf encoding #1");			if_assert_failed return;			nct = mem_calloc(256, sizeof(*nct));			if (!nct) return;			new_translation_table(nct);			ct[*p].t = 1;			ct[*p].u.tbl = nct;			ct = nct;		}		p++;	}	assertm(!ct[*p].t, "bad utf encoding #2");	if_assert_failed return;	if (ct[*p].u.str == no_str)		ct[*p].u.str = str;}struct conv_table utf_table[256];int utf_table_init = 1;static voidfree_utf_table(void){	int i;	for (i = 128; i < 256; i++)		mem_free(utf_table[i].u.str);}static struct conv_table *get_translation_table_to_utf_8(int from){	int i;	static int lfr = -1;	if (from == -1) return NULL;	from &= ~SYSTEM_CHARSET_FLAG;	if (from == lfr) return utf_table;	if (utf_table_init)		memset(utf_table, 0, sizeof(utf_table)),		utf_table_init = 0;	else		free_utf_table();	for (i = 0; i < 128; i++)		utf_table[i].u.str = strings[i];	if (codepages[from].table == table_utf_8) {		for (i = 128; i < 256; i++)			utf_table[i].u.str = stracpy(strings[i]);		return utf_table;	}	for (i = 128; i < 256; i++)		utf_table[i].u.str = NULL;	for (i = 0; codepages[from].table[i].c; i++) {		unicode_val u = codepages[from].table[i].u;		if (!utf_table[codepages[from].table[i].c].u.str)			utf_table[codepages[from].table[i].c].u.str =				stracpy(encode_utf_8(u));	}	for (i = 128; i < 256; i++)		if (!utf_table[i].u.str)			utf_table[i].u.str = stracpy(no_str);	return utf_table;}struct conv_table table[256];static int first = 1;voidfree_conv_table(void){	if (!utf_table_init) free_utf_table();	if (first) {		memset(table, 0, sizeof(table));		first = 0;	}	new_translation_table(table);}struct conv_table 
*get_translation_table(int from, int to){	static int lfr = -1;	static int lto = -1;	from &= ~SYSTEM_CHARSET_FLAG;	to &= ~SYSTEM_CHARSET_FLAG;	if (first) {		memset(table, 0, sizeof(table));		first = 0;	}	if (/*from == to ||*/ from == -1 || to == -1)		return NULL;	if (codepages[to].table == table_utf_8)		return get_translation_table_to_utf_8(from);	if (from == lfr && to == lto)		return table;	lfr = from;	lto = to;	new_translation_table(table);	if (codepages[from].table == table_utf_8) {		int i;		for (i = 0; codepages[to].table[i].c; i++)			add_utf_8(table, codepages[to].table[i].u,				  strings[codepages[to].table[i].c]);		for (i = 0; unicode_7b[i].x != -1; i++)			if (unicode_7b[i].x >= 0x80)				add_utf_8(table, unicode_7b[i].x,					  unicode_7b[i].s);	} else {		int i;		for (i = 128; i < 256; i++) {			int j;			for (j = 0; codepages[from].table[j].c; j++) {				if (codepages[from].table[j].c == i) {					unsigned char *u;					u = u2cp(codepages[from].table[j].u, to);					if (u) table[i].u.str = u;					break;				}			}		}	}	return table;}static inline intxxstrcmp(unsigned char *s1, unsigned char *s2, int l2){	while (l2) {		if (*s1 > *s2) return 1;		if (*s1 < *s2) return -1;		s1++;	       	s2++;		l2--;	}	return *s2 ? -1 : 0;}/* Entity cache debugging purpose. */#if 0#define DEBUG_ENTITY_CACHE#else#undef DEBUG_ENTITY_CACHE#endifstruct entity_cache {	unsigned int hits;	int strlen;	int encoding;	unsigned char *result;	unsigned char str[20]; /* Suffice in any case. 
*/};static inthits_cmp(struct entity_cache *a, struct entity_cache *b){	if (a->hits == b->hits) return 0;	if (a->hits > b->hits) return -1;	else return 1;}static intcompare_entities(const void *key_, const void *element_){	struct string *key = (struct string *) key_;	struct entity *element = (struct entity *) element_;	int length = key->length;	unsigned char *first = key->source;	unsigned char *second = element->s;	return xxstrcmp(first, second, length);}unsigned char *get_entity_string(const unsigned char *str, const int strlen, int encoding){#define ENTITY_CACHE_SIZE 10	/* 10 seems a good value. */#define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1			           will go in [0] table */	static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];	static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];	static int first_time = 1;	unsigned int slen;	unsigned char *result = NULL;	if (strlen <= 0) return NULL;
charsets.c - 源码说明

本页面展示了「一个很有名的浏览器」中的 charsets.c 源码文件，该文件采用 C 语言编写，共 875 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与浏览器相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?