📄 charsets.c
字号:
if (first_time) { memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int)); first_time = 0; } /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website * + google + slashdot + websites that result from a search for test on google, * + various ones) show a quite impressive improvment: * Top ten is: * 0: hits=2459 l=4 st='nbsp' * 1: hits=2152 l=6 st='eacute' * 2: hits=235 l=6 st='egrave' * 3: hits=136 l=6 st='agrave' * 4: hits=100 l=3 st='amp' * 5: hits=40 l=5 st='laquo' * 6: hits=8 l=4 st='copy' * 7: hits=5 l=2 st='gt' * 8: hits=2 l=2 st='lt' * 9: hits=1 l=6 st='middot' * * Most of the time cache hit ratio is near 95%. * * A long test shows: 15186 hits vs. 24 misses and mean iteration * count is kept < 2 (worst case 1.58). Not so bad ;) * * --Zas */ /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */ slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0; if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) { int i; for (i = 0; i < nb_entity_cache[slen]; i++) { if (entity_cache[slen][i].encoding == encoding && !memcmp(str, entity_cache[slen][i].str, strlen)) {#ifdef DEBUG_ENTITY_CACHE static double total_iter = 0; static unsigned long hit_count = 0; total_iter += i + 1; hit_count++; fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);#endif if (entity_cache[slen][i].hits < (unsigned int) ~0) entity_cache[slen][i].hits++; return entity_cache[slen][i].result; } }#ifdef DEBUG_ENTITY_CACHE fprintf(stderr, "miss\n");#endif } if (*str == '#') { /* Numeric entity. */ int l = (int) strlen; unsigned char *st = (unsigned char *) str; unicode_val n = 0; if (l == 1) goto end; /* &#; ? */ st++, l--; if ((*st | 32) == 'x') { /* Hexadecimal */ if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */ st++, l--; do { unsigned char c = (*(st++) | 32); if (isdigit(c)) n = (n << 4) | (c - '0'); else if (isxdigit(c)) n = (n << 4) | (c - 'a' + 10); else goto end; /* Bad char. */ } while (--l); } else { /* Decimal */ if (l > 10) goto end; /* 4294967295 max. */ do { unsigned char c = *(st++); if (isdigit(c)) n = n * 10 + c - '0'; else goto end; /* Bad char. */ /* Limit to 0xFFFFFFFF. */ if (n == (unicode_val) 0xFFFFFFFF) goto end; } while (--l); } result = u2cp(n, encoding);#ifdef DEBUG_ENTITY_CACHE fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);#endif } else { /* Text entity. */ struct string key = INIT_STRING((unsigned char *) str, strlen); struct entity *element = bsearch((void *) &key, entities, N_ENTITIES, sizeof(*element), compare_entities); if (element) result = u2cp(element->c, encoding); }end: /* Take care of potential buffer overflow. */ if (strlen < sizeof(entity_cache[slen][0].str)) { struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]]; /* Copy new entry to cache. */ ece->hits = 1; ece->strlen = strlen; ece->encoding = encoding; ece->result = result; memcpy(ece->str, str, strlen); ece->str[strlen] = '\0'; /* Increment number of cache entries if possible. */ if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;#ifdef DEBUG_ENTITY_CACHE fprintf(stderr, "Added in [%d]: l=%d st='%s'\n", slen, entity_cache[slen][0].strlen, entity_cache[slen][0].str);#endif /* Sort entries by hit order. */ if (nb_entity_cache[slen] > 1) qsort(&entity_cache[slen][0], nb_entity_cache[slen], sizeof(entity_cache[slen][0]), (void *) hits_cmp);#ifdef DEBUG_ENTITY_CACHE { unsigned int i; fprintf(stderr, "- Cache entries [%d] -\n", slen); for (i = 0; i < nb_entity_cache[slen] ; i++) fprintf(stderr, "%d: hits=%d l=%d st='%s'\n", i, entity_cache[slen][i].hits, entity_cache[slen][i].strlen, entity_cache[slen][i].str); fprintf(stderr, "-----------------\n"); }#endif } return result;}unsigned char *convert_string(struct conv_table *convert_table, unsigned char *chars, int charslen, enum convert_string_mode mode, int *length, void (*callback)(void *data, unsigned char *buf, int buflen), void *callback_data){ unsigned char *buffer; int bufferpos = 0; int charspos = 0; if (!convert_table && !memchr(chars, '&', charslen)) { if (callback) { if (charslen) callback(callback_data, chars, charslen); return NULL; } else { return memacpy(chars, charslen); } } /* Buffer allocation */ buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */); if (!buffer) return NULL; /* Iterate ;-) */ while (charspos < charslen) { unsigned char *translit;#define PUTC do { \ buffer[bufferpos++] = chars[charspos++]; \ translit = ""; \ goto flush; \ } while (0) if (chars[charspos] != '&') { struct conv_table *t; int i; if (chars[charspos] < 128 || !convert_table) PUTC; t = convert_table; i = charspos; while (t[chars[i]].t) { t = t[chars[i++]].u.tbl; if (i >= charslen) PUTC; } translit = t[chars[i]].u.str; charspos = i + 1; } else if (mode == CSM_FORM || mode == CSM_NONE) { PUTC; } else { int start = charspos + 1; int i = start; while (i < charslen && (isasciialpha(chars[i]) || isdigit(chars[i]) || (chars[i] == '#'))) i++; /* This prevents bug 213: we were expanding "entities" * in URL query strings. */ /* XXX: But this disables    usage, which * appears to be relatively common! --pasky */ if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '=')) && i > start && !isasciialpha(chars[i]) && !isdigit(chars[i])) { translit = get_entity_string(&chars[start], i - start, global_doc_opts->cp); if (chars[i] != ';') { /* Eat    <foo> happily, but * pull back from the character after * entity string if it is not the valid * terminator. */ i--; } if (!translit) PUTC; charspos = i + (i < charslen); } else PUTC; } if (!translit[0]) continue; if (!translit[1]) { buffer[bufferpos++] = translit[0]; translit = ""; goto flush; } while (*translit) { unsigned char *new; buffer[bufferpos++] = *(translit++);flush: if (bufferpos & (ALLOC_GR - 1)) continue; if (callback) { buffer[bufferpos] = 0; callback(callback_data, buffer, bufferpos); bufferpos = 0; } else { new = mem_realloc(buffer, bufferpos + ALLOC_GR); if (!new) { mem_free(buffer); return NULL; } buffer = new; } }#undef PUTC } /* Say bye */ buffer[bufferpos] = 0; if (length) *length = bufferpos; if (callback) { if (bufferpos) callback(callback_data, buffer, bufferpos); mem_free(buffer); return NULL; } else { return buffer; }}#ifndef USE_FASTFINDintget_cp_index(unsigned char *name){ int i, a; int syscp = 0; if (!strcasecmp(name, "System")) {#if HAVE_LANGINFO_CODESET name = nl_langinfo(CODESET); syscp = SYSTEM_CHARSET_FLAG;#else name = "us-ascii";#endif } for (i = 0; codepages[i].name; i++) { for (a = 0; codepages[i].aliases[a]; a++) { /* In the past, we looked for the longest substring * in all the names; it is way too expensive, though: * * % cumulative self self total * time seconds seconds calls us/call us/call name * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index * * Anything called from redraw_screen() is in fact * relatively expensive, even if it's called just * once. So we will do a simple strcasecmp() here. */ if (!strcasecmp(name, codepages[i].aliases[a])) return i | syscp; } } if (syscp) { return get_cp_index("us-ascii") | syscp; } else { return -1; }}#elsestatic unsigned int i_name = 0;static unsigned int i_alias = 0;/* Reset internal list pointer */voidcharsets_list_reset(void){ i_name = 0; i_alias = 0;}/* Returns a pointer to a struct that contains current key and data pointers * and increment internal pointer. It returns NULL when key is NULL. */struct fastfind_key_value *charsets_list_next(void){ static struct fastfind_key_value kv; if (!codepages[i_name].name) return NULL; kv.key = codepages[i_name].aliases[i_alias]; kv.data = &codepages[i_name]; if (codepages[i_name].aliases[i_alias + 1]) i_alias++; else { i_name++; i_alias = 0; } return &kv;}static struct fastfind_index ff_charsets_index = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);/* It searchs for a charset named @name or one of its aliases and * returns index for it or -1 if not found. */intget_cp_index(unsigned char *name){ struct codepage_desc *codepage; int syscp = 0; if (!strcasecmp(name, "System")) {#if HAVE_LANGINFO_CODESET name = nl_langinfo(CODESET); syscp = SYSTEM_CHARSET_FLAG;#else name = "us-ascii";#endif } codepage = fastfind_search(&ff_charsets_index, name, strlen(name)); if (codepage) { assert(codepages <= codepage && codepage < codepages + N_CODEPAGES); return (codepage - codepages) | syscp; } else if (syscp) { return get_cp_index("us-ascii") | syscp; } else { return -1; }}#endif /* USE_FASTFIND */voidinit_charsets_lookup(void){#ifdef USE_FASTFIND fastfind_index(&ff_charsets_index, FF_COMPRESS);#endif}voidfree_charsets_lookup(void){#ifdef USE_FASTFIND fastfind_done(&ff_charsets_index);#endif}unsigned char *get_cp_name(int cp_index){ if (cp_index < 0) return "none"; if (cp_index & SYSTEM_CHARSET_FLAG) return "System"; return codepages[cp_index].name;}unsigned char *get_cp_mime_name(int cp_index){ if (cp_index < 0) return "none"; if (cp_index & SYSTEM_CHARSET_FLAG) return "System"; if (!codepages[cp_index].aliases) return NULL; return codepages[cp_index].aliases[0];}intis_cp_special(int cp_index){ cp_index &= ~SYSTEM_CHARSET_FLAG; return codepages[cp_index].table == table_utf_8;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -