📄 htmltokenizer.c
字号:
/******************************************************************************* * * htmltokenizer.c - HTML parsing engine * * Hash functions for identifying html tags. * * Cheetah Web Browser * Copyright (C) 2001 Garett Spencley * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * *******************************************************************************/#include <stdlib.h>#include <string.h>#include <ctype.h>#include "htmltokenizer.h"#include "html.h"#include "str_util.h"#define TOTAL_KEYWORDS 92#define MIN_WORD_LENGTH 1#define MAX_WORD_LENGTH 10#define MIN_HASH_VALUE 1#define MAX_HASH_VALUE 231/* * tag_hash() * * hashes html tags */__inline static unsigned int tag_hash(const char *str, unsigned int len){ static unsigned char asso_values[] = { 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 45, 40, 50, 20, 25, 30, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 90, 20, 75, 65, 47, 35, 10, 125, 115, 232, 20, 5, 5, 125, 7, 100, 0, 35, 40, 0, 55, 10, 232, 20, 35, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232 }; return len + asso_values[(unsigned char) str[len - 1]] + asso_values[(unsigned char) str[0]];}/* * lookup_tag() * * Returns the structure for the tag we are looking for */__inline html_token *lookup_tag(const char *str, unsigned int len){ /* DON'T TOUCH THIS! Use gperf on hash/tags.gperf */ static html_token wordlist[] = { {"Q", QUOTE, html_q}, {"TT", TT, html_tt}, {"TFOOT", TFOOT, html_tfoot}, {"OBJECT", OBJECT, html_object}, {"OL", OL, html_ol}, {"LABEL", LABEL, html_label}, {"BASEFONT", BASEFONT, html_basefont}, {"LINK", LINK, html_link}, {"BDO", BDO, html_bdo}, {"BIG", BIG, html_big}, {"TR", TR, html_tr}, {"FONT", FONT, html_font}, {"TBODY", TBODY, html_tbody}, {"B", BOLD, html_bold}, {"FIELDSET", FIELDSET, html_fieldset}, {"FRAMESET", FRAMESET, html_frameset}, {"FORM", FORM, html_form}, {"SCRIPT", SCRIPT, html_script}, {"SELECT", SELECT, html_select}, {"VAR", VAR, html_var}, {"SMALL", SMALL, html_small}, {"TITLE", TITLE, html_title}, {"TABLE", TABLE, html_table}, {"EM", EM, html_em}, {"STRONG", STRONG, html_strong}, {"BR", BR, html_br}, {"BODY", BODY, html_body}, {"UL", UL, html_ul}, {"SUB", SUB, html_sub}, {"MENU", MENU, html_menu}, {"TD", TD, html_td}, {"DT", DT, html_dt}, {"THEAD", THEAD, html_thead}, {"BASE", BASE, html_base}, {"DL", DL, html_dl}, {"DEL", STRIKE, html_strike}, {"LEGEND", LEGEND, html_legend}, {"BLOCKQUOTE", BLOCKQUOTE, html_blockquote}, {"DIV", DIV, html_div}, {"S", STRIKE, html_strike}, {"COL", COL, html_col}, {"FRAME", FRAME, html_frame}, {"KBD", KBD, html_kbd}, {"STYLE", STYLE, html_style}, {"STRIKE", STRIKE, html_strike}, {"APPLET", APPLET, html_applet}, {"TEXTAREA", TEXTAREA, html_textarea}, {"META", META, html_meta}, {"ACRONYM", ACRONYM, html_acronym}, {"DIR", DIRECTORY, html_dir}, {"MAP", MAP, html_map}, {"PARAM", PARAM, html_param}, {"U", UNDERLINE, html_underline}, {"OPTGROUP", OPTGROUP, html_optgroup}, {"CENTER", CENTER, html_center}, {"INPUT", INPUT, html_input}, {"LI", LI, html_li}, {"CODE", CODE, html_code}, {"CITE", CITE, html_cite}, {"TH", TH, html_th}, {"IMG", IMG, html_img}, {"ABBR", ABBR, html_abbr}, {"DD", DD, html_dd}, {"NOSCRIPT", NOSCRIPT, html_noscript}, {"HTML", HTML, html_html}, {"ADDRESS", ADDRESS, html_address}, {"OPTION", OPTION, html_option}, {"ISINDEX", ISINDEX, html_isindex}, {"SUP", SUP, html_sup}, {"SAMP", SAMP, html_samp}, {"H4", H4, html_header}, {"PRE", PRE, html_pre}, {"BUTTON", BUTTON, html_button}, {"H5", H5, html_header}, {"H6", H6, html_header}, {"INS", INS, html_ins}, {"HR", HR, html_hr}, {"NOBR", NOBR, html_nobr}, {"H2", H2, html_header}, {"IFRAME", IFRAME, html_iframe}, {"SPAN", SPAN, html_span}, {"H1", H1, html_header}, {"NOFRAMES", NOFRAMES, html_noframes}, {"H3", H3, html_header}, {"A", A, html_a}, {"COLGROUP", COLGROUP, html_colgroup}, {"AREA", AREA, html_area}, {"DFN", DFN, html_dfn}, {"HEAD", HEAD, html_head}, {"P", PARA, html_para}, {"CAPTION", CAPTION, html_caption}, {"I", ITALIC, html_italic} }; static short lookup[] = { -1, 0, 1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 7, 8, -1, -1, 9, -1, -1, -1, 10, -1, 11, 12, 13, -1, -167, 16, -1, -158, -1, 19, -1, 20, -1, -153, -1, 23, -1, 24, 25, -1, 26, -71, -2, 27, 28, 29, -75, -2, -161, -62, -2, 32, 33, 34, 35, -78, -2, 36, 37, 38, -1, -1, 39, -1, 40, -1, -1, -1, 41, 42, -1, -1, -1, 43, 44, -1, -1, 45, -1, 46, 47, -1, -1, 48, 49, -1, -1, -1, -1, 50, -1, 51, 52, -1, -1, -1, 53, 54, -1, -1, -1, 55, -1, 56, -1, -1, -1, -223, 59, 60, 61, -35, -2, 62, 63, 64, -1, -1, 65, 66, -1, -1, -1, 67, 68, 69, -1, -1, 70, -1, -1, 71, 72, 73, -1, -1, -1, -1, 74, 75, -1, -1, -1, 76, -1, 77, -1, -1, 78, 79, 80, -1, -1, 81, 82, -1, -1, -1, 83, -1, -1, -1, 84, -1, 85, 86, -1, -1, -1, -1, -1, -1, -1, -1, 87, 88, -1, -1, -1, -1, -1, -1, 89, -1, -1, -1, -1, -1, 90, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 91 }; if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) { register int key = tag_hash(str, len); if (key <= MAX_HASH_VALUE && key >= 0) { register int index = lookup[key]; if (index >= 0) { register const char *s = wordlist[index].tag; if ((*str == *s) && (my_strcmp(str + 1, s + 1) == 0)) return &wordlist[index]; } else if (index < -TOTAL_KEYWORDS) { register int offset = -1 - TOTAL_KEYWORDS - index; register html_token *wordptr = &wordlist[TOTAL_KEYWORDS + lookup[offset]]; register html_token *wordendptr = wordptr + -lookup[offset + 1]; while (wordptr < wordendptr) { register const char *s = wordptr->tag; if ((*str == *s) && (my_strcmp(str + 1, s + 1) == 0)) return wordptr; wordptr++; } } } } return NULL;}/* * get_tag() * * returns the tag id for the tag string 'tag' */html_token *get_tag(char *tag){ html_token *token; char buf[strlen(tag)+1]; char *string = buf; if (!tag) return NULL; while(*tag) { *string = toupper(*tag); ++tag; ++string; } *string = 0; token = lookup_tag(buf, strlen(buf)); if (!token) return NULL; return token;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -