📄 htmltokenizer.c

📁 微型浏览器
💻 C
字号:
/******************************************************************************* * * htmltokenizer.c - HTML parsing engine * * Hash functions for identifying html tags. * * Cheetah Web Browser * Copyright (C) 2001 Garett Spencley  *  * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  * *******************************************************************************/#include <stdlib.h>#include <string.h>#include <ctype.h>#include "htmltokenizer.h"#include "html.h"#include "str_util.h"#define TOTAL_KEYWORDS 92#define MIN_WORD_LENGTH 1#define MAX_WORD_LENGTH 10#define MIN_HASH_VALUE 1#define MAX_HASH_VALUE 231/* * tag_hash() * * hashes html tags */__inline static unsigned int tag_hash(const char *str, unsigned int len){	static unsigned char asso_values[] = {		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232,  45,       	40,  50,  20,  25,  30, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232,  90,  20,  75,  65,  47,       	35,  10, 125, 115, 232,  20,   5,   5, 125,   7,      	100,   0,  35,  40,   0,  55,  10, 232,  20,  35,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232, 232, 232, 232, 232,      	232, 232, 232, 232, 232, 232	};	return len + asso_values[(unsigned char) str[len - 1]] + asso_values[(unsigned char) str[0]];}/* * lookup_tag() *  * Returns the structure for the tag we are looking for */__inline html_token *lookup_tag(const char *str, unsigned int len){	/* DON'T TOUCH THIS! Use gperf on hash/tags.gperf */			static html_token wordlist[] = {	  {"Q", QUOTE, html_q},      {"TT", TT, html_tt},      {"TFOOT", TFOOT, html_tfoot},      {"OBJECT", OBJECT, html_object},      {"OL", OL, html_ol},      {"LABEL", LABEL, html_label},      {"BASEFONT", BASEFONT, html_basefont},      {"LINK", LINK, html_link},      {"BDO", BDO, html_bdo},      {"BIG", BIG, html_big},      {"TR", TR, html_tr},      {"FONT", FONT, html_font},      {"TBODY", TBODY, html_tbody},      {"B", BOLD, html_bold},      {"FIELDSET", FIELDSET, html_fieldset},      {"FRAMESET", FRAMESET, html_frameset},      {"FORM", FORM, html_form},      {"SCRIPT", SCRIPT, html_script},      {"SELECT", SELECT, html_select},      {"VAR", VAR, html_var},      {"SMALL", SMALL, html_small},      {"TITLE", TITLE, html_title},      {"TABLE", TABLE, html_table},      {"EM", EM, html_em},      {"STRONG", STRONG, html_strong},      {"BR", BR, html_br},      {"BODY", BODY, html_body},      {"UL", UL, html_ul},      {"SUB", SUB, html_sub},      {"MENU", MENU, html_menu},      {"TD", TD, html_td},      {"DT", DT, html_dt},      {"THEAD", THEAD, html_thead},      {"BASE", BASE, html_base},      {"DL", DL, html_dl},      {"DEL", STRIKE, html_strike},      {"LEGEND", LEGEND, html_legend},      {"BLOCKQUOTE", BLOCKQUOTE, html_blockquote},      {"DIV", DIV, html_div},      {"S", STRIKE, html_strike},      {"COL", COL, html_col},      {"FRAME", FRAME, html_frame},      {"KBD", KBD, html_kbd},      {"STYLE", STYLE, html_style},	  {"STRIKE", STRIKE, html_strike},      {"APPLET", APPLET, html_applet},      {"TEXTAREA", TEXTAREA, html_textarea},      {"META", META, html_meta},      {"ACRONYM", ACRONYM, html_acronym},      {"DIR", DIRECTORY, html_dir},      {"MAP", MAP, html_map},      {"PARAM", PARAM, html_param},      {"U", UNDERLINE, html_underline},      {"OPTGROUP", OPTGROUP, html_optgroup},      {"CENTER", CENTER, html_center},      {"INPUT", INPUT, html_input},      {"LI", LI, html_li},      {"CODE", CODE, html_code},      {"CITE", CITE, html_cite},      {"TH", TH, html_th},      {"IMG", IMG, html_img},      {"ABBR", ABBR, html_abbr},      {"DD", DD, html_dd},      {"NOSCRIPT", NOSCRIPT, html_noscript},      {"HTML", HTML, html_html},      {"ADDRESS", ADDRESS, html_address},      {"OPTION", OPTION, html_option},      {"ISINDEX", ISINDEX, html_isindex},      {"SUP", SUP, html_sup},      {"SAMP", SAMP, html_samp},      {"H4", H4, html_header},      {"PRE", PRE, html_pre},      {"BUTTON", BUTTON, html_button},      {"H5", H5, html_header},      {"H6", H6, html_header},      {"INS", INS, html_ins},      {"HR", HR, html_hr},      {"NOBR", NOBR, html_nobr},      {"H2", H2, html_header},      {"IFRAME", IFRAME, html_iframe},      {"SPAN", SPAN, html_span},      {"H1", H1, html_header},      {"NOFRAMES", NOFRAMES, html_noframes},      {"H3", H3, html_header},      {"A", A, html_a},      {"COLGROUP", COLGROUP, html_colgroup},      {"AREA", AREA, html_area},      {"DFN", DFN, html_dfn},      {"HEAD", HEAD, html_head},      {"P", PARA, html_para},      {"CAPTION", CAPTION, html_caption},      {"I", ITALIC, html_italic}	};	static short lookup[] = {		-1,    0,    1,   -1,   -1,    2,   -1,   -1,        -1,   -1,   -1,   -1,   -1,    3,    4,    5,        -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,        -1,   -1,   -1,   -1,    6,    7,    8,   -1,        -1,    9,   -1,   -1,   -1,   10,   -1,   11,        12,   13,   -1, -167,   16,   -1, -158,   -1,        19,   -1,   20,   -1, -153,   -1,   23,   -1,        24,   25,   -1,   26,  -71,   -2,   27,   28,        29,  -75,   -2, -161,  -62,   -2,   32,   33,        34,   35,  -78,   -2,   36,   37,   38,   -1,        -1,   39,   -1,   40,   -1,   -1,   -1,   41,        42,   -1,   -1,   -1,   43,   44,   -1,   -1,        45,   -1,   46,   47,   -1,   -1,   48,   49,        -1,   -1,   -1,   -1,   50,   -1,   51,   52,        -1,   -1,   -1,   53,   54,   -1,   -1,   -1,        55,   -1,   56,   -1,   -1,   -1, -223,   59,        60,   61,  -35,   -2,   62,   63,   64,   -1,        -1,   65,   66,   -1,   -1,   -1,   67,   68,        69,   -1,   -1,   70,   -1,   -1,   71,   72,        73,   -1,   -1,   -1,   -1,   74,   75,   -1,        -1,   -1,   76,   -1,   77,   -1,   -1,   78,        79,   80,   -1,   -1,   81,   82,   -1,   -1,        -1,   83,   -1,   -1,   -1,   84,   -1,   85,        86,   -1,   -1,   -1,   -1,   -1,   -1,   -1,        -1,   87,   88,   -1,   -1,   -1,   -1,   -1,        -1,   89,   -1,   -1,   -1,   -1,   -1,   90,        -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,        -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,        -1,   -1,   -1,   -1,   -1,   -1,   -1,   91	};	if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) {		register int key = tag_hash(str, len);		if (key <= MAX_HASH_VALUE && key >= 0) {			register int index = lookup[key];			if (index >= 0) {				register const char *s = wordlist[index].tag;				if ((*str == *s) && (my_strcmp(str + 1, s + 1) == 0))					return &wordlist[index];			} else if (index < -TOTAL_KEYWORDS) {				register int offset = -1 - TOTAL_KEYWORDS - index;				register html_token *wordptr = &wordlist[TOTAL_KEYWORDS + lookup[offset]];				register html_token *wordendptr = wordptr + -lookup[offset + 1];				while (wordptr < wordendptr) {					register const char *s = wordptr->tag;					if ((*str == *s) && (my_strcmp(str + 1, s + 1) == 0))						return wordptr;					wordptr++;				}			}		}	}	return NULL;}/* * get_tag() * * returns the tag id for the tag string 'tag' */html_token *get_tag(char *tag){	html_token *token;	char buf[strlen(tag)+1];	char *string = buf;	if (!tag)		return NULL;	while(*tag) {		*string = toupper(*tag);		++tag;		++string;	}	*string = 0;	token = lookup_tag(buf, strlen(buf));	if (!token)		return NULL;	return token;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -