📄 htmlparser.c
字号:
/* * Copyright (c) 2002, Adam Dunkels. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * 3. All advertising materials mentioning features or use of this * software must display the following acknowledgement: * This product includes software developed by Adam Dunkels. * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This file is part of the Contiki desktop environment * * $Id: htmlparser.c,v 1.4 2003/09/04 19:33:05 adamdunkels Exp $ * *//* htmlparser.c: * * Implements a very simplistic HTML parser. 
It recognizes HTML links * (<a href>-tags), HTML img alt tags, a few text flow break tags * (<br>, <p>, <h>), the <li> tag (but does not even try to * distinguish between <ol> or <ul>) as well as HTML comment tags * (<!-- -->). * * To save memory, the HTML parser is state machine driven, which * means that it will shave off one character from the HTML page, * process that character, and return to the next. Another way of * doing it would be to buffer a number of characters and process them * together. * * The main function in this file is the htmlparser_parse() function * which takes a htmlparser_state structure and a part of an HTML file * as an argument. The htmlparser_parse() function will call the * helper functions parse_char() and parse_tag(). Those functions will * in turn call the two callback functions htmlparser_char() and * htmlparser_tag(). Those functions must be implemented by the using * module (e.g., a web browser program). * * htmlparser_char() will be called for every non-tag character. * * htmlparser_tag() will be called whenever a full tag has been found. 
* */#include "htmlparser.h"#include "html-strings.h"#include "www-conf.h"#include "cc.h"#if 1#define PRINTF(x)#else#include <stdio.h>#define PRINTF(x) printf x#endif/*-----------------------------------------------------------------------------------*/#define ISO_A 0x41#define ISO_B 0x42#define ISO_E 0x45#define ISO_F 0x46#define ISO_G 0x47#define ISO_H 0x48#define ISO_I 0x49#define ISO_L 0x4c#define ISO_M 0x4d#define ISO_P 0x50#define ISO_R 0x52#define ISO_T 0x54#define ISO_a (ISO_A | 0x20)#define ISO_b (ISO_B | 0x20)#define ISO_e (ISO_E | 0x20)#define ISO_f (ISO_F | 0x20)#define ISO_g (ISO_G | 0x20)#define ISO_h (ISO_H | 0x20)#define ISO_i (ISO_I | 0x20)#define ISO_l (ISO_L | 0x20)#define ISO_m (ISO_M | 0x20)#define ISO_p (ISO_P | 0x20)#define ISO_r (ISO_R | 0x20)#define ISO_t (ISO_T | 0x20)#define ISO_ht 0x09#define ISO_nl 0x0a#define ISO_cr 0x0d#define ISO_space 0x20#define ISO_bang 0x21#define ISO_citation 0x22#define ISO_ampersand 0x26#define ISO_citation2 0x27#define ISO_asterisk 0x2a#define ISO_dash 0x2d#define ISO_slash 0x2f#define ISO_semicolon 0x3b#define ISO_lt 0x3c#define ISO_eq 0x3d#define ISO_gt 0x3e#define ISO_rbrack 0x5b#define ISO_lbrack 0x5d#define MINORSTATE_NONE 0#define MINORSTATE_TEXT 1 /* Parse normal text */#define MINORSTATE_EXTCHAR 2 /* Check for semi-colon */#define MINORSTATE_TAG 3 /* Check for name of tag. */#define MINORSTATE_TAGEND 4 /* Scan for end of tag. */#define MINORSTATE_TAGATTR 5 /* Parse tag attr. */#define MINORSTATE_TAGATTRSPACE 6 /* Parse optional space after tag attr. */#define MINORSTATE_TAGATTRPARAM 7 /* Parse tag attr parameter. */#define MINORSTATE_TAGATTRPARAMNQ 8 /* Parse tag attr parameter without quotation marks. 
*/#define MINORSTATE_HTMLCOMMENT 9 /* Scan for HTML comment end */#define MAJORSTATE_NONE 0#define MAJORSTATE_BODY 1#define MAJORSTATE_LINK 2#define MAJORSTATE_FORM 3#define MAJORSTATE_DISCARD 4struct htmlparser_state { unsigned char minorstate; char tag[20]; unsigned char tagptr; char tagattr[20]; unsigned char tagattrptr; char tagattrparam[WWW_CONF_MAX_URLLEN]; unsigned char tagattrparamptr; unsigned char lastchar, quotechar; unsigned char majorstate, lastmajorstate; char linkurl[WWW_CONF_MAX_URLLEN]; char linktext[40]; unsigned char linktextptr;#if WWW_CONF_FORMS char formaction[WWW_CONF_MAX_FORMACTIONLEN]; char formname[WWW_CONF_MAX_FORMNAMELEN]; unsigned char inputtype; char inputname[WWW_CONF_MAX_INPUTNAMELEN]; char inputvalue[WWW_CONF_MAX_INPUTVALUELEN]; unsigned char inputvaluesize;#endif /* WWW_CONF_FORMS */};static struct htmlparser_state s;/*-----------------------------------------------------------------------------------*/static char last[1] = {0xff};static const char *tags[] = {#define TAG_FIRST 0#define TAG_SLASHA 0 html_slasha,#define TAG_SLASHCENTER 1 html_slashcenter,#define TAG_SLASHFORM 2 html_slashform,#define TAG_SLASHH 3 html_slashh,#define TAG_SLASHSCRIPT 4 html_slashscript,#define TAG_SLASHSELECT 5 html_slashselect,#define TAG_SLASHSTYLE 6 html_slashstyle,#define TAG_A 7 html_a,#define TAG_BODY 8 html_body,#define TAG_BR 9 html_br,#define TAG_CENTER 10 html_center,#define TAG_FORM 11 html_form,#define TAG_FRAME 12 html_frame,#define TAG_H1 13 html_h1,#define TAG_H2 14 html_h2,#define TAG_H3 15 html_h3,#define TAG_H4 16 html_h4,#define TAG_IMG 17 html_img,#define TAG_INPUT 18 html_input,#define TAG_LI 19 html_li,#define TAG_P 20 html_p,#define TAG_SCRIPT 21 html_script, #define TAG_SELECT 22 html_select,#define TAG_STYLE 23 html_style,#define TAG_TR 24 html_tr,#define TAG_LAST 25 last,};/*-----------------------------------------------------------------------------------*/static unsigned char CC_FASTCALLiswhitespace(char c){ return (c == 
ISO_space || c == ISO_nl || c == ISO_cr || c == ISO_ht);}/*-----------------------------------------------------------------------------------*/static unsigned char CC_FASTCALLfind_tag(char *tag){ static unsigned char first, last, i, tabi; static char tagc; tabi = first = TAG_FIRST; last = TAG_LAST; i = 0; do { tagc = tag[i]; if(tagc == 0 && tags[first][i] == 0) { return first; } /* First, find first matching tag from table. */ while(tagc > (tags[tabi])[i] && tabi < last) { ++tabi; } first = tabi; /* Second, find last matching tag from table. */ while(tagc == (tags[tabi])[i] && tabi < last) { ++tabi; } last = tabi; /* If first and last matching tags are equal, we have a match and return. Else we continue with the next character. */ ++i; tabi = first; } while(last != first); return TAG_LAST;}/*-----------------------------------------------------------------------------------*/static void CC_FASTCALLparse_char(unsigned char c){ if(c < 0x80) { if(s.majorstate == MAJORSTATE_LINK) { if(s.linktextptr < sizeof(s.linktext)) { if(iswhitespace(c)) { c = ISO_space; } s.linktext[s.linktextptr] = c; ++s.linktextptr; } } else if(s.majorstate != MAJORSTATE_DISCARD) { htmlparser_char(c); } }}/*-----------------------------------------------------------------------------------*/static void CC_FASTCALLswitch_majorstate(unsigned char newstate){ if(s.majorstate != newstate) { PRINTF(("Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate)); s.lastmajorstate = s.majorstate; s.majorstate = newstate; }}/*-----------------------------------------------------------------------------------*/static voidparse_tag(void){ static char *tagattrparam; static unsigned char size, i; PRINTF(("Parsing tag '%s' '%s' '%s'\n", s.tag, s.tagattr, s.tagattrparam)); switch(find_tag(s.tag)) { case TAG_P: case TAG_H1: case TAG_H2: case TAG_H3: case TAG_H4: parse_char(ISO_nl); /* FALLTHROUGH */ case TAG_BR: case TAG_TR: case TAG_SLASHH: parse_char(ISO_nl); break; case TAG_LI: 
parse_char(ISO_nl); parse_char(ISO_asterisk); parse_char(ISO_space); break; case TAG_SCRIPT: case TAG_STYLE: case TAG_SELECT: switch_majorstate(MAJORSTATE_DISCARD); break; case TAG_SLASHSCRIPT: case TAG_SLASHSTYLE: case TAG_SLASHSELECT: switch_majorstate(s.lastmajorstate); break; case TAG_BODY: s.majorstate = s.lastmajorstate = MAJORSTATE_BODY; break; case TAG_FRAME: if(strncmp(s.tagattr, html_src, sizeof(html_src)) == 0 && s.tagattrparam[0] != 0) { switch_majorstate(MAJORSTATE_BODY); parse_char(ISO_nl); parse_char(ISO_rbrack); parse_char(ISO_space); htmlparser_link((char *)html_frame, s.tagattrparam); PRINTF(("Frame [%s]\n", s.tagattrparam)); parse_char(ISO_space); parse_char(ISO_lbrack); parse_char(ISO_nl); } break; case TAG_IMG: if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 && s.tagattrparam[0] != 0) { parse_char(ISO_lt); tagattrparam = &s.tagattrparam[0]; while(*tagattrparam) { parse_char(*tagattrparam); ++tagattrparam; } parse_char(ISO_gt); } break; case TAG_A: PRINTF(("A %s %s\n", s.tagattr, s.tagattrparam)); if(strncmp(s.tagattr, html_href, sizeof(html_href)) == 0 && s.tagattrparam[0] != 0) { strcpy(s.linkurl, s.tagattrparam); switch_majorstate(MAJORSTATE_LINK); s.linktextptr = 0; } break; case TAG_SLASHA: if(s.majorstate == MAJORSTATE_LINK) { switch_majorstate(s.lastmajorstate); s.linktext[s.linktextptr] = 0; htmlparser_link(s.linktext, s.linkurl); PRINTF(("Link '%s' [%s]\n", s.linktext, s.linkurl));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -