📄 htmlparser.c

📁 一个小的RTOS具有UIP网络功能
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * Copyright (c) 2002, Adam Dunkels.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgement:
 *        This product includes software developed by Adam Dunkels.
 * 4. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * This file is part of the Contiki desktop environment
 *
 * $Id: htmlparser.c,v 1.4 2003/09/04 19:33:05 adamdunkels Exp $
 *
 */

/* htmlparser.c:
 *
 * Implements a very simplistic HTML parser. It recognizes HTML links
 * (<a href>-tags), HTML img alt tags, a few text flow break tags
 * (<br>, <p>, <h>), the <li> tag (but does not even try to
 * distinguish between <ol> or <ul>) as well as HTML comment tags
 * (<!-- -->).
 *
 * To save memory, the HTML parser is state machine driven, which
 * means that it will shave off one character from the HTML page,
 * process that character, and return to the next. Another way of
 * doing it would be to buffer a number of characters and process them
 * together.
 *
 * The main function in this file is the htmlparser_parse() function
 * which takes a htmlparser_state structure and a part of an HTML file
 * as an argument. The htmlparser_parse() function will call the
 * helper functions parse_char() and parse_tag(). Those functions will
 * in turn call the two callback functions htmlparser_char() and
 * htmlparser_tag(). Those functions must be implemented by the using
 * module (e.g., a web browser program).
 *
 * htmlparser_char() will be called for every non-tag character.
 *
 * htmlparser_tag() will be called whenever a full tag has been found.
 *
 */

#include "htmlparser.h"
#include "html-strings.h"
#include "www-conf.h"
#include "cc.h"

#include <string.h>   /* strcpy(), strncmp() */

/* Debug output is compiled out; change "#if 1" to "#if 0" to route
   PRINTF(()) calls to printf(). */
#if 1
#define PRINTF(x)
#else
#include <stdio.h>
#define PRINTF(x) printf x
#endif

/*-----------------------------------------------------------------------------------*/
/* Character codes, spelled out numerically so that the parser does not
   depend on the character set of the compiler. */
#define ISO_A     0x41
#define ISO_B     0x42
#define ISO_E     0x45
#define ISO_F     0x46
#define ISO_G     0x47
#define ISO_H     0x48
#define ISO_I     0x49
#define ISO_L     0x4c
#define ISO_M     0x4d
#define ISO_P     0x50
#define ISO_R     0x52
#define ISO_T     0x54

/* Lower-case letters: the upper-case code with bit 0x20 set. */
#define ISO_a     (ISO_A | 0x20)
#define ISO_b     (ISO_B | 0x20)
#define ISO_e     (ISO_E | 0x20)
#define ISO_f     (ISO_F | 0x20)
#define ISO_g     (ISO_G | 0x20)
#define ISO_h     (ISO_H | 0x20)
#define ISO_i     (ISO_I | 0x20)
#define ISO_l     (ISO_L | 0x20)
#define ISO_m     (ISO_M | 0x20)
#define ISO_p     (ISO_P | 0x20)
#define ISO_r     (ISO_R | 0x20)
#define ISO_t     (ISO_T | 0x20)

#define ISO_ht    0x09
#define ISO_nl    0x0a
#define ISO_cr    0x0d
#define ISO_space 0x20
#define ISO_bang  0x21
#define ISO_citation 0x22
#define ISO_ampersand 0x26
#define ISO_citation2 0x27
#define ISO_asterisk 0x2a
#define ISO_dash  0x2d
#define ISO_slash 0x2f
#define ISO_semicolon  0x3b
#define ISO_lt    0x3c
#define ISO_eq    0x3d
#define ISO_gt    0x3e
/* NOTE: 0x5b is '[' and 0x5d is ']', so the two names below read as
   swapped. They are kept unchanged because they are used consistently
   (ISO_rbrack opens, ISO_lbrack closes). */
#define ISO_rbrack 0x5b
#define ISO_lbrack 0x5d

/* Minor states: position of the state machine within the text/tag
   syntax. */
#define MINORSTATE_NONE           0
#define MINORSTATE_TEXT           1 /* Parse normal text */
#define MINORSTATE_EXTCHAR        2 /* Check for semi-colon */
#define MINORSTATE_TAG            3 /* Check for name of tag. */
#define MINORSTATE_TAGEND         4 /* Scan for end of tag. */
#define MINORSTATE_TAGATTR        5 /* Parse tag attr. */
#define MINORSTATE_TAGATTRSPACE   6 /* Parse optional space after tag
                                       attr. */
#define MINORSTATE_TAGATTRPARAM   7 /* Parse tag attr parameter. */
#define MINORSTATE_TAGATTRPARAMNQ 8 /* Parse tag attr parameter without
                                       quotation marks. */
#define MINORSTATE_HTMLCOMMENT    9 /* Scan for HTML comment end */

/* Major states: which part of the document is being processed. */
#define MAJORSTATE_NONE       0
#define MAJORSTATE_BODY       1
#define MAJORSTATE_LINK       2
#define MAJORSTATE_FORM       3
#define MAJORSTATE_DISCARD    4

/* Complete parser state. A single static instance (s) is used by all
   functions in this file. */
struct htmlparser_state {

  unsigned char minorstate;
  char tag[20];                 /* Name of the tag being parsed. */
  unsigned char tagptr;
  char tagattr[20];             /* Name of the current tag attribute. */
  unsigned char tagattrptr;
  char tagattrparam[WWW_CONF_MAX_URLLEN]; /* Value of the attribute. */
  unsigned char tagattrparamptr;
  unsigned char lastchar, quotechar;
  unsigned char majorstate, lastmajorstate;
  char linkurl[WWW_CONF_MAX_URLLEN];

  char linktext[40];            /* Text between <a href> and </a>;
                                   NUL-terminated by parse_tag(). */
  unsigned char linktextptr;    /* Next free index in linktext. */

#if WWW_CONF_FORMS
  char formaction[WWW_CONF_MAX_FORMACTIONLEN];
  char formname[WWW_CONF_MAX_FORMNAMELEN];
  unsigned char inputtype;
  char inputname[WWW_CONF_MAX_INPUTNAMELEN];
  char inputvalue[WWW_CONF_MAX_INPUTVALUELEN];
  unsigned char inputvaluesize;
#endif /* WWW_CONF_FORMS */
};

static struct htmlparser_state s;

/*-----------------------------------------------------------------------------------*/
/* One-byte entry that occupies the final slot (TAG_LAST) of the tag
   table. find_tag() below never reads its contents. */
static char last[1] = {0xff};

/* Table of recognized tag names. find_tag() relies on the entries
   being in ascending order; each TAG_* constant is the index of the
   entry that follows it. */
static const char *tags[] = {
#define TAG_FIRST       0
#define TAG_SLASHA      0
  html_slasha,
#define TAG_SLASHCENTER 1
  html_slashcenter,
#define TAG_SLASHFORM   2
  html_slashform,
#define TAG_SLASHH      3
  html_slashh,
#define TAG_SLASHSCRIPT 4
  html_slashscript,
#define TAG_SLASHSELECT 5
  html_slashselect,
#define TAG_SLASHSTYLE  6
  html_slashstyle,
#define TAG_A           7
  html_a,
#define TAG_BODY        8
  html_body,
#define TAG_BR          9
  html_br,
#define TAG_CENTER     10
  html_center,
#define TAG_FORM       11
  html_form,
#define TAG_FRAME      12
  html_frame,
#define TAG_H1         13
  html_h1,
#define TAG_H2         14
  html_h2,
#define TAG_H3         15
  html_h3,
#define TAG_H4         16
  html_h4,
#define TAG_IMG        17
  html_img,
#define TAG_INPUT      18
  html_input,
#define TAG_LI         19
  html_li,
#define TAG_P          20
  html_p,
#define TAG_SCRIPT     21
  html_script,
#define TAG_SELECT     22
  html_select,
#define TAG_STYLE      23
  html_style,
#define TAG_TR         24
  html_tr,
#define TAG_LAST       25
  last,
};
/*-----------------------------------------------------------------------------------*/
/* Returns non-zero if c is a space, newline, carriage return or
   horizontal tab; zero otherwise. */
static unsigned char CC_FASTCALL
iswhitespace(char c)
{
  return (c == ISO_space ||
          c == ISO_nl ||
          c == ISO_cr ||
          c == ISO_ht);
}
/*-----------------------------------------------------------------------------------*/
/* Looks up the NUL-terminated string tag in the tags[] table.
 *
 * Returns the TAG_* index of the entry that matches tag exactly, or
 * TAG_LAST if there is no such entry.
 *
 * The search narrows a window [first, last) of candidate entries one
 * character position at a time: at position i it keeps only the
 * entries whose i:th character equals tag[i].
 *
 * Note that the local variable "last" shadows the file-scope array of
 * the same name inside this function.
 */
static unsigned char CC_FASTCALL
find_tag(char *tag)
{
  static unsigned char first, last, i, tabi;
  static char tagc;

  tabi = first = TAG_FIRST;
  last = TAG_LAST;
  i = 0;

  do {
    tagc = tag[i];

    /* End of the input tag and end of the first candidate at the same
       position: exact match. */
    if(tagc == 0 &&
       tags[first][i] == 0) {
      return first;
    }

    /* First, find first matching tag from table. The bound is tested
       before the table is dereferenced so that neither the entry at
       index "last" (which may be shorter than i characters) nor the
       one-byte end marker is ever read out of bounds. */
    while(tabi < last &&
          tagc > (tags[tabi])[i]) {
      ++tabi;
    }
    first = tabi;

    /* Second, find last matching tag from table (same bound-first
       ordering as above). */
    while(tabi < last &&
          tagc == (tags[tabi])[i]) {
      ++tabi;
    }
    last = tabi;

    /* If first and last matching tags are equal, no candidate is left
       and the search ends without a match. Else we continue with the
       next character. */
    ++i;
    tabi = first;
  } while(last != first);
  return TAG_LAST;
}
/*-----------------------------------------------------------------------------------*/
/* Handles one character of document text.
 *
 * Characters >= 0x80 are dropped. While inside a link
 * (MAJORSTATE_LINK) the character is appended to s.linktext, with
 * whitespace converted to a plain space; characters that do not fit
 * are silently discarded. In MAJORSTATE_DISCARD the character is
 * dropped. In every other state it is passed to the htmlparser_char()
 * callback.
 */
static void CC_FASTCALL
parse_char(unsigned char c)
{
  if(c < 0x80) {
    if(s.majorstate == MAJORSTATE_LINK) {
      /* Keep one byte free: parse_tag() stores the terminating NUL at
         s.linktext[s.linktextptr] when the closing </a> is seen, so
         the index must never reach sizeof(s.linktext). */
      if(s.linktextptr < sizeof(s.linktext) - 1) {
        if(iswhitespace(c)) {
          c = ISO_space;
        }
        s.linktext[s.linktextptr] = c;
        ++s.linktextptr;
      }
    } else if(s.majorstate != MAJORSTATE_DISCARD) {
      htmlparser_char(c);
    }
  }
}
/*-----------------------------------------------------------------------------------*/
/* Changes the major state to newstate and remembers the previous major
   state in s.lastmajorstate. Does nothing if the parser is already in
   newstate, so s.lastmajorstate is then left untouched. */
static void CC_FASTCALL
switch_majorstate(unsigned char newstate)
{
  if(s.majorstate != newstate) {
    PRINTF(("Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate));
    s.lastmajorstate = s.majorstate;
    s.majorstate = newstate;
  }
}
/*-----------------------------------------------------------------------------------*/
/* Acts on the tag currently held in s.tag, s.tagattr and
   s.tagattrparam: looks the tag name up with find_tag() and emits
   characters, switches major state or reports links accordingly. */
static void
parse_tag(void)
{
  static char *tagattrparam;
  static unsigned char size, i;

  PRINTF(("Parsing tag '%s' '%s' '%s'\n",
          s.tag, s.tagattr, s.tagattrparam));

  switch(find_tag(s.tag)) {
  case TAG_P:
  case TAG_H1:
  case TAG_H2:
  case TAG_H3:
  case TAG_H4:
    /* Paragraphs and headings get two newlines in total. */
    parse_char(ISO_nl);
    /* FALLTHROUGH */
  case TAG_BR:
  case TAG_TR:
  case TAG_SLASHH:
    parse_char(ISO_nl);
    break;
  case TAG_LI:
    /* List items are rendered as "* " on a new line. */
    parse_char(ISO_nl);
    parse_char(ISO_asterisk);
    parse_char(ISO_space);
    break;
  case TAG_SCRIPT:
  case TAG_STYLE:
  case TAG_SELECT:
    /* Text inside these elements is thrown away by parse_char(). */
    switch_majorstate(MAJORSTATE_DISCARD);
    break;
  case TAG_SLASHSCRIPT:
  case TAG_SLASHSTYLE:
  case TAG_SLASHSELECT:
    switch_majorstate(s.lastmajorstate);
    break;
  case TAG_BODY:
    s.majorstate = s.lastmajorstate = MAJORSTATE_BODY;
    break;
  case TAG_FRAME:
    /* A frame with a non-empty src attribute is reported as a link,
       surrounded by brackets on a line of its own. */
    if(strncmp(s.tagattr, html_src, sizeof(html_src)) == 0 &&
       s.tagattrparam[0] != 0) {
      switch_majorstate(MAJORSTATE_BODY);
      parse_char(ISO_nl);
      parse_char(ISO_rbrack);
      parse_char(ISO_space);
      htmlparser_link((char *)html_frame, s.tagattrparam);
      PRINTF(("Frame [%s]\n", s.tagattrparam));
      parse_char(ISO_space);
      parse_char(ISO_lbrack);
      parse_char(ISO_nl);
    }
    break;
  case TAG_IMG:
    /* An image with a non-empty alt attribute is rendered as the alt
       text enclosed in '<' and '>'. */
    if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 &&
       s.tagattrparam[0] != 0) {
      parse_char(ISO_lt);
      tagattrparam = &s.tagattrparam[0];
      while(*tagattrparam) {
        parse_char(*tagattrparam);
        ++tagattrparam;
      }
      parse_char(ISO_gt);
    }
    break;
  case TAG_A:
    PRINTF(("A %s %s\n", s.tagattr, s.tagattrparam));
    /* Start collecting link text; s.linkurl and s.tagattrparam have
       the same declared size. */
    if(strncmp(s.tagattr, html_href, sizeof(html_href)) == 0 &&
       s.tagattrparam[0] != 0) {
      strcpy(s.linkurl, s.tagattrparam);
      switch_majorstate(MAJORSTATE_LINK);
      s.linktextptr = 0;
    }
    break;
  case TAG_SLASHA:
    if(s.majorstate == MAJORSTATE_LINK) {
      switch_majorstate(s.lastmajorstate);
      /* parse_char() keeps s.linktextptr below sizeof(s.linktext), so
         the terminator always fits. */
      s.linktext[s.linktextptr] = 0;
      htmlparser_link(s.linktext, s.linkurl);
      PRINTF(("Link '%s' [%s]\n", s.linktext, s.linkurl));
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -