📄 html-parse.c

📁 wget (command line browser) source code
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
	  else	    state = AC_S_DEFAULT;	  break;	case AC_S_QUOTE1:	  /* We must use 0x22 because broken assert macros choke on	     '"' and '\"'.  */	  assert (ch == '\'' || ch == 0x22);	  quote_char = ch;	/* cheating -- I really don't feel like				   introducing more different states for				   different quote characters. */	  ch = *p++;	  state = AC_S_IN_QUOTE;	  break;	case AC_S_IN_QUOTE:	  if (ch == quote_char)	    state = AC_S_QUOTE2;	  else	    ch = *p++;	  break;	case AC_S_QUOTE2:	  assert (ch == quote_char);	  ch = *p++;	  state = AC_S_DEFAULT;	  break;	case AC_S_DASH1:	  assert (ch == '-');	  ch = *p++;	  state = AC_S_DASH2;	  break;	case AC_S_DASH2:	  switch (ch)	    {	    case '-':	      ch = *p++;	      state = AC_S_COMMENT;	      break;	    default:	      state = AC_S_BACKOUT;	    }	  break;	case AC_S_COMMENT:	  switch (ch)	    {	    case '-':	      state = AC_S_DASH3;	      break;	    default:	      ch = *p++;	      break;	    }	  break;	case AC_S_DASH3:	  assert (ch == '-');	  ch = *p++;	  state = AC_S_DASH4;	  break;	case AC_S_DASH4:	  switch (ch)	    {	    case '-':	      ch = *p++;	      state = AC_S_DEFAULT;	      break;	    default:	      state = AC_S_COMMENT;	      break;	    }	  break;	}    }  if (state == AC_S_BACKOUT)    {#ifdef STANDALONE      ++comment_backout_count;#endif      return beg + 1;    }  return p;}/* Find the first occurrence of the substring "-->" in [BEG, END) and   return the pointer to the character after the substring.  If the   substring is not found, return NULL.  */static const char *find_comment_end (const char *beg, const char *end){  /* Open-coded Boyer-Moore search for "-->".  Examine the third char;     if it's not '>' or '-', advance by three characters.  Otherwise,     look at the preceding characters and try to find a match.  */  const char *p = beg - 1;  while ((p += 3) < end)    switch (p[0])      {      case '>':	if (p[-1] == '-' && p[-2] == '-')	  return p + 1;	break;      case '-':      at_dash:	if (p[-1] == '-')	  {	  at_dash_dash:	    if (++p == end) return NULL;	    switch (p[0])	      {	      case '>': return p + 1;	      case '-': goto at_dash_dash;	      }	  }	else	  {	    if ((p += 2) >= end) return NULL;	    switch (p[0])	      {	      case '>':		if (p[-1] == '-')		  return p + 1;		break;	      case '-':		goto at_dash;	      }	  }      }  return NULL;}/* Return non-zero of the string inside [b, e) are present in hash   table HT.  */static intname_allowed (const struct hash_table *ht, const char *b, const char *e){  char *copy;  if (!ht)    return 1;  BOUNDED_TO_ALLOCA (b, e, copy);  return hash_table_get (ht, copy) != NULL;}/* Advance P (a char pointer), with the explicit intent of being able   to read the next character.  If this is not possible, go to finish.  */#define ADVANCE(p) do {				\  ++p;						\  if (p >= end)					\    goto finish;				\} while (0)/* Skip whitespace, if any. */#define SKIP_WS(p) do {				\  while (ISSPACE (*p)) {			\    ADVANCE (p);				\  }						\} while (0)/* Skip non-whitespace, if any. */#define SKIP_NON_WS(p) do {			\  while (!ISSPACE (*p)) {			\    ADVANCE (p);				\  }						\} while (0)#ifdef STANDALONEstatic int tag_backout_count;#endif/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.   MAPFUN will be called with two arguments: pointer to an initialized   struct taginfo, and MAPARG.   ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to   be processed by this function.  If it is NULL, all the tags are   allowed.  The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.   (Obviously, the caller can filter out unwanted tags and attributes   just as well, but this is just an optimization designed to avoid   unnecessary copying for tags/attributes which the caller doesn't   want to know about.  These lists are searched linearly; therefore,   if you're interested in a large number of tags or attributes, you'd   better set these to NULL and filter them out yourself with a   hashing process most appropriate for your application.)  */voidmap_html_tags (const char *text, int size,	       void (*mapfun) (struct taginfo *, void *), void *maparg,	       int flags,	       const struct hash_table *allowed_tags,	       const struct hash_table *allowed_attributes){  /* storage for strings passed to MAPFUN callback; if 256 bytes is     too little, POOL_APPEND allocates more with malloc. */  char pool_initial_storage[256];  struct pool pool;  const char *p = text;  const char *end = text + size;  struct attr_pair attr_pair_initial_storage[8];  int attr_pair_size = countof (attr_pair_initial_storage);  int attr_pair_resized = 0;  struct attr_pair *pairs = attr_pair_initial_storage;  if (!size)    return;  POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));  {    int nattrs, end_tag;    const char *tag_name_begin, *tag_name_end;    const char *tag_start_position;    int uninteresting_tag;  look_for_tag:    POOL_REWIND (&pool);    nattrs = 0;    end_tag = 0;    /* Find beginning of tag.  We use memchr() instead of the usual       looping with ADVANCE() for speed. */    p = memchr (p, '<', end - p);    if (!p)      goto finish;    tag_start_position = p;    ADVANCE (p);    /* Establish the type of the tag (start-tag, end-tag or       declaration).  */    if (*p == '!')      {	if (!(flags & MHT_STRICT_COMMENTS)	    && p < end + 3 && p[1] == '-' && p[2] == '-')	  {	    /* If strict comments are not enforced and if we know	       we're looking at a comment, simply look for the	       terminating "-->".  Non-strict is the default because	       it works in other browsers and most HTML writers can't	       be bothered with getting the comments right.  */	    const char *comment_end = find_comment_end (p + 3, end);	    if (comment_end)	      p = comment_end;	  }	else	  {	    /* Either in strict comment mode or looking at a non-empty	       declaration.  Real declarations are much less likely to	       be misused the way comments are, so advance over them	       properly regardless of strictness.  */	    p = advance_declaration (p, end);	  }	if (p == end)	  goto finish;	goto look_for_tag;      }    else if (*p == '/')      {	end_tag = 1;	ADVANCE (p);      }    tag_name_begin = p;    while (NAME_CHAR_P (*p))      ADVANCE (p);    if (p == tag_name_begin)      goto look_for_tag;    tag_name_end = p;    SKIP_WS (p);    if (end_tag && *p != '>')      goto backout_tag;    if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))      /* We can't just say "goto look_for_tag" here because we need         the loop below to properly advance over the tag's attributes.  */      uninteresting_tag = 1;    else      {	uninteresting_tag = 0;	convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);      }    /* Find the attributes. */    while (1)      {	const char *attr_name_begin, *attr_name_end;	const char *attr_value_begin, *attr_value_end;	const char *attr_raw_value_begin, *attr_raw_value_end;	int operation = AP_DOWNCASE; /* stupid compiler. */	SKIP_WS (p);	if (*p == '/')	  {	    /* A slash at this point means the tag is about to be	       closed.  This is legal in XML and has been popularized	       in HTML via XHTML.  */	    /* <foo a=b c=d /> */	    /*              ^  */	    ADVANCE (p);	    SKIP_WS (p);	    if (*p != '>')	      goto backout_tag;	  }	/* Check for end of tag definition. */	if (*p == '>')	  break;	/* Establish bounds of attribute name. */	attr_name_begin = p;	/* <foo bar ...> */				/*      ^        */	while (NAME_CHAR_P (*p))	  ADVANCE (p);	attr_name_end = p;	/* <foo bar ...> */				/*         ^     */	if (attr_name_begin == attr_name_end)	  goto backout_tag;	/* Establish bounds of attribute value. */	SKIP_WS (p);	if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')	  {	    /* Minimized attribute syntax allows `=' to be omitted.               For example, <UL COMPACT> is a valid shorthand for <UL               COMPACT="compact">.  Even if such attributes are not               useful to Wget, we need to support them, so that the               tags containing them can be parsed correctly. */	    attr_raw_value_begin = attr_value_begin = attr_name_begin;	    attr_raw_value_end = attr_value_end = attr_name_end;	  }	else if (*p == '=')	  {	    ADVANCE (p);	    SKIP_WS (p);	    if (*p == '\"' || *p == '\'')	      {		int newline_seen = 0;		char quote_char = *p;		attr_raw_value_begin = p;		ADVANCE (p);		attr_value_begin = p; /* <foo bar="baz"> */				      /*           ^     */		while (*p != quote_char)		  {		    if (!newline_seen && *p == '\n')		      {			/* If a newline is seen within the quotes, it			   is most likely that someone forgot to close			   the quote.  In that case, we back out to			   the value beginning, and terminate the tag			   at either `>' or the delimiter, whichever			   comes first.  Such a tag terminated at `>'			   is discarded.  */			p = attr_value_begin;			newline_seen = 1;			continue;		      }		    else if (newline_seen && *p == '>')		      break;		    ADVANCE (p);		  }		attr_value_end = p; /* <foo bar="baz"> */				    /*              ^  */		if (*p == quote_char)		  ADVANCE (p);		else		  goto look_for_tag;		attr_raw_value_end = p;	/* <foo bar="baz"> */					/*               ^ */		operation = AP_PROCESS_ENTITIES;		if (flags & MHT_TRIM_VALUES)		  operation |= AP_TRIM_BLANKS;	      }	    else	      {		attr_value_begin = p; /* <foo bar=baz> */				      /*          ^    */		/* According to SGML, a name token should consist only		   of alphanumerics, . and -.  However, this is often		   violated by, for instance, `%' in `width=75%'.		   We'll be liberal and allow just about anything as		   an attribute value.  */		while (!ISSPACE (*p) && *p != '>')		  ADVANCE (p);		attr_value_end = p; /* <foo bar=baz qux=quix> */				    /*             ^          */		if (attr_value_begin == attr_value_end)		  /* <foo bar=> */		  /*          ^ */		  goto backout_tag;		attr_raw_value_begin = attr_value_begin;		attr_raw_value_end = attr_value_end;		operation = AP_PROCESS_ENTITIES;	      }	  }	else	  {	    /* We skipped the whitespace and found something that is	       neither `=' nor the beginning of the next attribute's	       name.  Back out.  */	    goto backout_tag;	/* <foo bar [... */				/*          ^    */	  }	/* If we're not interested in the tag, don't bother with any           of the attributes.  */	if (uninteresting_tag)	  continue;	/* If we aren't interested in the attribute, skip it.  We           cannot do this test any sooner, because our text pointer           needs to correctly advance over the attribute.  */	if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))	  continue;	GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,		    struct attr_pair);	pairs[nattrs].name_pool_index = pool.tail;	convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);	pairs[nattrs].value_pool_index = pool.tail;	convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);	pairs[nattrs].value_raw_beginning = attr_raw_value_begin;	pairs[nattrs].value_raw_size = (attr_raw_value_end					- attr_raw_value_begin);	++nattrs;      }    if (uninteresting_tag)      {	ADVANCE (p);	goto look_for_tag;      }    /* By now, we have a valid tag with a name and zero or more       attributes.  Fill in the data and call the mapper function.  */    {      int i;      struct taginfo taginfo;      taginfo.name      = pool.contents;      taginfo.end_tag_p = end_tag;      taginfo.nattrs    = nattrs;      /* We fill in the char pointers only now, when pool can no	 longer get realloc'ed.  If we did that above, we could get	 hosed by reallocation.  Obviously, after this point, the pool	 may no longer be grown.  */      for (i = 0; i < nattrs; i++)	{	  pairs[i].name = pool.contents + pairs[i].name_pool_index;	  pairs[i].value = pool.contents + pairs[i].value_pool_index;	}      taginfo.attrs = pairs;      taginfo.start_position = tag_start_position;      taginfo.end_position   = p + 1;      /* Ta-dam! */      (*mapfun) (&taginfo, maparg);      ADVANCE (p);    }    goto look_for_tag;  backout_tag:#ifdef STANDALONE    ++tag_backout_count;#endif    /* The tag wasn't really a tag.  Treat its contents as ordinary       data characters. */    p = tag_start_position + 1;    goto look_for_tag;  } finish:  POOL_FREE (&pool);  if (attr_pair_resized)    xfree (pairs);}#undef ADVANCE#undef SKIP_WS#undef SKIP_NON_WS#ifdef STANDALONEstatic voidtest_mapper (struct taginfo *taginfo, void *arg){  int i;  printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);  for (i = 0; i < taginfo->nattrs; i++)    printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);  putchar ('\n');  ++*(int *)arg;}int main (){  int size = 256;  char *x = (char *)xmalloc (size);  int length = 0;  int read_count;  int tag_counter = 0;  while ((read_count = fread (x + length, 1, size - length, stdin)))    {      length += read_count;      size <<= 1;      x = (char *)xrealloc (x, size);    }  map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);  printf ("TAGS: %d\n", tag_counter);  printf ("Tag backouts:     %d\n", tag_backout_count);  printf ("Comment backouts: %d\n", comment_backout_count);  return 0;}#endif /* STANDALONE */
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -