📄 html-parse.c

📁 wget讓你可以在console介面下
💻 C
📖 第 1 页 / 共 3 页
字号:
   * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
     of text, as well as embedded newlines.  */

static void
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
{
  /* Remember where the new string starts inside the pool so that the
     AP_DOWNCASE pass at the bottom can revisit exactly what was
     appended by this call.  */
  int old_tail = pool->tail;

  /* Skip blanks if required.  We must do this before entities are
     processed, so that blanks can still be inserted as, for instance,
     `&#32;'.  */
  if (flags & AP_TRIM_BLANKS)
    {
      while (beg < end && ISSPACE (*beg))
        ++beg;
      while (end > beg && ISSPACE (end[-1]))
        --end;
    }

  if (flags & AP_DECODE_ENTITIES)
    {
      /* Grow the pool, then copy the text to the pool character by
         character, processing the encountered entities as we go
         along.

         It's safe (and necessary) to grow the pool in advance because
         processing the entities can only *shorten* the string, it can
         never lengthen it.  */
      const char *from = beg;
      char *to;
      /* Note that embedded newlines are dropped only on this
         (entity-decoding) path; the plain-copy path below appends the
         text unchanged.  */
      int squash_newlines = flags & AP_TRIM_BLANKS;

      POOL_GROW (pool, end - beg);
      to = pool->contents + pool->tail;
      while (from < end)
        {
          if (*from == '&')
            {
              /* decode_entity is handed the address of FROM; when it
                 reports -1 the ampersand is copied literally.  */
              int entity = decode_entity (&from, end);
              if (entity != -1)
                *to++ = entity;
              else
                *to++ = *from++;
            }
          else if ((*from == '\n' || *from == '\r') && squash_newlines)
            ++from;
          else
            *to++ = *from++;
        }
      /* Verify that we haven't exceeded the original size.  (It
         shouldn't happen, hence the assert.)  */
      assert (to - (pool->contents + pool->tail) <= end - beg);

      /* Make POOL's tail point to the position following the string
         we've written.  */
      pool->tail = to - pool->contents;
      POOL_APPEND_CHR (pool, '\0');
    }
  else
    {
      /* Just copy the text to the pool.  */
      POOL_APPEND (pool, beg, end);
      POOL_APPEND_CHR (pool, '\0');
    }

  if (flags & AP_DOWNCASE)
    {
      /* Lower-case in place, up to the terminating NUL appended
         above.  */
      char *p = pool->contents + old_tail;
      for (; *p; p++)
        *p = TOLOWER (*p);
    }
}

/* Originally we used to adhere to rfc 1866 here, and allowed only
   letters, digits, periods, and hyphens as names (of tags or
   attributes).  However, this broke too many pages which used
   proprietary or strange attributes, e.g. <img src="a.gif"
   v:shapes="whatever">.

   So now we allow any character except:
     * whitespace
     * 8-bit and control chars
     * characters that clearly cannot be part of name:
       '=', '>', '/'.

   This only affects attribute and tag names; attribute values allow
   an even greater variety of characters.  */

#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127				\
			&& (x) != '=' && (x) != '>' && (x) != '/')

#ifdef STANDALONE
static int comment_backout_count;
#endif

/* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
   strict comments mode, this is used for skipping over comments as
   well.

   To recap: any SGML declaration may have comments associated with
   it, e.g.
       <!MY-DECL -- isn't this fun? -- foo bar>

   An HTML comment is merely an empty declaration (<!>) with a comment
   attached, like this:
       <!-- some stuff here -->

   Several comments may be embedded in one comment declaration:
       <!-- have -- -- fun -->

   Whitespace is allowed between and after the comments, but not
   before the first comment.  Additionally, this function attempts to
   handle double quotes in SGML declarations correctly.

   BEG must point at the `!' that follows the opening `<'.  On success
   the return value points just past the closing `>'.  If the text
   cannot be parsed as a declaration, or the buffer runs out first,
   the function "backs out" and returns BEG + 1.  An empty range
   (BEG == END) returns BEG.

   NOTE: the end-of-buffer test runs before the current character is
   examined, so a declaration whose closing `>' is the very last byte
   of [BEG, END) is backed out rather than accepted.  */

static const char *
advance_declaration (const char *beg, const char *end)
{
  const char *p = beg;
  char quote_char = '\0';	/* shut up, gcc! */
  char ch;

  enum {
    AC_S_DONE,
    AC_S_BACKOUT,
    AC_S_BANG,
    AC_S_DEFAULT,
    AC_S_DCLNAME,
    AC_S_DASH1,
    AC_S_DASH2,
    AC_S_COMMENT,
    AC_S_DASH3,
    AC_S_DASH4,
    AC_S_QUOTE1,
    AC_S_IN_QUOTE,
    AC_S_QUOTE2
  } state = AC_S_BANG;

  if (beg == end)
    return beg;
  ch = *p++;

  /* It looked like a good idea to write this as a state machine, but
     now I wonder...  */

  /* Invariant: CH is the character under examination and P points at
     the next unread one.  Every state reads at most one character per
     iteration, and only after the P == END test below, so no read
     goes past END.  */
  while (state != AC_S_DONE && state != AC_S_BACKOUT)
    {
      if (p == end)
        state = AC_S_BACKOUT;
      switch (state)
        {
        case AC_S_DONE:
        case AC_S_BACKOUT:
          break;
        case AC_S_BANG:
          if (ch == '!')
            {
              ch = *p++;
              state = AC_S_DEFAULT;
            }
          else
            state = AC_S_BACKOUT;
          break;
        case AC_S_DEFAULT:
          switch (ch)
            {
            case '-':
              state = AC_S_DASH1;
              break;
            case ' ':
            case '\t':
            case '\r':
            case '\n':
              ch = *p++;
              break;
            case '>':
              state = AC_S_DONE;
              break;
            case '\'':
            case '\"':
              state = AC_S_QUOTE1;
              break;
            default:
              if (NAME_CHAR_P (ch))
                state = AC_S_DCLNAME;
              else
                state = AC_S_BACKOUT;
              break;
            }
          break;
        case AC_S_DCLNAME:
          if (ch == '-')
            state = AC_S_DASH1;
          else if (NAME_CHAR_P (ch))
            ch = *p++;
          else
            state = AC_S_DEFAULT;
          break;
        case AC_S_QUOTE1:
          /* We must use 0x22 because broken assert macros choke on
             '"' and '\"'.  */
          assert (ch == '\'' || ch == 0x22);
          quote_char = ch;	/* cheating -- I really don't feel like
				   introducing more different states for
				   different quote characters. */
          ch = *p++;
          state = AC_S_IN_QUOTE;
          break;
        case AC_S_IN_QUOTE:
          if (ch == quote_char)
            state = AC_S_QUOTE2;
          else
            ch = *p++;
          break;
        case AC_S_QUOTE2:
          assert (ch == quote_char);
          ch = *p++;
          state = AC_S_DEFAULT;
          break;
        case AC_S_DASH1:
          assert (ch == '-');
          ch = *p++;
          state = AC_S_DASH2;
          break;
        case AC_S_DASH2:
          switch (ch)
            {
            case '-':
              ch = *p++;
              state = AC_S_COMMENT;
              break;
            default:
              /* A single dash that does not open a comment.  */
              state = AC_S_BACKOUT;
            }
          break;
        case AC_S_COMMENT:
          switch (ch)
            {
            case '-':
              state = AC_S_DASH3;
              break;
            default:
              ch = *p++;
              break;
            }
          break;
        case AC_S_DASH3:
          assert (ch == '-');
          ch = *p++;
          state = AC_S_DASH4;
          break;
        case AC_S_DASH4:
          switch (ch)
            {
            case '-':
              /* "--" closes the comment; back to declaration level.  */
              ch = *p++;
              state = AC_S_DEFAULT;
              break;
            default:
              /* A lone dash inside the comment; keep scanning.  */
              state = AC_S_COMMENT;
              break;
            }
          break;
        }
    }

  if (state == AC_S_BACKOUT)
    {
#ifdef STANDALONE
      ++comment_backout_count;
#endif
      return beg + 1;
    }
  return p;
}

/* Find the first occurrence of the substring "-->" in [BEG, END) and
   return the pointer to the character after the substring.  If the
   substring is not found, return NULL.  */

static const char *
find_comment_end (const char *beg, const char *end)
{
  /* Boyer-Moore style search for "-->".  P always designates the
     position at which a match could *end* (i.e. where the '>' would
     be).  Depending on the character found there we know how far the
     next possible match end is:

       '>' preceded by "--"  -> match
       '-' preceded by '-'   -> the '>' could be 1 ahead
       '-' otherwise         -> the '>' could be 2 ahead
       anything else         -> the '>' is at least 3 ahead

     Unlike a naive `p = beg - 1; p += 3' formulation, every pointer
     formed here stays inside [BEG, END], which is what the C standard
     requires of pointer arithmetic.  */
  const char *p;

  if (end - beg < 3)
    return NULL;

  p = beg + 2;
  for (;;)
    {
      int skip;

      switch (p[0])
        {
        case '>':
          if (p[-1] == '-' && p[-2] == '-')
            return p + 1;
          skip = 3;
          break;
        case '-':
          skip = (p[-1] == '-') ? 1 : 2;
          break;
        default:
          skip = 3;
          break;
        }

      /* Stop before stepping to or beyond END.  */
      if (end - p <= skip)
        return NULL;
      p += skip;
    }
}

/* Return non-zero if the string inside [b, e) is present in hash
   table HT.  A NULL HT means "no restriction": every name is
   allowed.  */

static int
name_allowed (const struct hash_table *ht, const char *b, const char *e)
{
  char *copy;
  if (!ht)
    return 1;
  /* BOUNDED_TO_ALLOCA is defined elsewhere; presumably it stores a
     NUL-terminated stack copy of [b, e) in COPY so it can be used as
     a lookup key -- confirm against its definition.  */
  BOUNDED_TO_ALLOCA (b, e, copy);
  return hash_table_get (ht, copy) != NULL;
}

/* Advance P (a char pointer), with the explicit intent of being able
   to read the next character.  If this is not possible, go to finish.  */

#define ADVANCE(p) do {				\
  ++p;						\
  if (p >= end)					\
    goto finish;				\
} while (0)

/* Skip whitespace, if any. */

#define SKIP_WS(p) do {				\
  while (ISSPACE (*p)) {			\
    ADVANCE (p);				\
  }						\
} while (0)

/* Skip non-whitespace, if any. */

#define SKIP_NON_WS(p) do {			\
  while (!ISSPACE (*p)) {			\
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -