html-parse.c

来自「一个从网络上自动下载文件的自由工具」· C语言代码 · 共 1,074 行 · 第 1/3 页
1,074 行
   * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
     of text, as well as embedded newlines.  */

static void
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
{
  /* Remember where this string starts in the pool so that the
     AP_DOWNCASE pass below can revisit exactly what was added.  */
  int old_tail = pool->tail;

  /* Skip blanks if required.  We must do this before entities are
     processed, so that blanks can still be inserted as, for instance,
     `&#32;'.  */
  if (flags & AP_TRIM_BLANKS)
    {
      while (beg < end && ISSPACE (*beg))
        ++beg;
      while (end > beg && ISSPACE (end[-1]))
        --end;
    }

  if (flags & AP_DECODE_ENTITIES)
    {
      /* Grow the pool, then copy the text to the pool character by
         character, processing the encountered entities as we go
         along.

         It's safe (and necessary) to grow the pool in advance because
         processing the entities can only *shorten* the string, it can
         never lengthen it.  */
      const char *from = beg;
      char *to;
      /* Embedded CR/LF characters are dropped only when blank
         trimming was requested.  */
      bool squash_newlines = !!(flags & AP_TRIM_BLANKS);

      POOL_GROW (pool, end - beg);
      to = pool->contents + pool->tail;

      while (from < end)
        {
          if (*from == '&')
            {
              /* decode_entity is defined elsewhere; it receives &FROM,
                 so presumably it advances FROM past a recognized
                 entity -- confirm against its definition.  A result of
                 -1 is treated here as "not an entity" and the '&' is
                 copied literally.  */
              int entity = decode_entity (&from, end);
              if (entity != -1)
                *to++ = entity;
              else
                *to++ = *from++;
            }
          else if ((*from == '\n' || *from == '\r') && squash_newlines)
            ++from;
          else
            *to++ = *from++;
        }
      /* Verify that we haven't exceeded the original size.  (It
         shouldn't happen, hence the assert.)  */
      assert (to - (pool->contents + pool->tail) <= end - beg);

      /* Make POOL's tail point to the position following the string
         we've written.  */
      pool->tail = to - pool->contents;
      POOL_APPEND_CHR (pool, '\0');
    }
  else
    {
      /* Just copy the text to the pool.  */
      POOL_APPEND (pool, beg, end);
      POOL_APPEND_CHR (pool, '\0');
    }

  if (flags & AP_DOWNCASE)
    {
      /* Lower-case, in place, the NUL-terminated string that was just
         written starting at OLD_TAIL.  */
      char *p = pool->contents + old_tail;
      for (; *p; p++)
        *p = TOLOWER (*p);
    }
}

/* Originally we used to adhere to rfc 1866 here, and allowed only
   letters, digits, periods, and hyphens as names (of tags or
   attributes).  However, this broke too many pages which used
   proprietary or strange attributes, e.g. <img src="a.gif"
   v:shapes="whatever">.

   So now we allow any character except:
     * whitespace
     * 8-bit and control chars
     * characters that clearly cannot be part of name:
       '=', '>', '/'.

   This only affects attribute and tag names; attribute values allow
   an even greater variety of characters.  */

#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
                        && (x) != '=' && (x) != '>' && (x) != '/')

#ifdef STANDALONE
/* Incremented each time advance_declaration backs out.  */
static int comment_backout_count;
#endif

/* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
   strict comments mode, this is used for skipping over comments as
   well.

   To recap: any SGML declaration may have comments associated with
   it, e.g.
       <!MY-DECL -- isn't this fun? -- foo bar>

   An HTML comment is merely an empty declaration (<!>) with a comment
   attached, like this:
       <!-- some stuff here -->

   Several comments may be embedded in one comment declaration:
       <!-- have -- -- fun -->

   Whitespace is allowed between and after the comments, but not
   before the first comment.  Additionally, this function attempts to
   handle double quotes in SGML declarations correctly.

   BEG must point at the '!' of the declaration (the first state
   requires it).  Returns:
     * BEG itself when [BEG, END) is empty;
     * the position just past the closing '>' when the declaration
       was parsed successfully;
     * BEG + 1 when parsing had to back out (malformed declaration or
       END reached).

   NOTE(review): the `p == end' check runs before the character that
   was already fetched into CH is examined, so a declaration whose
   closing '>' is the very last character of [BEG, END) backs out
   instead of succeeding.  Left unchanged here; confirm whether
   callers rely on it.  */

static const char *
advance_declaration (const char *beg, const char *end)
{
  const char *p = beg;
  char quote_char = '\0';       /* shut up, gcc! */
  char ch;

  enum {
    AC_S_DONE,                  /* closing '>' seen */
    AC_S_BACKOUT,               /* give up; caller resumes at BEG + 1 */
    AC_S_BANG,                  /* expecting the leading '!' */
    AC_S_DEFAULT,               /* between items inside the declaration */
    AC_S_DCLNAME,               /* inside a declaration name */
    AC_S_DASH1,                 /* at first '-' of an opening "--" */
    AC_S_DASH2,                 /* expecting second '-' of opening "--" */
    AC_S_COMMENT,               /* inside a comment body */
    AC_S_DASH3,                 /* at first '-' of a possible closing "--" */
    AC_S_DASH4,                 /* expecting second '-' of closing "--" */
    AC_S_QUOTE1,                /* at an opening quote character */
    AC_S_IN_QUOTE,              /* inside a quoted string */
    AC_S_QUOTE2                 /* at the closing quote character */
  } state = AC_S_BANG;

  if (beg == end)
    return beg;
  ch = *p++;

  /* It looked like a good idea to write this as a state machine, but
     now I wonder...  */

  while (state != AC_S_DONE && state != AC_S_BACKOUT)
    {
      /* Every state that consumes input does `ch = *p++', so P must
         still be inside [BEG, END) before dispatching.  */
      if (p == end)
        state = AC_S_BACKOUT;
      switch (state)
        {
        case AC_S_DONE:
        case AC_S_BACKOUT:
          break;
        case AC_S_BANG:
          if (ch == '!')
            {
              ch = *p++;
              state = AC_S_DEFAULT;
            }
          else
            state = AC_S_BACKOUT;
          break;
        case AC_S_DEFAULT:
          switch (ch)
            {
            case '-':
              state = AC_S_DASH1;
              break;
            case ' ':
            case '\t':
            case '\r':
            case '\n':
              ch = *p++;
              break;
            case '>':
              state = AC_S_DONE;
              break;
            case '\'':
            case '\"':
              state = AC_S_QUOTE1;
              break;
            default:
              if (NAME_CHAR_P (ch))
                state = AC_S_DCLNAME;
              else
                state = AC_S_BACKOUT;
              break;
            }
          break;
        case AC_S_DCLNAME:
          if (ch == '-')
            state = AC_S_DASH1;
          else if (NAME_CHAR_P (ch))
            ch = *p++;
          else
            state = AC_S_DEFAULT;
          break;
        case AC_S_QUOTE1:
          /* We must use 0x22 because broken assert macros choke on
             '"' and '\"'.  */
          assert (ch == '\'' || ch == 0x22);
          quote_char = ch;      /* cheating -- I really don't feel like
                                   introducing more different states for
                                   different quote characters. */
          ch = *p++;
          state = AC_S_IN_QUOTE;
          break;
        case AC_S_IN_QUOTE:
          if (ch == quote_char)
            state = AC_S_QUOTE2;
          else
            ch = *p++;
          break;
        case AC_S_QUOTE2:
          assert (ch == quote_char);
          ch = *p++;
          state = AC_S_DEFAULT;
          break;
        case AC_S_DASH1:
          assert (ch == '-');
          ch = *p++;
          state = AC_S_DASH2;
          break;
        case AC_S_DASH2:
          switch (ch)
            {
            case '-':
              ch = *p++;
              state = AC_S_COMMENT;
              break;
            default:
              /* A lone '-' does not open a comment.  */
              state = AC_S_BACKOUT;
            }
          break;
        case AC_S_COMMENT:
          switch (ch)
            {
            case '-':
              state = AC_S_DASH3;
              break;
            default:
              ch = *p++;
              break;
            }
          break;
        case AC_S_DASH3:
          assert (ch == '-');
          ch = *p++;
          state = AC_S_DASH4;
          break;
        case AC_S_DASH4:
          switch (ch)
            {
            case '-':
              ch = *p++;
              state = AC_S_DEFAULT;
              break;
            default:
              /* Single '-' inside the comment body; keep scanning.  */
              state = AC_S_COMMENT;
              break;
            }
          break;
        }
    }

  if (state == AC_S_BACKOUT)
    {
#ifdef STANDALONE
      ++comment_backout_count;
#endif
      return beg + 1;
    }
  return p;
}

/* Find the first occurrence of the substring "-->" in [BEG, END) and
   return the pointer to the character after the substring.  If the
   substring is not found, return NULL.  */

static const char *
find_comment_end (const char *beg, const char *end)
{
  /* Open-coded Boyer-Moore-style search for "-->".  P always points at
     the earliest position where the '>' of a not-yet-excluded match
     could be.  Depending on the character found there we can rule out
     one, two or three candidate positions at once.

     P never leaves [BEG + 2, END): the pointer is only advanced after
     checking that the target is still inside the buffer, so no
     out-of-range pointer (such as BEG - 1 or anything beyond END) is
     ever formed.  */
  const char *p;

  /* "-->" needs three characters; this also guarantees that p[-1] and
     p[-2] below stay inside [BEG, END).  */
  if (end - beg < 3)
    return NULL;

  p = beg + 2;
  for (;;)
    {
      int step;

      switch (*p)
        {
        case '>':
          if (p[-1] == '-' && p[-2] == '-')
            return p + 1;
          /* A '>' can be neither the first nor the second character
             of "-->", so the next possible '>' is three ahead.  */
          step = 3;
          break;
        case '-':
          /* "--" ending here: the very next character may be the
             '>'.  Otherwise this '-' can at best be the first dash,
             which puts the '>' two characters ahead.  */
          step = (p[-1] == '-') ? 1 : 2;
          break;
        default:
          /* This character cannot be part of "-->" at all.  */
          step = 3;
          break;
        }

      if (end - p <= step)
        return NULL;
      p += step;
    }
}

/* Return true if the string consisting of the characters inside
   [b, e) is present in hash table HT.  A null HT means "no
   restriction": every name is allowed.  */

static bool
name_allowed (const struct hash_table *ht, const char *b, const char *e)
{
  char *copy;
  if (!ht)
    return true;
  /* BOUNDED_TO_ALLOCA is defined elsewhere; presumably it stores a
     NUL-terminated copy of [b, e) in COPY -- confirm against its
     definition.  */
  BOUNDED_TO_ALLOCA (b, e, copy);
  return hash_table_get (ht, copy) != NULL;
}

/* Advance P (a char pointer), with the explicit intent of being able
   to read the next character.  If this is not possible, go to finish.
   Relies on a variable `end' and a label `finish' being in scope at
   the point of use.  */
#define ADVANCE(p) do {                         \
  ++p;                                          \
  if (p >= end)                                 \
    goto finish;                                \
} while (0)

/* Skip whitespace, if any. */
#define SKIP_WS(p) do {                         \
  while (ISSPACE (*p)) {                        \
    ADVANCE (p);                                \
  }                                             \
} while (0)

/* Skip non-whitespace, if any. */
#define SKIP_NON_WS(p) do {                     \
html-parse.c - 源码说明

本页面展示了「一个从网络上自动下载文件的自由工具」中的 html-parse.c 源码文件，采用 C语言编程语言编写，共 1,074 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与网络相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?