html-url.c

来自「Wget很好的处理了http和ftp的下载,很值得学习的经典代码」· C语言代码 · 共 727 行 · 第 1/2 页
727 行
        /* We've found the index of tag_url_attributes where the           attributes of our tag begin.  */        first = i;        break;      }  assert (first != -1);  /* Loop over the "interesting" attributes of this tag.  In this     example, it will loop over "src" and "lowsrc".       <img src="foo.png" lowsrc="bar.png">     This has to be done in the outer loop so that the attributes are     processed in the same order in which they appear in the page.     This is required when converting links.  */  for (attrind = 0; attrind < tag->nattrs; attrind++)    {      /* Find whether TAG/ATTRIND is a combination that contains a         URL. */      char *link = tag->attrs[attrind].value;      const int size = countof (tag_url_attributes);      /* If you're cringing at the inefficiency of the nested loops,         remember that they both iterate over a very small number of         items.  The worst-case inner loop is for the IMG tag, which         has three attributes.  */      for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)        {          if (0 == strcasecmp (tag->attrs[attrind].name,                               tag_url_attributes[i].attr_name))            {              struct urlpos *up = append_url (link, tag, attrind, ctx);              if (up)                {                  int flags = tag_url_attributes[i].flags;                  if (flags & ATTR_INLINE)                    up->link_inline_p = 1;                  if (flags & ATTR_HTML)                    up->link_expect_html = 1;                }            }        }    }}/* Handle the BASE tag, for <base href=...>. 
*/static voidtag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx){  struct urlpos *base_urlpos;  int attrind;  char *newbase = find_attr (tag, "href", &attrind);  if (!newbase)    return;  base_urlpos = append_url (newbase, tag, attrind, ctx);  if (!base_urlpos)    return;  base_urlpos->ignore_when_downloading = 1;  base_urlpos->link_base_p = 1;  if (ctx->base)    xfree (ctx->base);  if (ctx->parent_base)    ctx->base = uri_merge (ctx->parent_base, newbase);  else    ctx->base = xstrdup (newbase);}/* Mark the URL found in <form action=...> for conversion. */static voidtag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx){  int attrind;  char *action = find_attr (tag, "action", &attrind);  if (action)    {      struct urlpos *up = append_url (action, tag, attrind, ctx);      if (up)        up->ignore_when_downloading = 1;    }}/* Handle the LINK tag.  It requires special handling because how its   links will be followed in -p mode depends on the REL attribute.  */static voidtag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx){  int attrind;  char *href = find_attr (tag, "href", &attrind);  /* All <link href="..."> link references are external, except those     known not to be, such as style sheet and shortcut icon:       <link rel="stylesheet" href="...">       <link rel="shortcut icon" href="...">  */  if (href)    {      struct urlpos *up = append_url (href, tag, attrind, ctx);      if (up)        {          char *rel = find_attr (tag, "rel", NULL);          if (rel              && (0 == strcasecmp (rel, "stylesheet")                  || 0 == strcasecmp (rel, "shortcut icon")))            up->link_inline_p = 1;          else            /* The external ones usually point to HTML pages, such as               <link rel="next" href="..."> */            up->link_expect_html = 1;        }    }}/* Handle the META tag.  
This requires special handling because of the   refresh feature and because of robot exclusion.  */static voidtag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx){  char *name = find_attr (tag, "name", NULL);  char *http_equiv = find_attr (tag, "http-equiv", NULL);  if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))    {      /* Some pages use a META tag to specify that the page be         refreshed by a new page after a given number of seconds.  The         general format for this is:           <meta http-equiv=Refresh content="NUMBER; URL=index2.html">         So we just need to skip past the "NUMBER; URL=" garbage to         get to the URL.  */      struct urlpos *entry;      int attrind;      int timeout = 0;      char *p;      char *refresh = find_attr (tag, "content", &attrind);      if (!refresh)        return;      for (p = refresh; ISDIGIT (*p); p++)        timeout = 10 * timeout + *p - '0';      if (*p++ != ';')        return;      while (ISSPACE (*p))        ++p;      if (!(   TOUPPER (*p)       == 'U'            && TOUPPER (*(p + 1)) == 'R'            && TOUPPER (*(p + 2)) == 'L'            &&          *(p + 3)  == '='))        return;      p += 4;      while (ISSPACE (*p))        ++p;      entry = append_url (p, tag, attrind, ctx);      if (entry)        {          entry->link_refresh_p = 1;          entry->refresh_timeout = timeout;          entry->link_expect_html = 1;        }    }  else if (name && 0 == strcasecmp (name, "robots"))    {      /* Handle stuff like:         <meta name="robots" content="index,nofollow"> */      char *content = find_attr (tag, "content", NULL);      if (!content)        return;      if (!strcasecmp (content, "none"))        ctx->nofollow = true;      else        {          while (*content)            {              /* Find the next occurrence of ',' or the end of                 the string.  
*/              char *end = strchr (content, ',');              if (end)                ++end;              else                end = content + strlen (content);              if (!strncasecmp (content, "nofollow", end - content))                ctx->nofollow = true;              content = end;            }        }    }}/* Dispatch the tag handler appropriate for the tag we're mapping   over.  See known_tags[] for definition of tag handlers.  */static voidcollect_tags_mapper (struct taginfo *tag, void *arg){  struct map_context *ctx = (struct map_context *)arg;  /* Find the tag in our table of tags.  This must not fail because     map_html_tags only returns tags found in interesting_tags.  */  struct known_tag *t = hash_table_get (interesting_tags, tag->name);  assert (t != NULL);  t->handler (t->tagid, tag, ctx);}/* Analyze HTML tags FILE and construct a list of URLs referenced from   it.  It merges relative links in FILE with URL.  It is aware of   <base href=...> and does the right thing.  */struct urlpos *get_urls_html (const char *file, const char *url, bool *meta_disallow_follow){  struct file_memory *fm;  struct map_context ctx;  int flags;  /* Load the file. */  fm = read_file (file);  if (!fm)    {      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));      return NULL;    }  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));  ctx.text = fm->content;  ctx.head = ctx.tail = NULL;  ctx.base = NULL;  ctx.parent_base = url ? url : opt.base_href;  ctx.document_file = file;  ctx.nofollow = false;  if (!interesting_tags)    init_interesting ();  /* Specify MHT_TRIM_VALUES because of buggy HTML generators that     generate <a href=" foo"> instead of <a href="foo"> (browsers     ignore spaces as well.)  If you really mean space, use &32; or     %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,     e.g. in <img src="foo.[newline]html">.  
Such newlines are also     ignored by IE and Mozilla and are presumably introduced by     writing HTML with editors that force word wrap.  */  flags = MHT_TRIM_VALUES;  if (opt.strict_comments)    flags |= MHT_STRICT_COMMENTS;  map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,                 interesting_tags, interesting_attributes);  DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));  if (meta_disallow_follow)    *meta_disallow_follow = ctx.nofollow;  xfree_null (ctx.base);  read_file_free (fm);  return ctx.head;}/* This doesn't really have anything to do with HTML, but it's similar   to get_urls_html, so we put it here.  */struct urlpos *get_urls_file (const char *file){  struct file_memory *fm;  struct urlpos *head, *tail;  const char *text, *text_end;  /* Load the file.  */  fm = read_file (file);  if (!fm)    {      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));      return NULL;    }  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));  head = tail = NULL;  text = fm->content;  text_end = fm->content + fm->length;  while (text < text_end)    {      int up_error_code;      char *url_text;      struct urlpos *entry;      struct url *url;      const char *line_beg = text;      const char *line_end = memchr (text, '\n', text_end - text);      if (!line_end)        line_end = text_end;      else        ++line_end;      text = line_end;      /* Strip whitespace from the beginning and end of line. */      while (line_beg < line_end && ISSPACE (*line_beg))        ++line_beg;      while (line_end > line_beg && ISSPACE (*(line_end - 1)))        --line_end;      if (line_beg == line_end)        continue;      /* The URL is in the [line_beg, line_end) region. */      /* We must copy the URL to a zero-terminated string, and we         can't use alloca because we're in a loop.  *sigh*.  
*/      url_text = strdupdelim (line_beg, line_end);      if (opt.base_href)        {          /* Merge opt.base_href with URL. */          char *merged = uri_merge (opt.base_href, url_text);          xfree (url_text);          url_text = merged;        }      url = url_parse (url_text, &up_error_code);      if (!url)        {          logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),                     file, url_text, url_error (up_error_code));          xfree (url_text);          continue;        }      xfree (url_text);      entry = xnew0 (struct urlpos);      entry->url = url;      if (!head)        head = entry;      else        tail->next = entry;      tail = entry;    }  read_file_free (fm);  return head;}voidcleanup_html_url (void){  /* Destroy the hash tables.  The hash table keys and values are not     allocated by this code, so we don't need to free them here.  */  if (interesting_tags)    hash_table_destroy (interesting_tags);  if (interesting_attributes)    hash_table_destroy (interesting_attributes);}
html-url.c - 源码说明

本页面展示了「Wget很好的处理了http和ftp的下载,很值得学习的经典代码」中的 html-url.c 源码文件，采用 C语言编程语言编写，共 727 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Wget相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?