url.c

来自「Wget很好的处理了http和ftp的下载,很值得学习的经典代码」· C语言代码 · 共 2,138 行 · 第 1/5 页
2,138 行
#define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')/* FN_QUERY_SEP is the separator between the file name and the URL   query, normally '?'.  Since Windows cannot handle '?' as part of   file name, we use '@' instead there.  */#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')/* Quote path element, characters in [b, e), as file name, and append   the quoted string to DEST.  Each character is quoted as per   file_unsafe_char and the corresponding table.   If ESCAPED is true, the path element is considered to be   URL-escaped and will be unescaped prior to inspection.  */static voidappend_uri_pathel (const char *b, const char *e, bool escaped,                   struct growable *dest){  const char *p;  int quoted, outlen;  int mask;  if (opt.restrict_files_os == restrict_unix)    mask = filechr_not_unix;  else    mask = filechr_not_windows;  if (opt.restrict_files_ctrl)    mask |= filechr_control;  /* Copy [b, e) to PATHEL and URL-unescape it. */  if (escaped)    {      char *unescaped;      BOUNDED_TO_ALLOCA (b, e, unescaped);      url_unescape (unescaped);      b = unescaped;      e = unescaped + strlen (unescaped);    }  /* Defang ".." when found as component of path.  Remember that path     comes from the URL and might contain malicious input.  */  if (e - b == 2 && b[0] == '.' && b[1] == '.')    {      b = "%2E%2E";      e = b + 6;    }  /* Walk the PATHEL string and check how many characters we'll need     to quote.  */  quoted = 0;  for (p = b; p < e; p++)    if (FILE_CHAR_TEST (*p, mask))      ++quoted;  /* Calculate the length of the output string.  e-b is the input     string length.  Each quoted char introduces two additional     characters in the string, hence 2*quoted.  */  outlen = (e - b) + (2 * quoted);  GROW (dest, outlen);  if (!quoted)    {      /* If there's nothing to quote, we can simply append the string         without processing it again.  */      memcpy (TAIL (dest), b, outlen);    }  else    {      char *q = TAIL (dest);      for (p = b; p < e; p++)        {          if (!FILE_CHAR_TEST (*p, mask))            *q++ = *p;          else            {              unsigned char ch = *p;              *q++ = '%';              *q++ = XNUM_TO_DIGIT (ch >> 4);              *q++ = XNUM_TO_DIGIT (ch & 0xf);            }        }      assert (q - TAIL (dest) == outlen);    }    /* Perform inline case transformation if required.  */  if (opt.restrict_files_case == restrict_lowercase      || opt.restrict_files_case == restrict_uppercase)    {      char *q;      for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)        {          if (opt.restrict_files_case == restrict_lowercase)            *q = TOLOWER (*q);          else            *q = TOUPPER (*q);        }    }            TAIL_INCR (dest, outlen);}/* Append to DEST the directory structure that corresponds the   directory part of URL's path.  For example, if the URL is   http://server/dir1/dir2/file, this appends "/dir1/dir2".   Each path element ("dir1" and "dir2" in the above example) is   examined, url-unescaped, and re-escaped as file name element.   Additionally, it cuts as many directories from the path as   specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it   will produce "bar" for the above example.  For 2 or more, it will   produce "".   Each component of the path is quoted for use as file name.  */static voidappend_dir_structure (const struct url *u, struct growable *dest){  char *pathel, *next;  int cut = opt.cut_dirs;  /* Go through the path components, de-URL-quote them, and quote them     (if necessary) as file names.  */  pathel = u->path;  for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)    {      if (cut-- > 0)        continue;      if (pathel == next)        /* Ignore empty pathels.  */        continue;      if (dest->tail)        append_char ('/', dest);      append_uri_pathel (pathel, next, true, dest);    }}/* Return a unique file name that matches the given URL as good as   possible.  Does not create directories on the file system.  */char *url_file_name (const struct url *u){  struct growable fnres;        /* stands for "file name result" */  const char *u_file, *u_query;  char *fname, *unique;  fnres.base = NULL;  fnres.size = 0;  fnres.tail = 0;  /* Start with the directory prefix, if specified. */  if (opt.dir_prefix)    append_string (opt.dir_prefix, &fnres);  /* If "dirstruct" is turned on (typically the case with -r), add     the host and port (unless those have been turned off) and     directory structure.  */  if (opt.dirstruct)    {      if (opt.protocol_directories)        {          if (fnres.tail)            append_char ('/', &fnres);          append_string (supported_schemes[u->scheme].name, &fnres);        }      if (opt.add_hostdir)        {          if (fnres.tail)            append_char ('/', &fnres);          if (0 != strcmp (u->host, ".."))            append_string (u->host, &fnres);          else            /* Host name can come from the network; malicious DNS may               allow ".." to be resolved, causing us to write to               "../<file>".  Defang such host names.  */            append_string ("%2E%2E", &fnres);          if (u->port != scheme_default_port (u->scheme))            {              char portstr[24];              number_to_string (portstr, u->port);              append_char (FN_PORT_SEP, &fnres);              append_string (portstr, &fnres);            }        }      append_dir_structure (u, &fnres);    }  /* Add the file name. */  if (fnres.tail)    append_char ('/', &fnres);  u_file = *u->file ? u->file : "index.html";  append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);  /* Append "?query" to the file name. */  u_query = u->query && *u->query ? u->query : NULL;  if (u_query)    {      append_char (FN_QUERY_SEP, &fnres);      append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);    }  /* Zero-terminate the file name. */  append_char ('\0', &fnres);  fname = fnres.base;  /* Check the cases in which the unique extensions are not used:     1) Clobbering is turned off (-nc).     2) Retrieval with regetting.     3) Timestamping is used.     4) Hierarchy is built.     The exception is the case when file does exist and is a     directory (see `mkalldirs' for explanation).  */  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)      && !(file_exists_p (fname) && !file_non_directory_p (fname)))    return fname;  unique = unique_name (fname, true);  if (unique != fname)    xfree (fname);  return unique;}/* Resolve "." and ".." elements of PATH by destructively modifying   PATH and return true if PATH has been modified, false otherwise.   The algorithm is in spirit similar to the one described in rfc1808,   although implemented differently, in one pass.  To recap, path   elements containing only "." are removed, and ".." is taken to mean   "back up one element".  Single leading and trailing slashes are   preserved.   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive   test examples are provided below.  If you change anything in this   function, run test_path_simplify to make sure you haven't broken a   test case.  */static boolpath_simplify (char *path){  char *h = path;               /* hare */  char *t = path;               /* tortoise */  char *end = strchr (path, '\0');  while (h < end)    {      /* Hare should be at the beginning of a path element. */      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))        {          /* Ignore "./". */          h += 2;        }      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))        {          /* Handle "../" by retreating the tortoise by one path             element -- but not past beggining.  */          if (t > path)            {              /* Move backwards until T hits the beginning of the                 previous path element or the beginning of path. */              for (--t; t > path && t[-1] != '/'; t--)                ;            }          h += 3;        }      else        {          /* A regular path element.  If H hasn't advanced past T,             simply skip to the next path element.  Otherwise, copy             the path element until the next slash.  */          if (t == h)            {              /* Skip the path element, including the slash.  */              while (h < end && *h != '/')                t++, h++;              if (h < end)                t++, h++;            }          else            {              /* Copy the path element, including the final slash.  */              while (h < end && *h != '/')                *t++ = *h++;              if (h < end)                *t++ = *h++;            }        }    }  if (t != h)    *t = '\0';  return t != h;}/* Return the length of URL's path.  Path is considered to be   terminated by one or more of the ?query or ;params or #fragment,   depending on the scheme.  */static const char *path_end (const char *url){  enum url_scheme scheme = url_scheme (url);  const char *seps;  if (scheme == SCHEME_INVALID)    scheme = SCHEME_HTTP;       /* use http semantics for rel links */  /* +2 to ignore the first two separators ':' and '/' */  seps = init_seps (scheme) + 2;  return strpbrk_or_eos (url, seps);}/* Find the last occurrence of character C in the range [b, e), or   NULL, if none are present.  */#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))/* Merge BASE with LINK and return the resulting URI.   Either of the URIs may be absolute or relative, complete with the   host name, or path only.  This tries to reasonably handle all   foreseeable cases.  It only employs minimal URL parsing, without   knowledge of the specifics of schemes.   I briefly considered making this function call path_simplify after   the merging process, as rfc1738 seems to suggest.  This is a bad   idea for several reasons: 1) it complexifies the code, and 2)   url_parse has to simplify path anyway, so it's wasteful to boot.  */char *uri_merge (const char *base, const char *link){  int linklength;  const char *end;  char *merge;  if (url_has_scheme (link))    return xstrdup (link);  /* We may not examine BASE past END. */  end = path_end (base);  linklength = strlen (link);  if (!*link)    {      /* Empty LINK points back to BASE, query string and all. */      return xstrdup (base);    }  else if (*link == '?')    {      /* LINK points to the same location, but changes the query         string.  Examples: */      /* uri_merge("path",         "?new") -> "path?new"     */      /* uri_merge("path?foo",     "?new") -> "path?new"     */      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */      /* uri_merge("path#foo",     "?new") -> "path?new"     */      int baselength = end - base;      merge = xmalloc (baselength + linklength + 1);      memcpy (merge, base, baselength);      memcpy (merge + baselength, link, linklength);      merge[baselength + linklength] = '\0';    }  else if (*link == '#')    {      /* uri_merge("path",         "#new") -> "path#new"     */      /* uri_merge("path#foo",     "#new") -> "path#new"     */      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */      int baselength;      const char *end1 = strchr (base, '#');      if (!end1)        end1 = base + strlen (base);      baselength = end1 - base;      merge = xmalloc (baselength + linklength + 1);      memcpy (merge, base, baselength);      memcpy (merge + baselength, link, linklength);      merge[baselength + linklength] = '\0';    }  else if (*link == '/' && *(link + 1) == '/')    {      /* LINK begins with "//" and so is a net path: we need to         replace everything after (and including) the double slash         with LINK. */      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */      int span;      const char *slash;      const char *start_insert;      /* Look for first slash. */      slash = memchr (base, '/', end - base);      /* If found slash and it is a double slash, then replace         from this point, else default to replacing from the         beginning.  */      if (slash && *(slash + 1) == '/')        start_insert = slash;      else        start_insert = base;      span = start_insert - base;      merge = xmalloc (span + linklength + 1);      if (span)        memcpy (merge, base, span);      memcpy (merge + span, link, linklength);      merge[span + linklength] = '\0';    }  else if (*link == '/')    {      /* LINK is an absolute path: we need to replace everything         after (and including) the FIRST slash with LINK.         So, if BASE is "http://host/whatever/foo/bar", and LINK is         "/qux/xyzzy", our result should be         "http://host/qux/xyzzy".  */      int span;      const char *slash;      const char *start_insert = NULL; /* for gcc to shut up. */      const char *pos = base;      bool seen_slash_slash = false;
url.c - 源码说明

本页面展示了「Wget很好的处理了http和ftp的下载,很值得学习的经典代码」中的 url.c 源码文件，采用 C语言编程语言编写，共 2,138 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Wget相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?