
convert.c

GNU Wget (command-line download utility) source code
Language: C
Page 1 of 2
/* Conversion of links to local files.
   Copyright (C) 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables.  You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL".  If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so.  If you do not wish to do
so, delete this exception statement from your version.  */

#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include <sys/types.h>

#include "wget.h"
#include "convert.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "hash.h"

static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run, used for link
   conversion after Wget is done.  The list and the set contain the
   same information, except the list maintains the order.  Perhaps I
   should get rid of the list, it's there for historical reasons.  */
static slist *downloaded_html_list;
struct hash_table *downloaded_html_set;

static void convert_links PARAMS ((const char *, struct urlpos *));

/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file.  And the other URLs get converted to the remote URL on
   the server.

   All the downloaded HTMLs are kept in downloaded_html_files, and
   downloaded URLs in urls_downloaded.  All the information is
   extracted from these two lists.  */

void
convert_all_links (void)
{
  slist *html;
  long msecs;
  int file_count = 0;

  struct wget_timer *timer = wtimer_new ();

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_list = slist_nreverse (downloaded_html_list);

  for (html = downloaded_html_list; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;
      char *file = html->string;

      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, file);
      if (!url)
        {
          DEBUGP (("Apparently %s has been removed.\n", file));
          continue;
        }

      DEBUGP (("Scanning %s (from %s)\n", file, url));

      /* Parse the HTML file...  */
      urls = get_urls_html (file, url, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);

          /* Decide on the conversion type.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
              DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
              DEBUGP (("will convert url %s to complete\n", u->url));
            }
        }

      /* Convert the links in the file.  */
      convert_links (file, urls);
      ++file_count;

      /* Free the data.  */
      free_urlpos (urls);
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);
}

static void write_backup_file PARAMS ((const char *, downloaded_file_t));
static const char *replace_attr PARAMS ((const char *, int, FILE *,
                                         const char *));
static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
                                                      const char *, int));
static char *local_quote_string PARAMS ((const char *));
static char *construct_relative PARAMS ((const char *, const char *));
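/* [Editor's illustrative sketch -- not part of the original convert.c.
   A minimal, self-contained model of the direction decision made in
   convert_all_links() above: URLs found in the downloaded map are
   converted to relative links pointing at local files, all others are
   converted to complete (absolute) URLs.  The table, names, and guard
   macro here are hypothetical stand-ins for dl_url_file_map; compile
   with -DCONVERT_DIRECTION_DEMO to include it.  */
#ifdef CONVERT_DIRECTION_DEMO
#include <string.h>
#include <stddef.h>

enum demo_convert { DEMO_TO_RELATIVE, DEMO_TO_COMPLETE };

struct demo_entry { const char *url; const char *local_name; };

/* Hypothetical stand-in for the dl_url_file_map hash table. */
static const struct demo_entry demo_map[] = {
  { "http://example.com/index.html", "example.com/index.html" },
  { "http://example.com/pics/a.gif", "example.com/pics/a.gif" },
};

static enum demo_convert
demo_decide_direction (const char *url, const char **local_name)
{
  size_t i;
  for (i = 0; i < sizeof demo_map / sizeof demo_map[0]; i++)
    if (strcmp (demo_map[i].url, url) == 0)
      {
        /* Downloaded: point the link at the local copy.  */
        *local_name = demo_map[i].local_name;
        return DEMO_TO_RELATIVE;
      }
  /* Not downloaded: complete the link so it still works remotely.  */
  *local_name = NULL;
  return DEMO_TO_COMPLETE;
}
#endif /* CONVERT_DIRECTION_DEMO */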
/* Change the links in one HTML file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
   the conversion.  */

static void
convert_links (const char *file, struct urlpos *links)
{
  struct file_memory *fm;
  FILE *fp;
  const char *p;
  downloaded_file_t downloaded_file_return;

  struct urlpos *link;
  int to_url_count = 0, to_file_count = 0;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

  {
    /* First we do a "dry run": go through the list L and see whether
       any URL needs to be converted in the first place.  If not, just
       leave the file alone.  */
    int dry_count = 0;
    struct urlpos *dry = links;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)
        ++dry_count;
    if (!dry_count)
      {
        logputs (LOG_VERBOSE, _("nothing to do.\n"));
        return;
      }
  }

  fm = read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      return;
    }

  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
    {
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
      return;
    }

  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
  if (!fp)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
      return;
    }

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  p = fm->content;
  for (link = links; link; link = link->next)
    {
      char *url_start = fm->content + link->pos;

      if (link->pos >= fm->length)
        {
          DEBUGP (("Something strange is going on.  Please investigate."));
          break;
        }
      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
        {
          DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
          continue;
        }

      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);
      p = url_start;

      switch (link->convert)
        {
        case CO_CONVERT_TO_RELATIVE:
          /* Convert absolute URL to relative. */
          {
            char *newname = construct_relative (file, link->local_name);
            char *quoted_newname = local_quote_string (newname);

            if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newname);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
                                             link->refresh_timeout);

            DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                     link->url->url, newname, link->pos, file));
            xfree (newname);
            xfree (quoted_newname);
            ++to_file_count;
            break;
          }
        case CO_CONVERT_TO_COMPLETE:
          /* Convert the link to absolute URL. */
          {
            char *newlink = link->url->url;
            char *quoted_newlink = html_quote_string (newlink);

            if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newlink);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
                                             link->refresh_timeout);

            DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                     newlink, link->pos, file));

            xfree (quoted_newlink);
            ++to_url_count;
            break;
          }
        case CO_NULLIFY_BASE:
          /* Change the base href to "". */
          p = replace_attr (p, link->size, fp, "");
          break;
        case CO_NOCONVERT:
          abort ();
          break;
        }
    }

  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  fclose (fp);
  read_file_free (fm);

  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
}
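/* [Editor's illustrative sketch -- not part of the original convert.c.
   A standalone demonstration of the unlink-before-fopen idiom that
   convert_links() relies on above: because the old contents may be
   mmaped, the file is unlinked first so that truncating and rewriting
   the same name does not destroy the region still being read.  The
   guard macro and file name are hypothetical; compile this separately
   with -DUNLINK_REWRITE_DEMO.  Error handling is minimal.  */
#ifdef UNLINK_REWRITE_DEMO
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

int
main (int argc, char **argv)
{
  struct stat st;
  int fd;
  char *mem;
  FILE *out;

  if (argc != 2)
    return 1;
  fd = open (argv[1], O_RDONLY);
  if (fd < 0 || fstat (fd, &st) < 0 || st.st_size == 0)
    return 1;
  mem = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
  if (mem == MAP_FAILED)
    return 1;

  /* Unlink first: opening the same name with "wb" would otherwise
     truncate the inode the mapping still refers to.  The mapping and
     the open FD keep the old data alive until we are done with it.  */
  if (unlink (argv[1]) < 0)
    return 1;
  out = fopen (argv[1], "wb");
  if (!out)
    return 1;
  fwrite (mem, 1, st.st_size, out);  /* a real rewrite would transform this */
  fclose (out);
  munmap (mem, st.st_size);
  close (fd);
  return 0;
}
#endif /* UNLINK_REWRITE_DEMO */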
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */

static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  i = cnt = 0;
  /* Skip the directories common to both strings.  */
  while (1)
    {
      while (s1[i] && s2[i]
             && (s1[i] == s2[i])
             && (s1[i] != '/')
             && (s2[i] != '/'))
        ++i;
      if (s1[i] == '/' && s2[i] == '/')
        cnt = ++i;
      else
        break;
    }
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
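/* [Editor's illustrative sketch -- not part of the original convert.c.
   A minimal check of construct_relative() on the two examples from
   the comment above.  It assumes this file is compiled with the
   hypothetical -DCONSTRUCT_RELATIVE_TEST guard and linked with the
   usual Wget support code (xmalloc, xfree).  */
#ifdef CONSTRUCT_RELATIVE_TEST
int
main (void)
{
  char *r1 = construct_relative ("jagor.srce.hr/index.html",
                                 "jagor.srce.hr/images/news.gif");
  char *r2 = construct_relative ("fly.cc.fer.hr/ioccc/index.html",
                                 "fly.cc.fer.hr/images/fly.gif");

  /* Common directories are stripped; one "../" per directory left in
     the referring file's path.  */
  assert (strcmp (r1, "images/news.gif") == 0);
  assert (strcmp (r2, "../images/fly.gif") == 0);
  xfree (r1);
  xfree (r2);
  return 0;
}
#endif /* CONSTRUCT_RELATIVE_TEST */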
static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations. */

  /* Construct the backup filename as the original name plus ".orig". */
  size_t         filename_len = strlen(file);
  char*          filename_plus_orig_suffix;
  boolean        already_wrote_backup_file = FALSE;
  slist*         converted_file_ptr;
  static slist*  converted_files = NULL;

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
    {
      /* Just write "orig" over "html".  We need to do it this way
         because when we're checking to see if we've downloaded the
         file before (to see if we can skip downloading it), we don't
         know if it's a text/html file.  Therefore we don't know yet
         at that stage that -E is going to cause us to tack on
         ".html", so we need to compare vs. the original URL plus
         ".orig", not the original URL plus ".html.orig". */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy(filename_plus_orig_suffix, file);
      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
    }
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
    {
      /* Append ".orig" to the name. */
      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
      strcpy(filename_plus_orig_suffix, file);
      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
    }

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file. */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp(converted_file_ptr->string, file) == 0)
      {
        already_wrote_backup_file = TRUE;
        break;
      }
    else
      converted_file_ptr = converted_file_ptr->next;

  if (!already_wrote_backup_file)
    {
      /* Rename <file> to <file>.orig before former gets written over. */
      if (rename(file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
         Note that we never free this memory since we need it till the
         convert_all_links() call, which is one of the last things the
         program does before terminating.  BTW, I'm not sure if it would be
         safe to just set 'converted_file_ptr->string' to 'file' below,
         rather than making a copy of the string...  Another note is that I
         thought I could just add a field to the urlpos structure saying
         that we'd written a .orig file for this URL, but that didn't work,
         so I had to make this separate list.
         -- Dan Harkless <wget@harkless.org>

         This [adding a field to the urlpos structure] didn't work
         because convert_file() is called from convert_all_links at
         the end of the retrieval with a freshly built new urlpos
         list.
         -- Hrvoje Niksic <hniksic@xemacs.org>
      */
      converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
      converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
    }
}

static int find_fragment PARAMS ((const char *, int, const char **,
                                  const char **));

/* Replace an attribute's original text with NEW_TEXT. */

static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  int quote_flag = 0;
  char quote_char = '\"';       /* use "..." for quoting, unless the
                                   original value is quoted, in which
                                   case reuse its quoting char. */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
