📄 httpenum-depth.c
字号:
static char rcsid[] = "$Id: httpenum-depth.c,v 2.8 2000/02/03 12:45:56 sxw Exp $";/* * httpenum-depth.c - Depth First RootNode URL enumerator for HTTP URLs * * Usage: httpenum-depth http-URL * * Outputs the following format: * * URL of tree root * URL <tab> md5 * ... * URL <tab> md5 * * DEBUG: section 42, level 1, 5, 9 Gatherer enumeration for HTTP * AUTHOR: Harvest derived * * Harvest Indexer http://harvest.sourceforge.net/ * ----------------------------------------------- * * The Harvest Indexer is a continued development of code developed by * the Harvest Project. Development is carried out by numerous individuals * in the Internet community, and is not officially connected with the * original Harvest Project or its funding sources. * * Please mail lee@arco.de if you are interested in participating * in the development effort. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */#include <stdio.h>#include <string.h>#include <signal.h>#include <stdlib.h>#include <gdbm.h>#include "util.h"#include "url.h"#define PUBLIC extern#include "filter.h"typedef struct _list_t { void *ptr; struct _list_t *next;} list_t;/* * define HOST_COUNT_IP to 'count' visited hosts based on IP, not the * given hostname. This way aliased machines will be properly * enumerated * If you define HOST_COUNT_IP then support for HTTP/1.1 virtual hosts * and for hosts that have more than one IP address (for round-robin * load balancing) may not work */#undef HOST_COUNT_IP/* Global variables */int max_depth = 0;int cur_depth = 0;int depth_hist[100];/* Local variables */static int url_max = 0;static int nurls = 0;static int host_max = 0;static int nhosts = 0;static char *tree_root = NULL;static char *urldb_filename = NULL;static char *hostdb_filename = NULL;static char *md5db_filename = NULL;static GDBM_FILE urldbf = NULL;static GDBM_FILE hostdbf = NULL;static GDBM_FILE md5dbf = NULL;static FILE *not_visited = NULL;/* Local functions */static void process_url ();static void usage ();static void mark_retrieved ();static void mark_failed ();static void sigdie ();static int url_in_db ();static int md5_in_db ();static list_t *http_enum ();/* From robots-txt.c */extern int RobotsTxtCheck _PARAMS ((URL *));/* ---------------------------------------------------------------------- *//* * mark_failed() - Mark that a URL failed to be retrieved, so that the * enumerator doesn't try it again. This option may not be wanted by * some users and so should be configurable. */static voidmark_failed (URL * up){ datum k, d; Debug (42, 9, ("mark_failed: url='%s'", up->url)); k.dptr = xstrdup (up->url); k.dsize = strlen (k.dptr) + 1; d.dptr = xstrdup ("FailedAccess"); d.dsize = strlen (d.dptr) + 1; if (!gdbm_exists (urldbf, k) && gdbm_store (urldbf, k, d, GDBM_INSERT)) fatal ("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror (gdbm_errno)); xfree (k.dptr); xfree (d.dptr);}/* * mark_retrieved() - Mark that the given URL was successfully retrieved, * so that the URL is not retrieved again. This prevents cycles in the * enumeration. */static voidmark_retrieved (up)URL *up;{ datum k, d; Debug (42, 9, ("mark_retrieved: url='%s', md5='%s'\n", up->url, up->md5)); k.dptr = xstrdup (up->url); k.dsize = strlen (k.dptr) + 1; d.dptr = xstrdup (up->md5); d.dsize = strlen (d.dptr) + 1; if (!gdbm_exists (urldbf, k) && gdbm_store (urldbf, k, d, GDBM_INSERT)) fatal ("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror (gdbm_errno)); if (!gdbm_exists (md5dbf, d) && gdbm_store (md5dbf, d, k, GDBM_INSERT)) fatal ("GDBM MD5DB: %s: %s", k.dptr, gdbm_strerror (gdbm_errno)); xfree (k.dptr); xfree (d.dptr); if (up->redir_from_url != (char *) NULL) { Debug (42, 9, ("mark_retrieved: url='%s', md5='%s'\n", up->redir_from_url, up->md5)); k.dptr = xstrdup (up->redir_from_url); k.dsize = strlen (k.dptr) + 1; d.dptr = xstrdup (up->md5); d.dsize = strlen (d.dptr) + 1; if (!gdbm_exists (urldbf, k) && gdbm_store (urldbf, k, d, GDBM_INSERT)) fatal ("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror (gdbm_errno)); xfree (k.dptr); xfree (d.dptr); } if (nurls++ >= url_max) { Log ("Truncating RootNode %s at %d LeafNode URLs\n", tree_root, url_max); url_close (up); up = NULL; sigdie (0); }}/* * url_in_db() - check to see if the URL is in the database */static inturl_in_db (url)char *url;{ datum k; int r; Debug (42, 9, ("url_in_db: checking for url='%s'\n", url)); k.dptr = xstrdup (url); k.dsize = strlen (k.dptr) + 1; r = gdbm_exists (urldbf, k); xfree (k.dptr); return (r);}/* * md5_in_db() - check to see if the MD5 is in the database */static intmd5_in_db (md5)char *md5;{ datum k; int r; k.dptr = xstrdup (md5); k.dsize = strlen (k.dptr) + 1; r = gdbm_exists (md5dbf, k); xfree (k.dptr); return (r);}/* * host_in_db() - check to see if the host is in the database */static inthost_in_db (host)char *host;{ datum k; int r;#ifdef HOST_COUNT_IP Host *h; h = get_host (host); if (!h) return 0; k.dptr = xstrdup (h->dotaddr);#else k.dptr = xstrdup (host);#endif k.dsize = strlen (k.dptr) + 1; r = gdbm_exists (hostdbf, k); xfree (k.dptr); return (r);}/* * visit_server() - Determine if we should visit the server. Return * zero if we should not process the URL; otherwise, return non-zero. */static intvisit_server (up)URL *up;{ datum k, d;#ifdef HOST_COUNT_IP Host *h = NULL;#endif if (host_in_db (up->host)) /* Host is already in the db */ return (1); if (++nhosts > host_max) return (0);#ifdef HOST_COUNT_IP h = get_host (up->host); if (!h) return (0); k.dptr = xstrdup (h->dotaddr);#else k.dptr = xstrdup (up->host);#endif k.dsize = strlen (k.dptr) + 1; d.dptr = xstrdup (up->url); d.dsize = strlen (d.dptr) + 1; if (gdbm_store (hostdbf, k, d, GDBM_INSERT)) fatal ("GDBM HOSTDB: %s: %s", k.dptr, gdbm_strerror (gdbm_errno)); xfree (k.dptr); xfree (d.dptr); return (1);}inturl_is_allowed (up)URL *up;{ int y; Debug (42, 1, ("Checking URL: %s\nHost: %s\n", up->url, up->host)); if (url_in_db (up->url)) { /* Have we been here? */ Debug (42, 1, ("Already Visited URL: %s\n", up->url)); return 0; } if (y = filter_selection (up)) { /* Match the URL based on REs */ Debug (42, 1, ("Removing Candidate: [%s] %s\n", Filter_Type_Name[y], up->url)); if (not_visited) fprintf (not_visited, "[FILTER] %s\n", up->url); return 0; } if (!visit_server (up)) { /* Can we visit this server? */ Debug (42, 1, ("Disallowed to Visit Server: %s\n", up->url)); if (not_visited) fprintf (not_visited, "[SERVER] %s\n", up->url); return 0; } if (!RobotsTxtCheck (up)) { /* Is it disallowed by the robots.txt file */ Debug (42, 1, ("Disallowed by robots.txt file: %s\n", up->url)); if (not_visited) fprintf (not_visited, "[ROBOTS.TXT] %s\n", up->url); return 0; } return 1;}/* * http_enum() - Returns a linked list of all the URLs in this object, * or NULL on error. Checks for "text/html" and "text/x-soif" in MIME * headers and then runs "HTMLurls" on the file or urldb_getrefs() on the URL. */static list_t *http_enum (up)URL *up;{ list_t *head = NULL; list_t **Tail = NULL; list_t *l = NULL; FILE *fp = NULL; char *s = NULL; char *t0 = NULL; char *t1 = NULL; char *t2 = NULL; URL *tup = NULL; char buf[BUFSIZ]; int pipefds[2]; int pid; int err; int status; int count = 0; int nurls = 0; /* Check to see if we're allowed to visit the URL */ if (!url_is_allowed (up)) return (NULL); /* * Ack. Check for symbolic link loops in server generated HTML listings * Do this by comparing the last two pathname components. If they are * the same then guess its a loop. */ s = xstrdup (up->pathname); t0 = t1 = t2 = NULL; for (t0 = strtok (s, "/"); t0; t0 = strtok (NULL, "/")) { t2 = t1; t1 = t0; } if (t1 != NULL && t2 != NULL) { if (strcmp (t1, t2) == 0) { Debug (42, 1, ("Possible symlink loop: %s\n", up->url)); xfree (s); s = NULL; return (NULL); } } xfree (s); s = NULL; /* Recurse over the redirect chain */ while (((status = url_retrieve (up)) == -1) && count < HTTP_MAX_REDIRECTS) { count++; if (!url_is_allowed (up)) return (NULL); } if (status) { /* Grab the URL; success? */ Debug (42, 1, ("Cannot Retrieve URL: %s\n", up->url));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -