httpenum-breadth.c
    int y;

    if ((tup = url_open (url)) == NULL)
        return 0;
    if (url_in_db (tup->url)) {         /* Have we been here? */
        Debug (42, 1, ("Already Visited URL: %s\n", tup->url));
        url_close (tup);
        return 0;
    }
    if ((y = filter_selection (tup))) {
        Debug (42, 1, ("Removing Candidate: [%s] %s\n",
                Filter_Type_Name[y], tup->url));
        if (not_visited)
            fprintf (not_visited, "[FILTER] %s\n", tup->url);
        url_close (tup);
        return 0;
    }
    if (!visit_server (tup)) {
        Debug (42, 1, ("Server count exceeded: %s\n", tup->url));
        if (not_visited)
            fprintf (not_visited, "[SERVER] %s\n", tup->url);
        url_close (tup);
        return 0;
    }
    if (!RobotsTxtCheck (tup)) {
        Debug (42, 1, ("Disallowed by robots.txt: %s\n", tup->url));
        if (not_visited)
            fprintf (not_visited, "[ROBOTS.TXT] %s\n", tup->url);
        url_close (tup);
        return 0;
    }
    url_close (tup);
    return 1;
}

/*
 * http_enum() - Builds a linked list of all the URLs in this object,
 * returns -1 on error.  Checks for "text/html" and "text/x-soif" in MIME
 * headers and then runs "HTMLurls" on the file or urldb_getrefs() on the URL.
 */
static int
http_enum (up, depth)
URL *up;
int depth;
{
    FILE *fp = NULL;
#if 0                           /* kjl/7mar2002 */
    char *enum_url = NULL;
#endif
    char *s = NULL;
    char *t0 = NULL;
    char *t1 = NULL;
    char *t2 = NULL;
    int err;
    int nurls = 0;
    int pid;
    int pipefds[2];
    int status;
    int count = 0;
    char *argv[5];
    static char buf[BUFSIZ];

    if (url_in_db (up->url)) {          /* Have we been here? */
        Debug (42, 1, ("Already Visited URL: %s\n", up->url));
        return 0;
    }
    /*
     * Ack.  Check for symbolic link loops in server generated HTML listings.
     * Do this by comparing the last two pathname components.  If they are
     * the same then guess it's a loop.
     */
    s = xstrdup (up->pathname);
    t0 = t1 = t2 = NULL;
    for (t0 = strtok (s, "/"); t0; t0 = strtok (NULL, "/")) {
        t2 = t1;
        t1 = t0;
    }
    if (t1 != NULL && t2 != NULL) {
        if (strcmp (t1, t2) == 0) {
            Debug (42, 1, ("Possible symlink loop: %s\n", up->url));
            xfree (s);
            s = NULL;
            return 0;
        }
    }
    xfree (s);
    s = NULL;

    while (((status = url_retrieve (up)) == -1) && count < HTTP_MAX_REDIRECTS) {
        count++;
        if (!url_is_allowed (up->url))
            return 0;
    }
    if (status) {                       /* Grab the URL; success? */
        Debug (42, 1, ("Cannot Retrieve URL: %s\n", up->url));
#ifdef DONT_RETRY_FAILS
        mark_failed (up);
#endif
        return 0;
    }
    if (up->md5 && md5_in_db (up->md5)) {       /* Have we been here? */
        Debug (42, 1, ("Already Visited MD5: %s\n", up->url));
        return 0;
    }
    /* Remember that we've been here before */
    if (up->md5)
        mark_retrieved (up);

    argv[0] = argv[1] = argv[2] = argv[3] = argv[4] = NULL;

    /* Are we dealing with HTML or SOIF?  If not, we can't get href links */
    if (up->http_mime_hdr != NULL) {
        if (strstr (up->http_mime_hdr, "text/html") != NULL) {
            argv[0] = "HTMLurls";
            argv[1] = "--base-url";
            argv[2] = up->url;
            argv[3] = up->filename;
        } else if (strstr (up->http_mime_hdr, "text/x-soif") != NULL) {
            char *refs;
            char *ptr;
            char *abs_ref;
            Log ("%s\tNot Modified\n", up->url);
            refs = urldb_getrefs (up->url);
            if (!refs)
                return -1;
            for (ptr = strtok (refs, "\n"); ptr; ptr = strtok (NULL, "\n")) {
                abs_ref = url_parse_relative (ptr, up->url);
                if (url_is_allowed (abs_ref)) {
                    add_to_list (abs_ref, depth);
                }
                xfree (abs_ref);
            }
            xfree (refs);
            return 1;
        } else {
            fprintf (stdout, "%s\t%s\n", up->url, up->md5);     /* URL <tab> MD5 */
            fflush (stdout);
            return -1;
        }
    }
    /* Extract the HREFs */
    if (pipe (pipefds) < 0) {
        log_errno ("pipe");
        return -1;
    }
    if ((pid = fork ()) < 0) {
        log_errno ("fork");
        close (pipefds[0]);
        close (pipefds[1]);
        return -1;
    }
    if (pid == 0) {                     /* child: HTMLurls */
#if 0                                   /* kjl/7mar2002 */
        enum_url = (char *) xmalloc (strlen (up->url) + 20);
        sprintf (enum_url, "ENUMERATOR_URL=%s", up->url);
        putenv (enum_url);
#endif
        close (pipefds[0]);             /* child won't read from pipe */
        dup2 (pipefds[1], 1);           /* stdout -> write:pipe */
        close (pipefds[1]);             /* close pipe, it's now stdout */
        execvp (argv[0], argv);
        sprintf (buf, "execvp: %s", argv[0]);
        log_errno (buf);
        _exit (1);
    }
    close (pipefds[1]);                 /* parent won't write */
    if ((fp = fdopen (pipefds[0], "r")) == NULL) {
        log_errno ("fdopen");
        return -1;
    }
    /*
     * For each HREF pointer, convert it to a URL, and add it to
     * the global list of URLs to process.
     */
    while (fgets (buf, BUFSIZ, fp) != NULL) {
        if ((s = strrchr (buf, '\n')) != NULL)
            *s = '\0';                  /* strip newline */
        Debug (42, 1, ("Input: %s\n", buf));
        if (url_is_allowed (buf)) {
            add_to_list (buf, depth);
            nurls++;
        }
    }
    fclose (fp);
    close (pipefds[0]);
    /*
     * Wait for HTMLurls to finish and decide whether to index the page
     * according to its exit code.
     */
    if ((err = waitpid (pid, &status, 0)) != pid) {
        Debug (42, 1, ("WARNING: waiting for child %d got %d...\n", pid, err));
    }
    /* If we had an exit code of 0 then index it, otherwise don't */
    if (WEXITSTATUS (status) == 0) {
        fprintf (stdout, "%s\t%s\n", up->url, up->md5);
        fflush (stdout);
    }
    Debug (42, 1, ("Adding %d URLs from %s to workload\n", nurls, up->url));
    return 1;
}

/* ---------------------------------------------------------------------- */

/*
 * initialize() - Basic init routines
 */
static void
initialize ()
{
    char *s = NULL;
    extern int liburl_conform_rfc1738;
    FILE *logfp = NULL;

#ifdef USE_HOST_CACHE
    host_cache_init ();
#endif
    cur_depth = max_depth = url_max = host_max = 0;
    if ((s = getenv ("HARVEST_URL_MAX")) != NULL)
        url_max = atoi (s);
    if ((s = getenv ("HARVEST_HOST_MAX")) != NULL)
        host_max = atoi (s);
    if ((s = getenv ("HARVEST_DEPTH_MAX")) != NULL)
        max_depth = atoi (s);
    if ((s = getenv ("HARVEST_DEPTH_CUR")) != NULL)
        cur_depth = atoi (s);
    if (url_max < 1)
        url_max = 250;                  /* default */
    if (host_max < 1)
        host_max = 1;                   /* default */
    if (max_depth < 1)
        max_depth = 0;                  /* default: no depth limit */
    host_filterfile = getenv ("HARVEST_HOST_FILTER");
    url_filterfile = getenv ("HARVEST_URL_FILTER");
    access_types = getenv ("HARVEST_ACCESS_TYPES");
    filename_candidates = getenv ("HARVEST_CANDIDATES");
    if ((s = getenv ("HARVEST_GATHERER_LOGFILE")) != (char *) NULL)
        logfp = fopen (s, "a+");
    if (logfp == (FILE *) NULL)
        logfp = stderr;
    init_log3 ("httpenum-breadth", logfp, stderr);
    init_url ();
    liburl_conform_rfc1738 = 1;
    filter_initialize ();
    Debug (42, 5, ("access_mask: %#02X\n", access_mask));

    /* Open GDBM databases to keep track of where we've been */
    urldb_filename = xstrdup (tempnam (NULL, "Hurl"));
    urldbf = gdbm_open (urldb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (urldbf == NULL) {
        log_errno (urldb_filename);
        fatal ("gdbm_open: %s: %s", urldb_filename,
            gdbm_strerror (gdbm_errno));
    }
    hostdb_filename = xstrdup (tempnam (NULL, "Hhost"));
    hostdbf = gdbm_open (hostdb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (hostdbf == NULL) {
        log_errno (hostdb_filename);
        fatal ("gdbm_open: %s: %s", hostdb_filename,
            gdbm_strerror (gdbm_errno));
    }
    md5db_filename = xstrdup (tempnam (NULL, "Hmd5"));
    md5dbf = gdbm_open (md5db_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (md5dbf == NULL) {
        log_errno (md5db_filename);
        fatal ("gdbm_open: %s: %s", md5db_filename,
            gdbm_strerror (gdbm_errno));
    }
    /* open not-visited file */
    if ((s = getenv ("HARVEST_NOT_VISITED_LOG")) != NULL)
        not_visited = fopen (s, "a+");
    if (not_visited)
        setbuf (not_visited, NULL);
}

/* Die gracefully */
static void
sigdie (x)
int x;
{
    int i, sum;

#ifdef USE_HOST_CACHE
    dump_host_cache (42, 9);
#endif
    if (urldbf != NULL)
        gdbm_close (urldbf);
    if (hostdbf != NULL)
        gdbm_close (hostdbf);
    if (md5dbf != NULL)
        gdbm_close (md5dbf);
    if (not_visited)
        fclose (not_visited);
    /* (void) unlink (urldb_filename); */
    crremove (urldb_filename);
    xfree (urldb_filename);
    /* (void) unlink (hostdb_filename); */
    crremove (hostdb_filename);
    xfree (hostdb_filename);
    /* (void) unlink (md5db_filename); */
    crremove (md5db_filename);
    xfree (md5db_filename);
    Log ("Statistics for URL '%s':\n", tree_root);
    while (depth_hist[max_depth] == 0)
        max_depth--;
    for (i = 0, sum = 0; i < 100; i++) {
        if (i > max_depth && depth_hist[i] == 0)
            break;
        Log (" Found %8d objects at depth %d\n", depth_hist[i], i);
        sum += depth_hist[i];
    }
    Log ("Total for URL '%s': %8d objects\n", tree_root, sum);
    Debug (42, 1, ("httpenum-breadth: exiting (signal %d)\n", x));
    exit (0);
}

/* ---------------------------------------------------------------------- */

static void
usage ()
{
    fprintf (stderr, "Usage: httpenum-breadth http-URL\n");
    exit (1);
}

int
main (argc, argv)
int argc;
char **argv;
{
    URL *up = NULL;
    list_t *l = NULL;
    char *url = NULL;
    int depth = 0;

    debug_init ();                      /* from $HARVEST_DEBUG */
    for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {
        if (!strncmp (*argv, "-D", 2)) {
            debug_flag (*argv);
        }
    }
    if (argc != 1)
        usage ();

    for (depth = 0; depth < 100; depth++)
        depth_hist[depth] = 0;

    signal (SIGTERM, sigdie);           /* Die gracefully */
    signal (SIGINT, sigdie);
    signal (SIGPIPE, sigdie);           /* Quickly clean up on broken pipe */

    initialize ();                      /* Initialize */
    Debug (42, 1, ("httpenum-breadth: Starting...\n"));

    /* Grab the RootNode URL from the command line */
    if ((up = url_open (*argv)) == NULL || up->type != URL_HTTP) {
        usage ();
    }
    /* Mark the RootNode */
    tree_root = xstrdup (up->url);
    Tail = &head;
    Log ("Processing URL: '%s'\n", tree_root);
    printf ("%s\n", up->url);           /* Print tree root */

    /*
     * helpdesk@ecs.soton.ac.uk -- Gatherer visits too many hosts
     * 6/3/96.  Make sure the first URL we start with is added
     * to the list of servers visited.
     */
    (void) visit_server (up);

    add_to_list (up->url, cur_depth);   /* start at depth = 0 */
    url_close (up);

    for (l = head; l; l = free_from_list (l)) {
        url = (char *) l->ptr;
        depth = l->depth;
        if (max_depth > 0 && depth > max_depth) {
            if (not_visited)
                fprintf (not_visited, "[DEPTH] %s\n", url);
            Debug (42, 1, ("Maximum Depth of %d Reached: %s\n",
                    max_depth, url));
            continue;
        }
        Debug (42, 1, ("Processing: [%2d] %s\n", depth, url));
        if ((up = url_open (url)) == NULL)
            continue;
        if (up->type != URL_HTTP) {
            Debug (42, 1, ("Examining: [%d:%d] %s\n", depth, max_depth, up->url));
            /* filter_selection() checks access_mask */
            if (!filter_selection (up) && (depth <= max_depth)) {
                /*
                 * Print URL with bogus MD5 to enumerate;
                 * flush to keep pipe moving.
                 * URL <tab> MD5
                 */
                fprintf (stdout, "%s\tDepth=%d:%d\n", up->url, depth, max_depth);
                fflush (stdout);
                Debug (42, 1, ("HTTPENUM Re-enumeration: %s\tDepth=%d:%d\n",
                        up->url, depth, max_depth));
            }
            url_close (up);
            continue;
        }
        /* search for more links from this one */
        if (http_enum (up, depth + 1) != 0 && depth < 100)
            /* Increase retrieved count only for new URLs - HrS */
            depth_hist[depth]++;
        url_close (up);
    }
    finish_url ();
    sigdie (0);
    /* NOTREACHED */
}
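Note on the worklist: the `list_t` queue helpers that main() drives (`head`, `Tail`, `add_to_list()`, `free_from_list()`) are defined earlier in the file and are not part of this excerpt. The sketch below is a hypothetical reconstruction, not the Harvest source: it assumes `add_to_list()` strdups the URL and appends at `*Tail`, and that `free_from_list()` releases the current node and returns the next one, which is consistent with how main() uses them and is what makes the traversal breadth-first (URLs found at depth N are all consumed before anything queued at depth N+1).

    /*
     * Hypothetical sketch of the FIFO worklist helpers assumed by main().
     * Field names ptr and depth match main()'s usage; the next field and
     * the helper bodies are assumptions for illustration only.
     */
    typedef struct _list_t {
        void *ptr;                      /* strdup'd URL */
        int depth;                      /* depth at which the URL was found */
        struct _list_t *next;
    } list_t;

    static list_t *head = NULL;         /* front of the queue */
    static list_t **Tail = NULL;        /* main() sets Tail = &head before use */

    static void
    add_to_list (url, depth)
    char *url;
    int depth;
    {
        list_t *l = (list_t *) xmalloc (sizeof (list_t));
        l->ptr = xstrdup (url);
        l->depth = depth;
        l->next = NULL;
        *Tail = l;                      /* append at the tail: FIFO order */
        Tail = &l->next;
    }

    static list_t *
    free_from_list (l)
    list_t *l;
    {
        list_t *next = l->next;
        xfree (l->ptr);
        xfree (l);
        return next;                    /* caller continues with the next entry */
    }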