/*
 * httpenum-depth.c - depth-limited HTTP URL enumerator (Harvest gatherer)
 */
#ifdef DONT_RETRY_FAILS mark_failed (up);#endif return (NULL); } if (up->md5 && md5_in_db (up->md5)) { /* Have we been here? */ Debug (42, 1, ("Already Visited MD5: %s\n", up->url)); return (NULL); } /* Remember that we've been here before */ if (up->md5) mark_retrieved (up); /* Are we dealing with HTML or SOIF, if not we can't get href links */ if (up->http_mime_hdr == NULL) { fprintf (stdout, "%s\t%s\n", up->url, up->md5); fflush (stdout); return (NULL); } /* If we were given an HTML document, then use HTMLurls to summarise it */ if (strstr (up->http_mime_hdr, "text/html") != NULL) { /* Extract the HREF's */ if (pipe (pipefds) < 0) { log_errno ("pipe"); return (NULL); } if ((pid = fork ()) < 0) { log_errno ("fork"); return (NULL); } if (pid == 0) { /* child: HTMLurls */ char *argv[64];#if 0 /* kjl/7mar2002 */ char *enum_url = NULL; enum_url = (char *) xmalloc (strlen (up->url) + 20); sprintf (enum_url, "ENUMERATOR_URL=%s", up->url); putenv (enum_url);#endif close (pipefds[0]); /* child wont read from pipe */ dup2 (pipefds[1], 1); /* stdout -> write:pipe */ close (pipefds[1]); /* close pipe, its now stdout */ argv[0] = "HTMLurls"; argv[1] = "--base-url"; argv[2] = up->url; argv[3] = up->filename; argv[4] = NULL; execvp (argv[0], argv); sprintf (buf, "execvp: %s", argv[0]); log_errno (buf); _exit (1); } close (pipefds[1]); /* parent wont write */ if ((fp = fdopen (pipefds[0], "r")) == NULL) { log_errno ("fdopen"); return (NULL); } /* * For each HREF pointer, convert it to a URL, and add it to * the list of URLs to return. 
*/ Tail = &head; while (fgets (buf, BUFSIZ, fp) != NULL) { if ((s = strrchr (buf, '\n')) != NULL) *s = '\0'; /* strip newline */ Debug (42, 1, ("httpenum-depth: Input: %s\n", buf)); /* Make a link in the list for this URL */ l = (list_t *) xmalloc (sizeof (list_t)); l->ptr = (void *) xstrdup (buf); l->next = (list_t *) NULL; *Tail = l; Tail = &(l->next); nurls++; } fclose (fp); close (pipefds[0]); /* Wait for HTMLurls to finish and decide on whether to index the page * according to its exit code */ if ((err = waitpid (pid, &status, 0)) != pid) { Debug (42, 1, ("WARNING: waiting for child %d got %d...\n", pid, err)); } /* If we had an exit code of 0 then index it, otherwise don't */ if (WEXITSTATUS (status) == 0) { fprintf (stdout, "%s\t%s\n", up->url, up->md5); fflush (stdout); } } else if (strstr (up->http_mime_hdr, "text/x-soif") != NULL) { /* If its a SOIF document then get it from the database */ char *refs = urldb_getrefs (up->url); char *ptr; char *abs_ref; Log ("%s\tNot Modified\n", up->url); /* Flush the URL */ fprintf (stdout, "%s\t%s\n", up->url, up->md5); fflush (stdout); Tail = &head; for (ptr = strtok (refs, "\n"); ptr; ptr = strtok (NULL, "\n")) { abs_ref = url_parse_relative (ptr, up->url); if (url_is_allowed (abs_ref)) { /* Make an absolute link in the list for this URL */ l = (list_t *) xmalloc (sizeof (list_t)); l->ptr = (void *) abs_ref; l->next = (list_t *) NULL; *Tail = l; Tail = &(l->next); nurls++; } else { xfree (abs_ref); } } xfree (refs); } else { fprintf (stdout, "%s\t%s\n", up->url, up->md5); fflush (stdout); return (NULL); } Debug (42, 1, ("Adding %d URLs from %s to workload\n", nurls, up->url)); return (head);}/* * process_url() - Retrieves the given URL (HTTP only), computes an MD5, * and extracts the list of HREF pointers within the HTML document. * If any of the HREF pointers are HTTP and the same host as the given * URL up, then it calls process_url() recursively. * up is closed on exit. 
 */
static void
process_url (up, depth)
URL *up;
int depth;
{
    list_t *head = NULL;        /* list of URLs extracted from `up` */
    list_t *l = NULL;
    list_t *next_l = NULL;
    char *url = NULL;
    URL *tup = NULL;

    /* Depth cut-off: max_depth == 0 means "no limit" (see initialize()).
     * Over-deep URLs are logged to the not-visited file and dropped. */
    if (max_depth > 0 && depth > (max_depth + 1)) {
        if (not_visited)
            fprintf (not_visited, "[DEPTH] %s\n", up->url);
        Debug (42, 1, ("Maximum Depth of %d Reached: %s\n", max_depth, up->url));
        url_close (up);
        return;
    }
    Debug (42, 1, ("Processing: [%2d] %s\n", depth - 1, up->url));

    /* Histogram of objects seen per depth; depth_hist has 100 slots
     * (see the zeroing loop in main() and the dump in sigdie()). */
    if (depth - 1 < 100)
        depth_hist[depth - 1]++;

    /* http_enum() retrieves the URL and returns the list of HREFs found
     * (NULL when there is nothing further to follow). */
    if ((head = http_enum (up)) == NULL) {
        url_close (up);
        return;
    }
    url_close (up);
    /*
     * Now, for each URL in the list, call process_url() if
     * the URL is an HTTP url and it is on the same host
     */
    for (l = head; l; l = next_l) {
        next_l = l->next;
        url = (char *) l->ptr;
        if ((url != NULL) && ((tup = url_open (url)) != NULL)) {
            if ((tup->type != URL_HTTP)) {
                /* Non-HTTP URL: not followed here; instead it is emitted on
                 * stdout for some other enumerator to pick up. */
                Debug (42, 1, ("Examining: %s\n", tup->url));
                /* filter_selection() checks access_mask */
                if (!filter_selection (tup)) {
                    int re_depth;
                    /* Print URL with bogus MD5 to enumerate; */
                    /* flush to keep pipe moving */
                    /* URL <tab> MD5 */
                    /* re_depth is the remaining depth budget passed along
                     * to the re-enumeration; clamped to at least 1. */
                    re_depth = max_depth - depth + 2;
                    if (re_depth < 1)
                        re_depth = 1;
                    fprintf (stdout, "%s\tDepth=%d\n", tup->url, re_depth);
                    fflush (stdout);
                    Debug (42, 1, ("HTTPENUM Re-enumeration: %s\tDepth=%d\n", tup->url, re_depth));
                }
                url_close (tup);
            } else {
                /* HTTP URL: recurse one level deeper. */
                process_url (tup, depth + 1);
            }
        }
        /* Free List Entry */
        xfree (l->ptr);
        l->ptr = (void *) NULL;
        l->next = (list_t *) NULL;
        xfree (l);
        l = (list_t *) NULL;
    }
}

/* ---------------------------------------------------------------------- */
/*
 * initialize() - Basic init routines
 *
 * Reads configuration from HARVEST_* environment variables, sets up
 * logging and the liburl layer, and opens three scratch GDBM databases
 * (URLs visited, hosts visited, MD5s seen) used to avoid revisiting.
 * Calls fatal() (does not return) if any database cannot be opened.
 */
static void
initialize ()
{
    char *s;
    extern int liburl_conform_rfc1738;
    FILE *logfp = NULL;
#ifdef USE_HOST_CACHE
    host_cache_init ();
#endif
    cur_depth = max_depth = url_max = host_max = 0;
    if ((s = getenv ("HARVEST_URL_MAX")) != NULL)
        url_max = atoi (s);
    if ((s = getenv ("HARVEST_HOST_MAX")) != NULL)
        host_max = atoi (s);
    if ((s = getenv ("HARVEST_DEPTH_MAX")) != NULL)
        max_depth = atoi (s);
    if ((s = getenv ("HARVEST_DEPTH_CUR")) != NULL)
        cur_depth = atoi (s);
    if (url_max < 1)
        url_max = 250;          /* default maximum number of URLs */
    if (host_max < 1)
        host_max = 1;           /* default maximum number of hosts */
    if (max_depth < 1)
        max_depth = 0;          /* 0 = unlimited depth (see process_url) */
    host_filterfile = getenv ("HARVEST_HOST_FILTER");
    url_filterfile = getenv ("HARVEST_URL_FILTER");
    access_types = getenv ("HARVEST_ACCESS_TYPES");
    /* Log to HARVEST_GATHERER_LOGFILE when set and openable,
     * otherwise fall back to stderr. */
    if ((s = getenv ("HARVEST_GATHERER_LOGFILE")) != (char *) NULL)
        logfp = fopen (s, "a+");
    if (logfp == (FILE *) NULL)
        logfp = stderr;
    init_log3 ("httpenum-depth", logfp, stderr);
    init_url ();
    liburl_conform_rfc1738 = 1;
    filter_initialize ();
    Debug (42, 5, ("access_mask: %#02X\n", access_mask));
    /* Open GDBM databases to keep track of where we've been.
     * NOTE(review): tempnam() is subject to filename races; the names are
     * removed again in sigdie() via crremove(). */
    urldb_filename = xstrdup (tempnam (NULL, "Hurl"));
    urldbf = gdbm_open (urldb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (urldbf == NULL) {
        log_errno (urldb_filename);
        fatal ("gdbm_open: %s: %s", urldb_filename, gdbm_strerror (gdbm_errno));
    }
    hostdb_filename = xstrdup (tempnam (NULL, "Hhost"));
    hostdbf = gdbm_open (hostdb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (hostdbf == NULL) {
        log_errno (hostdb_filename);
        fatal ("gdbm_open: %s: %s", hostdb_filename, gdbm_strerror (gdbm_errno));
    }
    md5db_filename = xstrdup (tempnam (NULL, "Hmd5"));
    md5dbf = gdbm_open (md5db_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (md5dbf == NULL) {
        log_errno (md5db_filename);
        fatal ("gdbm_open: %s: %s", md5db_filename, gdbm_strerror (gdbm_errno));
    }
    /* open not-visited file */
    if ((s = getenv ("HARVEST_NOT_VISITED_LOG")) != NULL)
        not_visited = fopen (s, "a+");
    if (not_visited)
        setbuf (not_visited, NULL);
}

/* Die gracefully
 *
 * Doubles as the SIGTERM/SIGINT/SIGPIPE handler (see main()) and as the
 * normal exit path (called with x == 0): closes and removes the scratch
 * GDBM databases, dumps the per-depth histogram, and exits.
 *
 * NOTE(review): as a signal handler this calls stdio/gdbm/exit, which are
 * not async-signal-safe; this is the program's longstanding behavior, so
 * left as-is — confirm before tightening.
 */
static void
sigdie (x)
int x;
{
    int i;
#ifdef USE_HOST_CACHE
    dump_host_cache (42, 9);
#endif
    if (urldbf != NULL)
        gdbm_close (urldbf);
    if (hostdbf != NULL)
        gdbm_close (hostdbf);
    if (md5dbf != NULL)
        gdbm_close (md5dbf);
    if (not_visited)
        fclose (not_visited);
    /* (void) unlink (urldb_filename); */
    crremove (urldb_filename);
    xfree (urldb_filename);
    /* (void) unlink (hostdb_filename); */
    crremove (hostdb_filename);
    xfree (hostdb_filename);
    /* (void) unlink (md5db_filename); */
    crremove (md5db_filename);
    xfree (md5db_filename);
    /* Report the depth histogram; stop at the first empty bucket
     * beyond the configured maximum depth. */
    for (i = 0; i < 100; i++) {
        if (i > max_depth && depth_hist[i] == 0)
            break;
        Log ("Found %8d objects at depth %d\n", depth_hist[i], i);
    }
    Debug (42, 1, ("httpenum-depth: exiting (signal %d)\n", x));
    exit (0);
}

/* ---------------------------------------------------------------------- */
/*
 * usage() - Print a usage message and exit non-zero.  Does not return.
 */
static void
usage ()
{
    fprintf (stderr, "Usage: httpenum-depth http-URL\n");
    exit (1);
}

/*
 * main() - Parse -D debug flags and the single root HTTP URL argument,
 * install signal handlers, initialize, then enumerate recursively from
 * the root.  Exits via sigdie(0); never returns normally.
 */
int
main (argc, argv)
int argc;
char **argv;
{
    URL *up = NULL;
    int depth = 0;
    debug_init ();              /* from $HARVEST_DEBUG */
    /* Consume leading -D debug options; stop at the first non-flag arg. */
    for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {
        if (!strncmp (*argv, "-D", 2)) {
            debug_flag (*argv);
        }
    }
    if (argc != 1)
        usage ();
    for (depth = 0; depth < 100; depth++)
        depth_hist[depth] = 0;
    signal (SIGTERM, sigdie);   /* Die gracefully */
    signal (SIGINT, sigdie);
    signal (SIGPIPE, sigdie);   /* Quickly clean up on broken pipe */
    initialize ();              /* Initialize */
    Debug (42, 1, ("httpenum-depth: Starting...\n"));
    /* Grab the RootNode URL from the command line; only HTTP is accepted. */
    if ((up = url_open (*argv)) == NULL || up->type != URL_HTTP) {
        usage ();
    }
    /* Mark the RootNode */
    tree_root = xstrdup (up->url);
    printf ("%s\n", up->url);   /* Print tree root */
    process_url (up, cur_depth + 1);    /* Do the Enumeration recursively */
    finish_url ();
    sigdie (0);                 /* normal exit path; does not return */
}
/* (removed: code-viewer UI chrome left over from a web scrape) */