📄 htmlurls.c
字号:
if (strncasecmp(p, "url", 3) == 0) { /* there is URL */ q += 3; while (isspace(*q)) q++; /* skip spaces */ if (*q == '=') { /* if there is URL = , skip it */ q++; /* skip '=' */ while (isspace(*q)) q++; /* skip spaces */ p = q; } /* else - this is HTML 4.0 conformant (?) refresh with (relative) URL starting with letters 'url' */ } q = xstrdup(p); /* copy name */ if ((p = strchr(q, '\"')) != NULL) { /* terminate string on '"' */ *p = '\0'; } else { if ((p = strchr(q, '>')) != NULL) /* terminate string on '>' */ *p = '\0'; } /* q should now contain the URL */ /* Relative URLs are processed by various browsers, so enable them */ if (base != (char *) NULL) { v = q; q = url_parse_relative(v, base); xfree(v); } if (q != (char *) NULL) { /* Just in case... */ add_buffer(urls, q, strlen(q)); /* Add URL to urls */ add_buffer(urls, "\n", 1); xfree(q); } return;}/* * process_framesrc() - Extracts the URL from the frame src tag. * * DFM HACK - deals with: * <FRAME SRC="url"> * <FRAME SRC = "url"> * <FRAME SRC = " url "> */void process_framesrc(s) char *s;{ char *p, *q, *tmps, *v; /* Find the SRC in the FRAME */ if ((tmps = strstr_icase(s, "src")) == NULL) return; /* Grab the URL from the SRC */ if ((p = strchr(tmps, '=')) != NULL) { p++; /* skip '=' */ while (isspace(*p) || (*p == '\"')) p++; /* skip space '"'s */ q = xstrdup(p); /* copy URL */ if ((p = strchr(q, '\"')) != NULL) /* terminate string */ *p = '\0'; if ((p = strchr(q, ' ')) != NULL) /* terminate string */ *p = '\0'; if (base != (char *) NULL) { v = q; q = url_parse_relative(v, base); xfree(v); } if (q != (char *) NULL) { add_buffer(urls, q, strlen(q)); /* Add URL to urls */ add_buffer(urls, "\n", 1); xfree(q); } return; }}/* * process_anchor() - Extracts the URL from the anchor href tag. 
*  ALSO DOES SO FROM AREA TAG FOR CLIENT SIDE IMAGE MAPS DFM
 *  Will process these anchors (HREF is case-insensitive):
 *
 *      <A HREF="url">
 *      <A HREF = "url">
 *      <A HREF = " url ">
 *      <A HREF='url'>
 *      <A HREF = 'url'>
 *      <A HREF = ' url '>
 *
 *  s is the tag text handed over by process_node() (mp->start).  The value
 *  after the first '=' following "href" is copied, newlines are removed
 *  from the copy, and it is cut at the closing quote (only when exactly one
 *  opening quote of that kind was skipped) and at the first blank.  The
 *  result is passed through url_parse_relative() when the global base is
 *  set and appended to the global urls buffer followed by a newline.
 *
 *  Nothing is emitted when "href" or '=' cannot be found, or when
 *  url_parse_relative() yields NULL.
 */
void process_anchor(s)
     char *s;
{
    char *p, *q, *tmps, *v;
    char *src, *dst;
    int singlequote, doublequote;

    singlequote = doublequote = 0;

    /* Find the HREF in the anchor */
    if ((tmps = strstr_icase(s, "href")) == NULL)
        return;

    /* Grab the URL from the HREF */
    if ((p = strchr(tmps, '=')) == NULL)
        return;
    p++;                        /* skip '=' */

    /*
     * Skip leading blanks and opening quotes, counting each kind of quote.
     * The cast matters: handing a negative char (any byte >= 0x80 where
     * char is signed) to isspace() is undefined behaviour.
     */
    while (isspace((unsigned char) *p) || (*p == '\"') || (*p == '\'')) {
        if (*p == '\"')
            ++doublequote;
        if (*p == '\'')
            ++singlequote;
        p++;
    }

    q = xstrdup(p);             /* private, writable copy of the URL */

    /*
     * Remove every '\n' from the copy.  This is done as one in-place
     * compaction pass; strcpy(p, p + 1) must not be used here because
     * strcpy() is undefined when source and destination overlap.
     */
    for (src = dst = q; *src != '\0'; src++) {
        if (*src != '\n')
            *dst++ = *src;
    }
    *dst = '\0';

    /*
     * Cut at the closing quote.  A '"' is honoured only when exactly one
     * opening '"' was skipped; otherwise (or when no '"' is left) a '\''
     * is honoured when exactly one opening '\'' was skipped.
     */
    p = NULL;
    if (doublequote == 1)
        p = strchr(q, '\"');
    if (p == NULL && singlequote == 1)
        p = strchr(q, '\'');
    if (p != NULL)
        *p = '\0';

    if ((p = strchr(q, ' ')) != NULL)   /* cut at first blank */
        *p = '\0';

    if (base != (char *) NULL) {
        /* the unresolved copy is released once it has been resolved */
        v = q;
        q = url_parse_relative(v, base);
        xfree(v);
    }
    if (q != (char *) NULL) {
        add_buffer(urls, q, strlen(q));     /* Add URL to urls */
        add_buffer(urls, "\n", 1);
        xfree(q);
    }
}

/*
 *  read_file() - Reads the file fp into memory and returns a pointer to it.
*/Buffer *read_file(fp) FILE *fp;{ static Buffer *b; char buf[BUFSIZ]; int nread; b = create_buffer(BUFSIZ); while ((nread = fread(buf, 1, BUFSIZ, fp)) > 0) add_buffer(b, buf, nread); return (b);}void process_base(buf) char *buf;{ char *t = NULL; char *p = NULL; char *q = NULL; /* Find the HREF in the anchor */ if ((t = strstr_icase(buf, "href")) == (char *) NULL) return; /* Grab the URL from the HREF */ if ((p = strchr(t, '=')) != NULL) { p++; /* skip '=' */ while (isspace(*p) || (*p == '\"')) p++; /* skip space '"'s */ q = xstrdup(p); /* copy URL */ if ((p = strchr(q, '\"')) != NULL) /* terminate string */ *p = '\0'; if ((p = strchr(q, ' ')) != NULL) /* terminate string */ *p = '\0'; } if (q != (char *) NULL) { xfree(base); base = xstrdup(q); }}void process_node(mp) struct mark_up *mp;{ if ((mp->type == M_BASE) && (mp->start != NULL) && (strlen(mp->start) > 5)) process_base(mp->start); if ((mp->type == M_ANCHOR) && (mp->start != NULL) && (strlen(mp->start) > 5)) process_anchor(mp->start); /*DFM hack for FRAMES*/ if ((mp->type == M_FRAME) && (mp->start != NULL) && (strlen(mp->start) > 5)) process_framesrc(mp->start); /*DFM hack for AREA (client side Image Map*/ if ((mp->type == M_AREA) && (mp->start != NULL) && (strlen(mp->start) > 5)) process_anchor(mp->start); if ((mp->type == M_META) && (mp->start != NULL) && (strlen(mp->start) > 5)) { process_metarobots(mp->start); process_metapull(mp->start); /* HS */ }}static void free_struct_markup(x) struct mark_up *x;{ if (x->text) free(x->text); if (x->start) free(x->start); if (x->end) free(x->end); free(x);}int main(argc, argv) int argc; char *argv[];{ struct mark_up *HTMLParse(); struct mark_up *mp = NULL; struct mark_up *walker = NULL; struct mark_up *t = NULL; Buffer *b = NULL; FILE *fp = NULL; FILE *logfp = NULL; if (getenv("HARVEST_GATHERER_LOGFILE") != (char *) NULL) logfp = fopen(getenv("HARVEST_GATHERER_LOGFILE"), "a+"); if (logfp == (FILE *) NULL) logfp = stderr; init_log3("HTMLurls", logfp, stderr); 
debug_init(); for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) { if (!strncmp(*argv, "-D", 2)) { debug_flag(*argv); } else if (!strcmp(*argv, "--base-url")) { argc--; argv++; if (argc < 1) usage(); base = xstrdup(*argv); } } if (argc < 1) usage(); /* Parse the HTML file */ if ((fp = fopen(*argv, "r")) == NULL) { log_errno(*argv); exit(1); }#if 0 /* kjl/7mar2002 */ if (getenv("ENUMERATOR_URL")) Url = xstrdup(getenv("ENUMERATOR_URL")); if (Url == (char *) NULL) Url = xstrdup(*argv);#endif b = read_file(fp); fclose(fp); mp = HTMLParse(NULL, b->data); free_buffer(b); urls = create_buffer(BUFSIZ); /* Extract important information from the parsed HTML */ for (walker = mp; walker != NULL; t = walker, walker = walker->next, free_struct_markup(t)) process_node(walker); if (RobotsFollow) fwrite(urls->data, 1, urls->length, stdout); free_buffer(urls); if (RobotsIndex) exit(0); else exit(99); /* Pick a number, any number */}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -