📄 htmlurls.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
    if (strncasecmp(p, "url", 3) == 0) { /* there is URL */      q += 3;      while (isspace(*q)) q++;          /* skip spaces */      if (*q == '=') { 			/* if there is URL = , skip it */        q++;                            /* skip '=' */        while (isspace(*q)) q++;          /* skip spaces */	p = q;      } /* else - this is HTML 4.0 conformant (?) refresh with (relative) URL starting with letters 'url' */    }    q = xstrdup(p);                         /* copy name */    if ((p = strchr(q, '\"')) != NULL) {    /* terminate string on '"' */      *p = '\0';    } else {      if ((p = strchr(q, '>')) != NULL)     /* terminate string on '>' */      *p = '\0';    }    /* q should now contain the URL */    /* Relative URLs are processed by various browsers, so enable them */    if (base != (char *) NULL) {      v = q;      q = url_parse_relative(v, base);      xfree(v);    }    if (q != (char *) NULL) {             /* Just in case... */      add_buffer(urls, q, strlen(q));     /* Add URL to urls */      add_buffer(urls, "\n", 1);      xfree(q);    }    return;}/* *  process_framesrc() - Extracts the URL from the frame src tag. 
* *  DFM HACK - deals with: *      <FRAME SRC="url"> *      <FRAME SRC = "url"> *      <FRAME SRC = " url "> */void process_framesrc(s)     char *s;{    char *p, *q, *tmps, *v;    /* Find the SRC in the FRAME */    if ((tmps = strstr_icase(s, "src")) == NULL)        return;    /* Grab the URL from the SRC */    if ((p = strchr(tmps, '=')) != NULL) {        p++;                    /* skip '=' */        while (isspace(*p) || (*p == '\"'))            p++;                /* skip space '"'s */        q = xstrdup(p);          /* copy URL */        if ((p = strchr(q, '\"')) != NULL)      /* terminate string */            *p = '\0';        if ((p = strchr(q, ' ')) != NULL)       /* terminate string */            *p = '\0';        if (base != (char *) NULL) {            v = q;            q = url_parse_relative(v, base);            xfree(v);        }        if (q != (char *) NULL) {            add_buffer(urls, q, strlen(q));     /* Add URL to urls */            add_buffer(urls, "\n", 1);            xfree(q);        }        return;    }}/* *  process_anchor() - Extracts the URL from the anchor href tag. 
*  ALSO DOES SO FROM AREA TAG FOR CLIENT SIDE IMAGE MAPS DFM *  Will process these anchors (HREF is case-insenstive): * *      <A HREF="url"> *      <A HREF = "url"> *      <A HREF = " url "> *      <A HREF='url'> *      <A HREF = 'url'> *      <A HREF = ' url '> * * */void process_anchor(s)     char *s;{    char *p, *q, *tmps, *v;    int singlequote, doublequote;    singlequote = doublequote = 0;    /* Find the HREF in the anchor */    if ((tmps = strstr_icase(s, "href")) == NULL)	return;    /* Grab the URL from the HREF */    if ((p = strchr(tmps, '=')) != NULL) {	p++;			/* skip '=' */	while (isspace(*p) || (*p == '\"') || (*p == '\'')) {	    if(*p ==  '\"')		++doublequote;	    if(*p == '\'')		++singlequote;	    p++;		/* skip space '"'s */	}	q = xstrdup(p);		/* copy URL */	while ((p = strchr (q, '\n')) != NULL) {	    strcpy(p, p+1);	}	if (((p = strchr(q, '\"')) != NULL && doublequote == 1) || (((p = strchr(q, '\'')) != NULL) && singlequote == 1))	/* terminate string */	    *p = '\0';	if ((p = strchr(q, ' ')) != NULL)	/* terminate string */	    *p = '\0';	if (base != (char *) NULL) {	    v = q;	    q = url_parse_relative(v, base);	    xfree(v);	}	if (q != (char *) NULL) {	    add_buffer(urls, q, strlen(q));	/* Add URL to urls */	    add_buffer(urls, "\n", 1);	    xfree(q);	}	return;    }}/* *  read_file() - Reads the file fp into memory and returns a pointer to it. 
*/Buffer *read_file(fp)     FILE *fp;{    static Buffer *b;    char buf[BUFSIZ];    int nread;    b = create_buffer(BUFSIZ);    while ((nread = fread(buf, 1, BUFSIZ, fp)) > 0)	add_buffer(b, buf, nread);    return (b);}void process_base(buf)     char *buf;{    char *t = NULL;    char *p = NULL;    char *q = NULL;    /* Find the HREF in the anchor */    if ((t = strstr_icase(buf, "href")) == (char *) NULL)	return;    /* Grab the URL from the HREF */    if ((p = strchr(t, '=')) != NULL) {	p++;			/* skip '=' */	while (isspace(*p) || (*p == '\"'))	    p++;		/* skip space '"'s */	q = xstrdup(p);		/* copy URL */	if ((p = strchr(q, '\"')) != NULL)	/* terminate string */	    *p = '\0';	if ((p = strchr(q, ' ')) != NULL)	/* terminate string */	    *p = '\0';    }    if (q != (char *) NULL) {	xfree(base);	base = xstrdup(q);    }}void process_node(mp)     struct mark_up *mp;{    if ((mp->type == M_BASE) &&	(mp->start != NULL) && (strlen(mp->start) > 5))	process_base(mp->start);    if ((mp->type == M_ANCHOR) &&	(mp->start != NULL) && (strlen(mp->start) > 5))	process_anchor(mp->start);	/*DFM hack for FRAMES*/    if ((mp->type == M_FRAME) &&        (mp->start != NULL) && (strlen(mp->start) > 5))        process_framesrc(mp->start);        /*DFM hack for AREA (client side Image Map*/    if ((mp->type == M_AREA) &&        (mp->start != NULL) && (strlen(mp->start) > 5))        process_anchor(mp->start);    if ((mp->type == M_META) &&        (mp->start != NULL) && (strlen(mp->start) > 5)) {            process_metarobots(mp->start);            process_metapull(mp->start);	/* HS */        }}static void free_struct_markup(x)     struct mark_up *x;{    if (x->text)	free(x->text);    if (x->start)	free(x->start);    if (x->end)	free(x->end);    free(x);}int main(argc, argv)     int argc;     char *argv[];{    struct mark_up *HTMLParse();    struct mark_up *mp = NULL;    struct mark_up *walker = NULL;    struct mark_up *t = NULL;    Buffer *b = NULL;    FILE *fp = NULL;    FILE *logfp = 
NULL;    if (getenv("HARVEST_GATHERER_LOGFILE") != (char *) NULL)	logfp = fopen(getenv("HARVEST_GATHERER_LOGFILE"), "a+");    if (logfp == (FILE *) NULL)	logfp = stderr;    init_log3("HTMLurls", logfp, stderr);    debug_init();    for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {	if (!strncmp(*argv, "-D", 2)) {	    debug_flag(*argv);	} else if (!strcmp(*argv, "--base-url")) {	    argc--;	    argv++;	    if (argc < 1)		usage();	    base = xstrdup(*argv);	}    }    if (argc < 1)	usage();    /* Parse the HTML file */    if ((fp = fopen(*argv, "r")) == NULL) {	log_errno(*argv);	exit(1);    }#if 0 /* kjl/7mar2002 */    if (getenv("ENUMERATOR_URL"))	Url = xstrdup(getenv("ENUMERATOR_URL"));    if (Url == (char *) NULL)	Url = xstrdup(*argv);#endif    b = read_file(fp);    fclose(fp);    mp = HTMLParse(NULL, b->data);    free_buffer(b);    urls = create_buffer(BUFSIZ);    /* Extract important information from the parsed HTML */    for (walker = mp; walker != NULL;	t = walker, walker = walker->next, free_struct_markup(t))	process_node(walker);    if (RobotsFollow) fwrite(urls->data, 1, urls->length, stdout);    free_buffer(urls);    if (RobotsIndex)      exit(0);    else      exit(99); /* Pick a number, any number */}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -