📄 html-lax.sum.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "HTML-lax.sum.c,v 1.4 1996/01/16 08:44:27 duane Exp";/* *  HTML-lax.sum.c - Non-strict HTML summarizer * *  Usage: HTML-lax.sum [--url-only | --text-only] --body-text filename * *  Outputs SOIF * *  DEBUG: * *  Darren Hardy, hardy@cs.colorado.edu, April 1994 *  Duane Wessels, wessels@cs.colorado.edu, October 1995 * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  
We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include "HTML.h"#include "util.h"#include "template.h"#if 0 /* kjl/7mar2002 *//* Global */char *Url = NULL;#endif/* Local Variables */static int intype[64];static Buffer *citations, *keywords, *title, *headings, *author, *urls, *body_text; /* hw headings added */static int url_only = 0;static int text_only = 0;static int html_body_text = 0;static void usage(){	fprintf(stderr, "Usage: HTML-lax.sum [--url-only | --text-only] --body-text filename\n");	exit(1);}/* *  read_file() - Reads the file fp into memory and returns a pointer to it. */Buffer *read_file(fp)FILE *fp;{	static Buffer *b;	char buf[BUFSIZ];	int nread;	b = create_buffer(BUFSIZ);	while ((nread = fread(buf, 1, BUFSIZ, fp)) > 0)		add_buffer(b, buf, nread);	return(b);}/* *  process_anchor() - Extracts the URL from the anchor href tag. */void process_anchor(s)char *s;{	char *p, *q, *tmps = s;	while ((p = strchr(tmps, '=')) != NULL) {		/* Check to see if there is a ABCD= */		if (p - 4 <= tmps) {			tmps = ++p;			continue;		}		if (strncasecmp(p-4, "href", 4) != 0) { 	/* href? 
*/			tmps = ++p;			continue;		}		p++;					/* skip '=' */		while (isspace(*p) || (*p == '\"'))			p++;				/* skip space '"'s */		q = strdup(p);				/* copy URL */		if ((p = strchr(q, '\"')) != NULL)	/* terminate string */			*p = '\0';		if ((p = strchr(q, ' ')) != NULL)	/* terminate string */			*p = '\0';		add_buffer(urls, q, strlen(q));		/* Add URL to urls */		add_buffer(urls, "\n", 1);		xfree(q);		return;	}}void print_node(mp)struct mark_up *mp;{	printf("mp->type: %d\n", mp->type);	printf("mp->is_end: %d\n", mp->is_end);	printf("mp->text: %s\n", mp->text);	printf("mp->start: %s\n", mp->start);	printf("mp->end: %s\n", mp->end);	printf("\n");	fflush(stdout);}void process_node(mp)struct mark_up *mp;{	char *tptr;	int len;	if (mp->type < 0)		return;	if (mp->is_end) {		intype[mp->type]--;		return;	} else		intype[mp->type]++;	if (mp->text && strlen(mp->text) < 2)		return;	if (mp->start && strlen(mp->start) < 6)		return;	switch (mp->type) {	case M_NONE:		if (intype[M_TITLE]) {			add_buffer(title, mp->text, strlen(mp->text));			add_buffer(title, "\n", 1);		}		if (intype[M_STRONG]) {			add_buffer(keywords, mp->text, strlen(mp->text));			add_buffer(keywords, "\n", 1);		}		if (intype[M_CITATION]) {			add_buffer(citations, mp->text, strlen(mp->text));			add_buffer(citations, "\n", 1);		}		if (intype[M_ANCHOR]) {			add_buffer(keywords, mp->text, strlen(mp->text));			add_buffer(keywords, "\n", 1);		}		if (intype[M_ADDRESS]) {			add_buffer(author, mp->text, strlen(mp->text));			add_buffer(author, "\n", 1);		}		/* hw h1-h3 are now stored as headings */		if (intype[M_HEADER_1] || intype[M_HEADER_2] || intype[M_HEADER_3]) {			tptr = strdup (mp->text);			if (tptr) {				clean_white_space (tptr);				len = strlen (tptr);				if (len > 2) {					add_buffer(headings, tptr, len);					add_buffer(headings, "\n", 1);				}				free (tptr);			}		}		if (html_body_text && mp->text) {			tptr = strdup (mp->text);			if (tptr) {				clean_white_space (tptr);				len = strlen (tptr);				if (len > 2) {					
add_buffer (body_text, tptr, len);					add_buffer (body_text, "\n", 1);				}				free (tptr);			}		}		if (text_only)			puts(mp->text);		break;	case M_ANCHOR:		process_anchor(mp->start);	default:		/* do nothing */		break;	}}static void free_struct_markup(x)struct mark_up *x;{	if (x->text)	free(x->text);	if (x->start)	free(x->start);	if (x->end)	free(x->end);	free(x);}int main(argc, argv)int argc;char *argv[];{	struct mark_up *mp = NULL, *walker, *t, *HTMLParse();	Buffer *b;	FILE *fp;	init_log(NULL, stderr);	argv++;	argc--;	if (argc < 1)		usage();	if (!strcmp(*argv, "--url-only")) {		url_only = 1;		argv++;		argc--;		if (argc < 1)			usage();	}	if (!strcmp(*argv, "--text-only")) {		text_only = 1;		argv++;		argc--;		if (argc < 1)			usage();	}        if (!strcmp(*argv, "--body-text")) {                html_body_text = 1;                argv++;                argc--;                if (argc < 1)                        usage();        }	memset(intype, '\0', 64);	/* Parse the HTML file */	if ((fp = fopen(*argv, "r")) == NULL) {		log_errno(*argv);		exit(1);	}#if 0 /* kjl/7mar2002 */        if (getenv("ENUMERATOR_URL"))                Url = xstrdup(getenv("ENUMERATOR_URL"));        if (Url == (char *) NULL)                Url = xstrdup(*argv);#endif	b = read_file(fp);	fclose(fp);	mp = HTMLParse(NULL, b->data);	free_buffer(b);	author = create_buffer(BUFSIZ);	keywords = create_buffer(BUFSIZ);	citations = create_buffer(BUFSIZ);	urls = create_buffer(BUFSIZ);	title = create_buffer(BUFSIZ);	headings = create_buffer(BUFSIZ); /* hw */	if (html_body_text)		body_text = create_buffer(BUFSIZ);	/* Extract important information from the parsed HTML */	for (walker = mp; walker != NULL;	     t = walker, walker = walker->next, free_struct_markup(t))		process_node(walker);	if (url_only) {		fwrite(urls->data, 1, urls->length, stdout);		exit(0);	}	if (text_only) {		exit(0);	}	if (author->length > 0) {		printf("%s{%u}:\t", T_AUTHOR, author->length); /* Author */		fwrite(author->data, 1, 
author->length, stdout);	}	if (keywords->length > 0) {		printf("%s{%u}:\t", T_KEYS, keywords->length); /* Keywords */		fwrite(keywords->data, 1, keywords->length, stdout);	}	if (urls->length > 0) {		printf("%s{%u}:\t", T_UREFS, urls->length); /* URL-References */		fwrite(urls->data, 1, urls->length, stdout);	}	if (citations->length > 0) {		printf("Citations{%u}:\t", citations->length);		fwrite(citations->data, 1, citations->length, stdout);	}	if (title->length > 0) {		printf("%s{%u}:\t", T_TITLE, title->length); /* Title */		fwrite(title->data, 1, title->length, stdout);	}	/* hw */	if (headings->length > 0) {		printf("%s{%u}:\t", T_HEADINGS, headings->length); /* Headings */		fwrite(headings->data, 1, headings->length, stdout);	}	if (html_body_text && body_text->length > 0) {		/* printf("body{%u}:\t", body_text->length); */		printf("%s{%u}:\t", T_FULLTEXT, body_text->length); /* Full-Text */		fwrite(body_text->data, 1, body_text->length, stdout);	}	exit(0);}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -