⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 displayobject.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "$Id: DisplayObject.c,v 2.5 1997/12/12 16:02:09 sxw Exp $";/* *  DisplayObject.c - httpd-based CGI program that HTML-ifies a *  SOIF Object for clients. * *  Usage:  Run from cgi-bin directory under httpd server * *  Supports	'object'	name of URL data relative to CGI server *		'attribute'	return data associated with the attribute only *		'type'		MIME data type for attribute data * *  Darren Hardy, University of Colorado - Boulder, July 1994 * * *  DEBUG: none *  AUTHOR: Harvest derived * *  Harvest Indexer http://harvest.sourceforge.net/ *  ----------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail lee@arco.de if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. 
*          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. 
* *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */#include <stdio.h>#include <string.h>#include <stdlib.h>#include <unistd.h>#include <errno.h>#include <time.h>#include <memory.h>#include <signal.h>#include <sys/types.h>#include <sys/socket.h>#include <sys/time.h>#include <sys/stat.h>#include "util.h"#include "url.h"#include "template.h"#include "autoconf.h"/* *  USE_DIRECTORY_LISTING - For Directory-Listing Broker support */#ifndef USE_DIRECTORY_LISTING#define USE_DIRECTORY_LISTING#endif/* from util.c */void getword(), unescape_url(), plustospace(), nltospace();char x2c();void *xmalloc ();/* Local functions */void do_shutdown();char *get_time();static void htmlify_file();#define LF 10#define ENTSIZ 64typedef struct {	char *name;	char *val;} entry;entry *entries;char *errmsg;char *tfile = NULL;char *select_attribute = NULL;char *data_type = NULL;int wrote_http_header = 0;#ifdef USE_DIRECTORY_LISTINGstatic void do_directory_listing();#endif/* *  fatal_http() - Print error message to user via stdout; and nicely log *  the error to stderr where httpd will save it in error_log. 
*/void fatal_http(s)char *s;{	if (!wrote_http_header)		printf("Content-type: text/html\r\n");	fprintf(stdout, "\r\n%s\n", s);	fflush(stdout);#ifdef DO_STDERR_LOGGING	fprintf(stderr, "[%s] DisplayObject: %s", get_time(), s);	fflush(stderr);#endif	do_shutdown(1);}void do_shutdown(sig)int sig;{	if (tfile != NULL)		(void)unlink(tfile);	_exit(sig);}/* get_time from NCSA httpd */char *get_time(){	time_t t;	char *time_string;	t = time(NULL);	time_string = ctime(&t);	time_string[strlen(time_string) - 1] = '\0';	return (time_string);}int main(argc, argv)int argc;char *argv[];{	int x, m = 0;	char *cl = getenv("QUERY_STRING");	char *gl = getenv("REQUEST_METHOD");	char *sh = getenv("SERVER_NAME");	char *sp = getenv("SERVER_PORT");	char *object = (char *) xmalloc (BUFSIZ);/* Holds the object pathname */	char *objurl = (char *) xmalloc (BUFSIZ);	URL *up = NULL;	extern int use_local_cache;	/* set up signal handlers */	(void) signal(SIGHUP, do_shutdown);	(void) signal(SIGINT, do_shutdown);	(void) signal(SIGQUIT, do_shutdown);	(void) signal(SIGTERM, do_shutdown);	(void) signal(SIGALRM, do_shutdown);	/* init data structures */	errmsg = (char *) xmalloc (BUFSIZ);	entries = (entry *) xmalloc (ENTSIZ * sizeof (entry));	for (x=0; x<ENTSIZ; x++) {		entries[x].name = (char *) xmalloc (BUFSIZ);		entries[x].val	= (char *) xmalloc (BUFSIZ);	}	/* validity checks */	if (!gl || (memcmp(gl, "GET", 3))) {		fatal_http("Reference with a METHOD of GET.");	} else if (cl == NULL) {		fatal_http("No query information to decode.");	} else if (sp == NULL || sh == NULL) {		fatal_http("Invalid CGI interface! 
No SERVER_NAME/SERVER_PORT");	}	/* parse CGI data */	for (x = 0; cl[0] != '\0' && x < ENTSIZ; x++) {		m = x;		getword(entries[x].val, cl, '&');		plustospace(entries[x].val);		unescape_url(entries[x].val);		nltospace(entries[x].val);		getword(entries[x].name, entries[x].val, '=');	}	object[0] = '\0';	for (x = 0; x <= m && x < ENTSIZ; x++) {		if (memcmp(entries[x].name, "object", 6) == 0)			strcpy(object, entries[x].val);		else if (memcmp(entries[x].name, "type", 6) == 0)			data_type = strdup(entries[x].val);		else if (memcmp(entries[x].name, "attribute", 6) == 0)			select_attribute = strdup(entries[x].val);	}	/* validity checks */	if (*object == '\0') {		fatal_http("Illegal DisplayObject usage.\n");	}	if (*object == '/')		sprintf(objurl, "http://%s:%s%s", sh, sp, object);	else		sprintf(objurl, "http://%s:%s/%s", sh, sp, object);	use_local_cache = 0;	init_url();	if ((up = url_open(objurl)) == NULL || url_retrieve(up)) {		sprintf(errmsg, "Cannot access object: '%s'\n", objurl);		fatal_http(errmsg);	}	htmlify_file(up->filename);	url_close(up);	finish_url();	fflush(stdout);	do_shutdown(0);}static void htmlify_file(filename)char *filename;{	FILE *fp, *ofp;	char buf[BUFSIZ];	int n;	Template *t;	AVList *walker;	AVPair *avp;	struct stat sb;	if ((fp = fopen(filename, "r")) == NULL) {		sprintf(errmsg, "fopen: %s: %s\n", filename, strerror(errno));		fatal_http(errmsg);	}	tfile = tempnam(NULL, "dobj");	if ((ofp = fopen(tfile, "w")) == NULL) {		sprintf(errmsg, "fopen: %s: %s\n", tfile, strerror(errno));		fatal_http(errmsg);	}	init_parse_template_file(fp);	if ((t = parse_template()) == NULL) {		fclose(ofp);		sprintf(errmsg, "SOIF Object: '%s' is corrupt.\n", filename);		fatal_http(errmsg);	}	if (select_attribute) {		if ((avp = extract_AVPair(t->list, select_attribute)) != NULL) {			fwrite(avp->value, 1, avp->vsize, ofp);		}	} else {		/* Now HTML-ify the object */		fprintf(ofp, "<HTML>\n");		fprintf(ofp, "<HEAD>\n");		fprintf(ofp, "<TITLE>SOIF Object for: %s</TITLE>\n", t->url);		
fprintf(ofp, "</HEAD>\n");		fprintf(ofp, "<BODY>\n");#ifdef USE_DIRECTORY_LISTING		do_directory_listing(t, ofp);#endif		fprintf(ofp, "<H1><a href=\"/Harvest/brokers/soifhelp.html\">SOIF</a> Object for: %s</H1>\n", t->url);		fprintf(ofp, "<HR><PRE>\n");		fprintf(ofp, "@%s { <a href=\"%s\">%s</a>\n", t->template_type,			t->url, t->url);		for (walker = t->list; walker; walker = walker->next) {			if (walker->data->vsize == 0)				continue;			fprintf(ofp, "<STRONG>%s</STRONG>{%u}:\t",				walker->data->attribute,				(unsigned int) walker->data->vsize);#ifndef DONT_FORMAT_KEYWORDS			if (strstr(walker->data->attribute, "eywords"))				fprintf(ofp, "</PRE><P>\n");#endif			fwrite(walker->data->value, 1, walker->data->vsize, ofp);			fprintf(ofp, "\n");#ifndef DONT_FORMAT_KEYWORDS			if (strstr(walker->data->attribute, "eywords"))				fprintf(ofp, "</P><PRE>\n");#endif		}		fprintf(ofp, "}\n");		fprintf(ofp, "</PRE>\n");		fprintf(ofp, "<HR>\n");		fprintf(ofp, "<ADDRESS>This content summary was generated by the <a href=\"http://harvest.sourceforge.net/\">Harvest</a> system.</ADDRESS>\n");		fprintf(ofp, "</BODY>\n");		fprintf(ofp, "</HTML>\n");	}	free_template(t);	finish_parse_template();	fclose(fp);	fclose(ofp);	if (stat(tfile, &sb) < 0) {		sprintf(errmsg, "stat: %s: %s\n", tfile, strerror(errno));		fatal_http(errmsg);	}	/* write http header */	if (data_type != NULL)		printf("Content-type: %s\r\n", data_type);	else		printf("Content-type: text/html\r\n");	printf("Content-length: %ld\r\n\r\n", sb.st_size);	wrote_http_header = 1;	if ((fp = fopen(tfile, "r")) == NULL) {		sprintf(errmsg, "fopen: %s: %s\n", tfile, strerror(errno));		fatal_http(errmsg);	}	while ((n = fread(buf, 1, BUFSIZ - 1, fp)) > 0) {		alarm(300);	/* kill if write doesn't work in 5 minutes */		fwrite(buf, 1, n, stdout);	}	fclose(fp);}#ifdef USE_DIRECTORY_LISTINGstatic void do_directory_listing(t, ofp)Template *t;FILE *ofp;{	AVPair *catavp, *pnavp;	char *buf, *s;	int i = 0;	catavp = extract_AVPair(t->list, "category");	
pnavp = extract_AVPair(t->list, "phone-numbers");	if (catavp == NULL || pnavp == NULL)		return;	fprintf(ofp, "<H2>Category: %s</H2>\n", catavp->value);	fprintf(ofp, "<H2>Phone Numbers</H2>\n");	fprintf(ofp, "<UL>\n");	buf = strdup(pnavp->value);	s = strtok(buf, "\n");	while (1) {		if (s == NULL)			break;		if (++i > 5) {			fprintf(ofp, "<HR>\n");			i = 1;		}		fprintf(ofp, "<LI> %s\n", s);		s = strtok(NULL, "\n");	}	fprintf(ofp, "</UL><HR>\n\n");}#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -