⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gather.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "$Id: gather.c,v 2.4 2000/01/21 17:37:33 sxw Exp $";/* *  gather.c - Simple, portable client to retrieve data from a Gatherer, *  and print it to stdout.  Uses the GNU zip compression to transmit *  the data over the network.  Must have 'gzip' in your path. * *  Usage: gather [-info | -nocompress] hostname port [timestamp] * *  DEBUG: none *  AUTHOR: Harvest derived * *  Harvest Indexer http://harvest.sourceforge.net/ *  ----------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail lee@arco.de if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. 
Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. 
* *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <string.h>#include <errno.h>#include <signal.h>#include <ctype.h>#include <sys/types.h>#include <sys/param.h>#include <netdb.h>#include <netinet/in.h>#include "util.h"/* Wait 5 minutes for a response from the remote Gatherer */#define WAIT_MAX_SECS	300static void sigdie(int unused){	fprintf(stderr, "gather: Timed out after waiting %d seconds for a response from the remote Gatherer.\n", WAIT_MAX_SECS);	exit(1);}static void usage(){	fprintf(stderr, "Usage: gather [-info | -nocompress] hostname port [timestamp]\n");	exit(1);}#define grab_data_default(x,y,z)	grab_data_v0_2(x,y,z)static int do_compress = 1;static int do_info = 0;/* Local functions */static void grab_data_v0_2();static void die();static void send_msg();static void gzip_message();static void do_plain_text_retrieve();static void do_compressed_retrieve();static struct hostent *xgethostbyname();static int check_for_gzip();int main(argc, argv)     int argc;     char *argv[];{	FILE *ifp = NULL;	FILE *ofp = NULL;	char *p = NULL;	char *server_host = NULL;	char *this_host = xstrdup(getfullhostname());	int s;	int server_port;	int timestamp = 0;	int version_major;	int version_minor;	int version_minor_minor = 0;	static char that_version[BUFSIZ];	static char buf[BUFSIZ];	static char that_host[BUFSIZ];	static char xbuf[4096];	struct hostent *hp = NULL;	struct sockaddr_in sa;	signal(SIGALRM, sigdie);	signal(SIGPIPE, SIG_DFL);	/* parent process may have ignored */	alarm(0);	do_compress = 1;	if (argc > 1 && !strcmp(argv[1], "-nocompress")) {		argc--;		argv++;		do_compress = 0;	}	if (argc > 1 && !strcmp(argv[1], "-info")) {		argc--;		argv++;		do_info = 1;	}	if (argc < 3)		usage();	server_host = xstrdup(argv[1]);	server_port = atoi(argv[2]);	if (argc == 4)		timestamp = atoi(argv[3]);	if (timestamp < 0 || server_port < 0)		usage();	for (p = argv[2]; *p; p++)		if (!isdigit(*p))			usage();	if (do_compress && check_for_gzip() != 0) {		fprintf(stderr, "gather: 
WARNING: 'gzip' not found.  Compression disabled.\n");		do_compress = 0;	}	errno = 0;	/* Find out who they are */	if ((hp = xgethostbyname(server_host)) == NULL) {		if (errno == 0)			fprintf(stderr, "gather: %s: Host unknown.\n",			    server_host);		else			perror(server_host);		exit(1);	}	/* Set up the Destination Address */	memset(&sa, '\0', sizeof(sa));	memcpy(&sa.sin_addr, hp->h_addr, hp->h_length);	sa.sin_family = AF_INET;	sa.sin_port = (unsigned short) htons(server_port);	/* Create a socket, and connect to the remote host */	if ((s = socket(PF_INET, SOCK_STREAM, 0)) < 0) {		perror("gather: socket");		exit(1);	}	if (connect(s, (struct sockaddr *) &sa, sizeof(sa)) < 0) {		perror("gather: connect");		exit(1);	}	/* Use buffered I/O to make sure we get the right number of bytes */	if ((ifp = fdopen(s, "r")) == NULL) {		perror("gather: fdopen");		exit(1);	}	if ((ofp = fdopen(s, "w")) == NULL) {		perror("gather: fdopen");		exit(1);	}	/* Grab welcome message */	alarm(WAIT_MAX_SECS);	if (fgets(buf, BUFSIZ, ifp) == NULL) {		fprintf(stderr, "gather: Did not receive welcome message.\n");		die(ofp);	}	alarm(0);	if (strncmp(buf, "000", 3) != 0) {	/* Check OK */		fprintf(stderr, "gather: Couldn't connect to %s:%d.\n",		    server_host, server_port);		fprintf(stderr, "gather: Did not receive handshake: %s", buf);		die(ofp);	}	if (sscanf(buf, "000 - HELLO %s %s", that_version, that_host) != 2) {		fprintf(stderr, "gather: Cannot parse handshake: %s\n", buf);		die(ofp);	}	version_major = version_minor = version_minor_minor = 0;	if ((sscanf(that_version, "%d.%d.%d", &version_major, &version_minor,		    &version_minor_minor) != 3) ||	    (sscanf(that_version, "%d.%d", &version_major, &version_minor) != 2)) {		fprintf(stderr, "gather: Cannot parse version number: %s\n",		    that_version);		die(ofp);	}	/* Say HELLO */	sprintf(buf, "HELLO %s\n", this_host);	send_msg(buf, ofp);	alarm(WAIT_MAX_SECS);	if (fgets(buf, BUFSIZ, ifp) == NULL) {		fprintf(stderr, "gather: Did not receive 
HELLO ack.\n");		die(ofp);	}	alarm(0);	if (strncmp(buf, "100", 3)) {	/* Check OK */		fprintf(stderr, "gather: Received: %s", buf);		/* don't exit, not a fatal error */	}	if (do_info) {		if (!(version_major > 0 ||			version_minor > 2 ||			version_minor_minor > 2)) {			send_msg("QUIT\n", ofp);			(void) close(fileno(ofp));			exit(0);		}		send_msg("INFO\n", ofp);		while (fgets(xbuf, 4096, ifp)) {			if (!strncmp(xbuf, "600", 3)) {				send_msg("QUIT\n", ofp);				(void) close(fileno(ofp));				exit(0);			}			if (!strncmp(xbuf, "601", 3) ||			    !strncmp(xbuf, "602", 3)) {				die(ofp);			}			fputs(xbuf, stdout);		}		die(ofp);	}	if (do_compress) {		/* Set mode to compressed data */		sprintf(buf, "SET compression\n");		send_msg(buf, ofp);		if (fgets(buf, BUFSIZ, ifp) == NULL) {			fprintf(stderr, "gather: Did not receive SET COMPRESSION ack.\n");			die(ofp);		}		if (strncmp(buf, "500", 3)) {			fprintf(stderr, "gather: Received: %s", buf);			die(ofp);		}	}	/* Issue SEND-UPDATE command */	sprintf(buf, "SEND-UPDATE %d\n", timestamp);	send_msg(buf, ofp);	if (fgets(buf, BUFSIZ, ifp) == NULL) {		fprintf(stderr, "gather: Did not receive SEND-UPDATE %d ack.\n", timestamp);		die(ofp);	}	if (strncmp(buf, "400", 3) != 0) {	/* Check OK */		fprintf(stderr, "gather: Received: %s", buf);		die(ofp);	}#ifdef DEBUG	fprintf(stderr, "Server is using protocol version %d.%d.x\n",	    version_major, version_minor);#endif	if (version_major == 0 && version_minor == 2)		grab_data_v0_2(ifp, ofp, this_host);	else		grab_data_default(ifp, ofp, this_host);	/* Quit */	fprintf(ofp, "QUIT\n");	fflush(ofp);	(void) close(s);	/* close the socket, and exit */	exit(0);}/* *  grab_data() - Version 0.2.x of the protocol.  Sets compression *  and then feeds all of the xfer'd data to gzip.  A closed socket is *  the end-of-transmission. 
*/static void grab_data_v0_2(ifp, ofp, this_host)     FILE *ifp, *ofp;     char *this_host;{	if (do_compress)		do_compressed_retrieve(ifp, ofp);	else		do_plain_text_retrieve(ifp, ofp);}/* *  plain-text retrieve is not great because you have to check each line *  for the 499 tag. */static void do_plain_text_retrieve(ifp, ofp)     FILE *ifp;     FILE *ofp;{	static char buf[BUFSIZ+1];	char *pos;	while (fgets(buf, BUFSIZ+1, ifp)) {		if (!strncmp(buf, "499 - Sent", 10))			return;		/* try to find the length of data we read in.  Note that		 * use of fgets() here is bad because we may be reading		 * arbitrary binary data (including NULLs) in a SOIF value. */		while((pos = memchr(buf, '\n', BUFSIZ)) == NULL) {			fwrite(buf, 1, BUFSIZ, stdout);			fgets(buf, BUFSIZ+1, ifp);		}		fwrite(buf, 1, (pos - buf) + 1, stdout);	}}/* *  do_compressed_retrieve - Retrieves GNU zip'ed data from 'ifp', *  then writes the uncompressed data to stdout.  ofp is the *  socket back to the Gatherer. */static void do_compressed_retrieve(ifp, ofp)     FILE *ifp;     FILE *ofp;{	int pid;	int pfd[2];	int n;	static char buf[BUFSIZ];	if (pipe(pfd) < 0) {		perror("gather: pipe");		die(ofp);	}	if ((pid = fork()) < 0) {		perror("gather: fork");		die(ofp);	}	if (pid == 0) {		/* CHILD */		close(pfd[1]);		dup2(pfd[0], 0);	/* comp-read-pipe -> stdin */		execlp("gzip", "gzip", "-dc", NULL);		perror("gather: execlp: gzip");		gzip_message();		_exit(1);	}	/* PARENT */	/* Feed GNU gzip the data to uncompress */	close(pfd[0]);	alarm(WAIT_MAX_SECS);	while ((n = fread(buf, 1, BUFSIZ - 1, ifp)) > 0) {		alarm(0);		if (write(pfd[1], buf, n) < 0) {			perror("gather: write");			die(ofp);		}		alarm(WAIT_MAX_SECS);	}	close(pfd[1]);	(void) waitpid(pid, NULL, 0);}static struct hostent *xgethostbyname(name)     char *name;{	struct hostent *hp = NULL;	static char x[64];	unsigned long ip;	if (sscanf(name, "%[0-9].%[0-9].%[0-9].%[0-9]%s", x, x, x, x, x) == 4) {		ip = inet_addr(name);		hp = gethostbyaddr((char *) &ip, 4, AF_INET);		
if (!hp) {	/* special hack for DNS's which don't work */			/* unknown if this works                   */			hp = (struct hostent *) malloc(sizeof(struct hostent));			memset(hp, '\0', sizeof(struct hostent));			hp->h_name = xstrdup(name);			hp->h_aliases = NULL;			hp->h_addrtype = AF_INET;			hp->h_length = 4;			hp->h_addr_list = (char **) malloc(sizeof(char *));;			*(hp->h_addr_list) = (char *) malloc(4);			memcpy(*(hp->h_addr_list), (char *) &ip, 4);		}	} else {		hp = gethostbyname(name);	}	return hp;}static void send_msg(buf, ofp)     char *buf;     FILE *ofp;{	int n = strlen(buf);#ifdef DEBUG	fprintf(stderr, "Sending: %s", buf);#endif	if (fwrite(buf, 1, n, ofp) != n) {		perror("gather: fwrite");		(void) close(fileno(ofp));		exit(1);	}	if (fflush(ofp) != 0) {		perror("gather: fflush");		(void) close(fileno(ofp));		exit(1);	}}static void gzip_message(){	fprintf(stderr, "Could not locate the gzip program. gzip is available in GNU's compression\n\software distribution at\n\n\	ftp://ftp.gnu.org/gnu/gzip/gzip-1.2.4a.shar\n");	fflush(stderr);}static void die(fp)     FILE *fp;{	fprintf(fp, "QUIT\n");	fflush(fp);	(void) close(fileno(fp));	exit(1);}/* Fork a 'gzip -V' command to see if gzip really exists */static int check_for_gzip(){	int status;	int pid;	if ((pid = fork()) < 0) {		perror("gather: fork");		return 1;	}	if (pid == 0) {		/* CHILD */		close(1);		close(2);		execlp("gzip", "gzip", "-V", NULL);		_exit(1);	}	/* PARENT */	(void) waitpid(pid, &status, 0);	return (status >> 8);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -