⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ftpget.bin.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
static char rcsid[] = "$Id: ftpget.c,v 2.2 2000/01/21 17:37:33 sxw Exp $";/* *  ftpget.c - An FTP client used by liburl and cached. * *  Duane Wessels, University of Colorado, September 1995 * *  DEBUG: section  26, level 1           ftpget - standalone liburl program. * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  
We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */#include <stdlib.h>#include <unistd.h>#include <memory.h>#include <fcntl.h>#include <errno.h>#include <string.h>#include <signal.h>#include <sys/types.h>#include <sys/stat.h>#include <time.h>#include <sys/socket.h>#include <netinet/in.h>#include <arpa/inet.h>#include "util.h"typedef struct _ext_table_entry {	char *name;	char *mime_type;	char *mime_encoding;	char *icon;} ext_table_entry;#include "mime_table.h"#define FIELDSIZE 32#define FTP_PORT 21#define DEFAULT_MIME_TYPE "text/plain"#define F_HTTPIFY	0x01#define F_HDRSENT	0x02#define F_ISDIR		0x04#define F_NOERRS	0x08static char *http_time _PARAMS((time_t));typedef enum {	BEGIN,	PARSE_OK,	CONNECTED,	SERVICE_READY,	NEED_PASSWD,	LOGGED_IN,	TYPE_OK,	MDTM_OK,	SIZE_OK,	PORT_OK,	CWD_OK,	CWD_FAIL,	TRANSFER_BEGIN,	DATA_TRANSFER,	TRANSFER_DONE,	DONE,	FAIL_SOFT,		/* don't cache these */	FAIL_HARD		/* do cache these */} state_t;typedef struct _request {	char *host;	char *path;	char *type;	char *user;	char *pass;	char *path_escaped;	char *userinfo;	char *url;	int cfd;	int sfd;	int dfd;	int connect_attempts;	int login_attempts;	state_t state;	int rc;	char *errmsg;	time_t mdtm;	int size;	int flags;	char *mime_type;	char *mime_enc;	char *html_icon;	FILE *readme_fp;	struct _list_t *cmd_msg;} request_t;typedef struct _parts {	char type;	int size;	char *date;	char *name;	char *showname;	char *link;} parts_t;typedef struct _list_t {	char *ptr;	struct _list_t *next;} list_t;/* *  GLOBALS */int connect_retries = 1;int login_retries = 1;char *progname = NULL;char cbuf[BUFSIZ];		/* send command buffer */char *htmlbuf = NULL;char *server_reply_msg = NULL;struct sockaddr_in ifc_addr;int timeout = XFER_TIMEOUT;	/* 120, from config.h *//* This linked list holds the "continuation" lines before the final * reply code line is sent for a FTP command */list_t *cmd_msg = NULL;static int process_request _PARAMS((request_t *));static char *state_str[] ={    "BEGIN",    "PARSE_OK",    "CONNECTED",    "SERVICE_READY",    "NEED_PASSWD",    
"LOGGED_IN",    "TYPE_OK",    "MDTM_OK",    "SIZE_OK",    "PORT_OK",    "CWD_OK",    "CWD_FAIL",    "TRANSFER_BEGIN",    "DATA_TRANSFER",    "TRANSFER_DONE",    "DONE",    "FAIL_SOFT",    "FAIL_HARD",};/* *  CACHED_RETRIEVE_ERROR_MSG args: *      $1 is URL, *      $2 is URL, *      $3 is protocol type string *      $4 is error code, *      $5 is error msg, *      $6 is message to user *      $7 is time string *      $8 is cached version *      $9 is cached hostname */#define CACHED_RETRIEVE_ERROR_MSG "\<HTML><HEAD>\n\<TITLE>ERROR: The requested URL could not be retrieved</TITLE>\n\</HEAD><BODY>\n\<H2>ERROR: The requested URL could not be retrieved</H2>\n\<HR>\n\<P>\n\While trying to retrieve the URL:\n\<A HREF=\"%s\">%s</A>\n\<P>\n\The following %s error was encountered:\n\<UL>\n\<LI><STRONG>ERROR %d -- %s</STRONG>\n\</UL>\n\<P>This means that:\n\<PRE>\n\    %s\n\</PRE>\n\<P> <HR>\n\<ADDRESS>\n\Generated %s, by ftpget/%s@%s\n\</ADDRESS>\n\</BODY></HTML>\n\\n"void fail(r)     request_t *r;{	FILE *fp = NULL;	char *longmsg = NULL;	time_t expire_time;	if (r->flags & F_NOERRS)		return;	switch (r->rc) {	case 0:		longmsg = "Success!  Huh?";		break;	case 2:		longmsg = "A local socket error occured.  Please try again.";		break;	case 3:		longmsg = "A network socket error occured.  Please try again.";		break;	case 4:		longmsg = "A network read or write error occured.  Please try again.";		break;	case 5:		longmsg = "An FTP protocol error occured.  
Please try again.";		break;	case 10:		longmsg = "The given URL does not exist.";		break;	default:		break;	}	if ((r->flags & F_HTTPIFY) && !(r->flags & F_HDRSENT)) {		Debug(26, 1, ("Preparing HTML error message\n"));		expire_time = time(NULL) + 300;		/* XXX hardcoded 5 min */		htmlbuf = (char *) xmalloc(8192);		sprintf(htmlbuf, CACHED_RETRIEVE_ERROR_MSG,		    r->url,		    r->url,		    "FTP",		    304,		    r->errmsg,		    longmsg,		    http_time(0),		    HARVEST_VERSION,		    getfullhostname());		if ((fp = fdopen(dup(r->cfd), "w")) == NULL) {			log_errno2(__FILE__, __LINE__, "fdopen");			exit(1);		}		setbuf(fp, NULL);		fprintf(fp, "HTTP/1.0 500 Proxy Error\r\n");		fprintf(fp, "Expires: %s\r\n", mkrfc850(&expire_time));		fprintf(fp, "MIME-Version: 1.0\r\n");		fprintf(fp, "Server: Harvest %s\r\n", HARVEST_VERSION);		fprintf(fp, "Content-Type: text/html\r\n");		fprintf(fp, "Content-Length: %ld\r\n", strlen(htmlbuf));		fprintf(fp, "\r\n");		fputs(htmlbuf, fp);		fclose(fp);	} else if (r->flags & F_HTTPIFY) {		if ((fp = fdopen(dup(r->cfd), "w")) == NULL) {			log_errno2(__FILE__, __LINE__, "fdopen");			exit(1);		}		setbuf(fp, NULL);		htmlbuf = (char *) xmalloc(8192);		sprintf(htmlbuf, CACHED_RETRIEVE_ERROR_MSG,		    r->url,		    r->url,		    "FTP",		    304,		    r->errmsg,		    longmsg,		    http_time(0),		    HARVEST_VERSION,		    getfullhostname());		fputs(htmlbuf, fp);		xfree(r->errmsg);		fclose(fp);	} else if (r->errmsg) {		errorlog("%s\n\t<URL:%s>\n", r->errmsg, r->url);		xfree(r->errmsg);	}}void timeout_handler(sig, code, scp, addr)     int sig, code;     struct sigcontext *scp;     char *addr;{	errorlog("Timeout after %d seconds, exiting.\n", timeout);	exit(1);}/* *  If there are two extensions and both are listed in the types table *  then return the leftmost extention type.  
The rightmost extention *  type becomes the content encoding (eg .gz) */void mime_get_type(r)     request_t *r;{	char *filename = NULL;	char *ext = NULL;	char *t = NULL;	char *type = NULL;	char *enc = NULL;	int i;	if (r->flags & F_ISDIR) {		r->mime_type = xstrdup("text/html");		return;	}	type = DEFAULT_MIME_TYPE;	if ((t = strrchr(r->path, '/')))		filename = xstrdup(t + 1);	else		filename = xstrdup(r->path);	if (!(t = strrchr(filename, '.')))		goto mime_get_type_done;	ext = xstrdup(t + 1);	for (i = 0; i < EXT_TABLE_LEN; i++) {		if (!strcmp(ext, ext_mime_table[i].name)) {			type = ext_mime_table[i].mime_type;			enc = ext_mime_table[i].mime_encoding;			break;		}	}	if (i == EXT_TABLE_LEN) {		for (i = 0; i < EXT_TABLE_LEN; i++) {			if (!strcasecmp(ext, ext_mime_table[i].name)) {				type = ext_mime_table[i].mime_type;				enc = ext_mime_table[i].mime_encoding;				break;			}		}	}	/* now check for another extension */	*t = '\0';	if (!(t = strrchr(filename, '.')))		goto mime_get_type_done;	xfree(ext);	ext = xstrdup(t + 1);	for (i = 0; i < EXT_TABLE_LEN; i++) {		if (!strcmp(ext, ext_mime_table[i].name)) {			type = ext_mime_table[i].mime_type;			break;		}	}	if (i == EXT_TABLE_LEN) {		for (i = 0; i < EXT_TABLE_LEN; i++) {			if (!strcasecmp(ext, ext_mime_table[i].name)) {				type = ext_mime_table[i].mime_type;				break;			}		}	}      mime_get_type_done:	xfree(filename);	xfree(ext);	r->mime_type = xstrdup(type);	if (enc)		r->mime_enc = xstrdup(enc);}char *mime_get_icon(name)     char *name;{	char *ext = NULL;	char *t = NULL;	int i = 0;	if (!(t = strrchr(name, '.')))		return xstrdup("unknown");	ext = xstrdup(t + 1);	Debug(26, 1, ("mime_get_icon: ext = '%s'\n", ext));	for (i = 0; i < EXT_TABLE_LEN; i++) {		if (!strcmp(ext, ext_mime_table[i].name)) {			Debug(26, 1, ("mime_get_icon: matched entry #%d\n", i));			Debug(26, 1, ("mime_get_icon: returning '%s'\n",				ext_mime_table[i].icon));			xfree(ext);			return xstrdup(ext_mime_table[i].icon);			/* NOTREACHED */		}	}	if (i == 
EXT_TABLE_LEN) {		for (i = 0; i < EXT_TABLE_LEN; i++) {			if (!strcasecmp(ext, ext_mime_table[i].name)) {				Debug(26, 1, ("mime_get_icon: matched entry #%d\n", i));				Debug(26, 1, ("mime_get_icon: returning '%s'\n",					ext_mime_table[i].icon));				xfree(ext);				return xstrdup(ext_mime_table[i].icon);				/* NOTREACHED */			}		}	}	return xstrdup("unknown");}static char *http_time(t)     time_t t;{	struct tm *gmt;	time_t when;	static char tbuf[128];	when = t ? t : time(NULL);	gmt = gmtime(&when);	strftime(tbuf, 128, "%A, %d-%b-%y %H:%M:%S GMT", gmt);	return tbuf;}void send_success_hdr(r)     request_t *r;{	FILE *fp = NULL;	if (r->flags & F_HDRSENT)		return;	r->flags |= F_HDRSENT;	mime_get_type(r);	if ((fp = fdopen(dup(r->cfd), "w")) == NULL) {		log_errno2(__FILE__, __LINE__, "fdopen");		exit(1);	}	setbuf(fp, NULL);	fprintf(fp, "HTTP/1.0 200 Gatewaying\r\n");	fprintf(fp, "MIME-Version: 1.0\r\n");	fprintf(fp, "Server: Harvest %s\r\n", HARVEST_VERSION);	if (r->mime_type)		fprintf(fp, "Content-Type: %s\r\n", r->mime_type);	if (r->size > 0)		fprintf(fp, "Content-Length: %d\r\n", r->size);	if (r->mime_enc)		fprintf(fp, "Content-Encoding: %s\r\n", r->mime_enc);	if (r->mdtm > 0)		fprintf(fp, "Last-Modified: %s\r\n", http_time(r->mdtm));	fprintf(fp, "\r\n");	fclose(fp);}/* *  read_reply() *  Read reply strings from an FTP server. * *  Returns the reply code. 
*/int read_reply(fd)     int fd;{	FILE *fp = NULL;	static char buf[BUFSIZ];	static char xbuf[BUFSIZ];	int quit = 0;	char *t = NULL;	int code;	list_t **Tail = NULL;	list_t *l = NULL;	list_t *next = NULL;	for (l = cmd_msg; l; l = next) {		next = l->next;		xfree(l->ptr);		xfree(l);	}	cmd_msg = NULL;	Tail = &cmd_msg;	if (server_reply_msg) {		xfree(server_reply_msg);		server_reply_msg = NULL;	}	if ((fp = fdopen(dup(fd), "r")) == (FILE *) NULL) {		log_errno2(__FILE__, __LINE__, "fdopen");		exit(1);	}	setbuf(fp, NULL);	while (!quit) {		if (fgets(buf, BUFSIZ, fp) == (char *) NULL) {			alarm(timeout);		/* reset timeout timer */			sprintf(xbuf, "read failed: %s", strerror(errno));			server_reply_msg = xstrdup(xbuf);			fclose(fp);			return -1;		}		quit = (buf[2] >= '0' && buf[2] <= '9' && buf[3] == ' ');		if (!quit) {			l = (list_t *) xmalloc(sizeof(list_t));			l->ptr = xstrdup(&buf[4]);			l->next = NULL;			*Tail = l;			Tail = &(l->next);		}		if ((t = strchr(buf, '\r')))			*t = 0;		if ((t = strchr(buf, '\n')))

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -