
📄 http.c

📁 harvest is a robot that downloads HTML web pages
💻 C
📖 Page 1 of 2 (the listing below is truncated here and continues on page 2)
static char rcsid[] = "$Id: http.c,v 2.7 2000/01/21 17:37:33 sxw Exp $";

/*
 *  http.c - URL processing for http-specific URLs.  HTTP/1.0 compliant.
 *
 *  DEBUG: section 21, level 1, 5, 9   Common liburl HTTP routines
 *  AUTHOR: Harvest derived
 *
 *  Harvest Indexer http://harvest.sourceforge.net/
 *  -----------------------------------------------
 *
 *  The Harvest Indexer is a continued development of code developed by
 *  the Harvest Project. Development is carried out by numerous individuals
 *  in the Internet community, and is not officially connected with the
 *  original Harvest Project or its funding sources.
 *
 *  Please mail lee@arco.de if you are interested in participating
 *  in the development effort.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <memory.h>
#include <string.h>
#include <signal.h>
#include <ctype.h>
#include <time.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>

#include "url.h"
#include "util.h"

#ifdef _HARVEST_AIX_
#include <sys/select.h>
#endif

/* Local variables */
static int read_timeout = 0;
static int do_read ();

struct http_auth {
	char *type;
	char *realm;
	char *username;
	char *passwd;
	char *encoded;
	struct http_auth *next;
};
static struct http_auth *HTTPAuth;
static char *http_make_auth ();
static char *ht_uuencode ();

/*
 *  HTTP/1.0 Status Codes from:
 *     http://info.cern.ch/hypertext/WWW/Protocols/HTTP/HTRESP.html
 *
 *  HTTP/1.1 Status Codes from:
 *     RFC 2068 "Hypertext Transfer Protocol -- HTTP/1.1"
 */
#define HTTP_SUCCESS_STATUS(x) \
	( \
	((x) == 200) ||	/* Success: OK */ \
	((x) == 201) ||	/* Success: Created */ \
	((x) == 202) ||	/* Success: Accepted */ \
	((x) == 203) ||	/* Success: Partial Information */ \
	((x) == 204) ||	/* Success: No Response */ \
	((x) == 205) ||	/* 1.1 Success: Reset Content */ \
	((x) == 206)	/* 1.1 Success: Partial Content */ \
	)

#define HTTP_REDIRECTION_STATUS(x) \
	( \
	((x) == 301) ||	/* Redirection: Moved */ \
	((x) == 302) ||	/* Redirection: Found */ \
	((x) == 303) ||	/* Redirection: Method */ \
	((x) == 304) ||	/* Redirection: Not Modified */ \
	((x) == 305)	/* 1.1 Redirection: Use Proxy */ \
	)

#define HTTP_UNAUTHORIZED_STATUS(x) \
	( \
	((x) == 401) ||	/* Error: Unauthorized */ \
	((x) == 407)	/* 1.1 Error: Proxy Authentication Required */ \
	)

#define HTTP_ERROR_STATUS(x) \
	( \
	((x) == 400) ||	/* Error: Bad request */ \
	((x) == 402) ||	/* Error: Payment Required */ \
	((x) == 403) ||	/* Error: Forbidden */ \
	((x) == 404) ||	/* Error: Not found */ \
	((x) == 405) ||	/* 1.1 Error: Method Not Allowed */ \
	((x) == 406) ||	/* 1.1 Error: Not Acceptable */ \
	((x) == 408) ||	/* 1.1 Error: Request Timeout */ \
	((x) == 409) ||	/* 1.1 Error: Conflict */ \
	((x) == 410) ||	/* 1.1 Error: Gone */ \
	((x) == 411) ||	/* 1.1 Error: Length Required */ \
	((x) == 412) ||	/* 1.1 Error: Precondition Failed */ \
	((x) == 413) ||	/* 1.1 Error: Request Entity Too Large */ \
	((x) == 414) ||	/* 1.1 Error: Request-URI Too Long */ \
	((x) == 415) ||	/* 1.1 Error: Unsupported Media Type */ \
	((x) == 500) ||	/* Error: Internal Error */ \
	((x) == 501) ||	/* Error: Not implemented */ \
	((x) == 502) ||	/* Error: Service temporarily overloaded */ \
	((x) == 503) ||	/* Error: Gateway timeout */ \
	((x) == 504) ||	/* 1.1 Error: Gateway Timeout */ \
	((x) == 505)	/* 1.1 Error: HTTP Version Not Supported */ \
	)

#define HTTP_VALID_STATUS(x) \
	( \
	HTTP_SUCCESS_STATUS(x)		|| \
	HTTP_REDIRECTION_STATUS(x)	|| \
	HTTP_ERROR_STATUS(x)		|| \
	HTTP_UNAUTHORIZED_STATUS(x) \
	)

/*
 *  get_sockaddr() - create a socket, bind an address to it
 *
 *  Return values:
 *
 *      0       Success
 *      1       DNS errors
 */
int
get_sockaddr (hostname, sa)
char *hostname;
struct sockaddr_in *sa;
{
	Host *H = NULL;

	if ((H = get_host (hostname)) == NULL) {
		errorlog ("Cannot resolve %s\n", hostname);
		return 1;
	}
	memcpy (&(sa->sin_addr.s_addr), H->ipaddr, H->addrlen);
	return 0;
}

/*
 *  http_get() - retrieves the URL and prints into the file up->fp.
 *  Returns non-zero on error; 0 on success.
 *
 *  Return code indicates severity of error (DW 6/9/95):
 *
 *      -1      Indicates HTTP Redirect - may use -1/-2 in future to
 *              denote 301 / 302
 *      1-9     'soft', maybe temporary errors.  Doesn't necessarily
 *              mean the object doesn't exist.
 *      10+     'hard' errors from remote HTTPD.  The URL is invalid
 *              or no longer exists.
 *
 *  Return codes:
 *      -1      Redirect
 *      0       Success
 *      1       DNS errors (from get_sockaddr())
 *      2       socket()/bind() errors
 *      3       connect() errors
 *      4       network write/read errors
 *      10      HTTP errors
 *
 *  Uses the HTTP/1.0 protocol as described in:
 *      http://info.cern.ch/hypertext/WWW/Protocols/HTTP/HTTP2.html
 *
 *  Patched by Hrvoje Stipetic <hrvoje.stipetic@hck.hr> to support
 *  modification of the User-Agent: and From: HTTP headers.  Uses the
 *  environment variables HARVEST_USER_AGENT and HARVEST_MAINTAINER_ADDRESS.
 *
 *  Patched by Judith Pluemer <judith@mathematik.uni-osnabrueck.de>
 *  to support incremental gathering on HTTP servers.  Additionally
 *  the Perl scripts uptime.pl and extract.pl are used.
 *
 *  Patched by Simon Wilkinson <sxw@tardis.ed.ac.uk> to remove dependencies
 *  on Perl scripts for incremental gathering.
 */
int
http_get (up)
URL *up;
{
	/*
	 *  The following fix was added by Paul Johnson, 27/9/96, to handle
	 *  the end-of-header detection properly.  The original routine
	 *  could not spot a double new-line (end-of-header marker) if it
	 *  was split across two messages from the server.  Some HTTP
	 *  servers (notably NCSA 1.5.2) send each line as a separate
	 *  message, and hence break this code.
	 *
	 *  The solution is to add a new flag variable "last_char_was_eol"
	 *  which is set when we see a \n and cleared when we see anything
	 *  else.  If last_char_was_eol and we have a second \n, then we
	 *  have found the end of the header.
	 */
	Buffer *mimebuf = NULL;
	Buffer *reqbuf = NULL;
	Buffer *urlbuf = NULL;
	URL *new_up = NULL;
	char *bufp = NULL;
	char *host = NULL;
	char *http_proxy = getenv ("http_proxy");
	char *accept = getenv ("HARVEST_HTTP_ACCEPT");
	char *p = NULL;
	char *request = 0;
	char *t = NULL;
	char *tmp = NULL;
	char *u = NULL;
	int i;
	int in_http_code = 1;
	int in_http_data = 0;
	int in_http_header = 0;
	int last_char_was_eol = 0;
	int n;
	int nbytes;
	int nw;
	int port = 0;
	int proxy_port = 0;
	int s;
	int x;
	static char buf[BUFSIZ];
	static char junk[128];
	static char newURL[BUFSIZ];
	static char proxy_host[128];
	static char realm[128];
	static char type[128];
	struct sockaddr_in sa;

	Debug (21, 1, ("http_get: http_proxy=%s\n",
		       http_proxy ? http_proxy : "NULL"));
	if (http_proxy) {
		/* FIX: No error checking on sscanf */
		sscanf (http_proxy, "http://%[^:]:%d/", proxy_host,
			&proxy_port);
		host = proxy_host;
		port = proxy_port;
		request = up->url;
	} else {
		host = up->host;
		port = up->port;
		request = up->raw_pathname;	/* spec says need escapes */
	}
	if ((x = get_sockaddr (host, &sa)) > 0)
		return x;
	sa.sin_family = AF_INET;
	sa.sin_port = (unsigned short) htons (port);
	if ((s = socket (PF_INET, SOCK_STREAM, 0)) < 0) {
		sprintf (buf, "HTTP socket: %s", host);
		log_errno (buf);
		return 2;
	}
	if (connect (s, (struct sockaddr *) &sa, sizeof (sa)) < 0) {
		sprintf (buf, "HTTP connect: %s:%d [%s]",
			 host, port, inet_ntoa (sa.sin_addr));
		log_errno (buf);
		close (s);
		return 3;
	}

	reqbuf = create_buffer (BUFSIZ);
	sprintf (buf, "GET %s HTTP/1.0\r\n", request);
	add_buffer (reqbuf, buf, strlen (buf));
	if (getenv ("HARVEST_USER_AGENT") != NULL)
		sprintf (buf, "User-Agent: %s (Harvest/%s)\r\n",
			 getenv ("HARVEST_USER_AGENT"), HARVEST_VERSION);
	else
		sprintf (buf, "User-Agent: Harvest/%s\r\n", HARVEST_VERSION);
	add_buffer (reqbuf, buf, strlen (buf));
	if (getenv ("HARVEST_MAINTAINER_ADDRESS") != NULL)
		sprintf (buf, "From: %s\r\n",
			 getenv ("HARVEST_MAINTAINER_ADDRESS"));
	else
		sprintf (buf, "From: %s@%s\r\n",
			 getmylogin (), getfullhostname ());
	add_buffer (reqbuf, buf, strlen (buf));

	/* A minor HTTP/1.1 addition - make Harvest send the Host: header --JLa */
	sprintf (buf, "Host: %s\r\n", up->host);
	add_buffer (reqbuf, buf, strlen (buf));

	/* Send an Accept: header.  This is part of HTTP/1.0 */
	if (accept == NULL) {
		sprintf (buf, "Accept: */*\r\n");
		add_buffer (reqbuf, buf, strlen (buf));
	} else if (accept[0] != '\0') {
		sprintf (buf, "Accept: %s\n", accept);
		add_buffer (reqbuf, buf, strlen (buf));
	}

	urlbuf = create_buffer (BUFSIZ);
	sprintf (buf, "http://%s", up->host);
	add_buffer (urlbuf, buf, strlen (buf));
	if (up->port != 80) {
		sprintf (buf, ":%d", up->port);
		add_buffer (urlbuf, buf, strlen (buf));
	}
	/* FIXME: Is this right, or should we be using the escaped pathname? */
	add_buffer (urlbuf, up->raw_pathname, strlen (up->raw_pathname));
	up->lmt = urldb_getlmt (urlbuf->data);
	if (up->lmt > 0) {
		sprintf (buf, "If-Modified-Since: %s\r\n",
			 mkrfc850 (&(up->lmt)));
		add_buffer (reqbuf, buf, strlen (buf));
	}

#ifdef HTTP_AUTHENTICATION
	if (up->auth_realm) {
		if ((tmp = http_make_auth (up->auth_realm))) {
			sprintf (buf, "Authorization: %s %s\r\n",
				 up->auth_type, tmp);
			add_buffer (reqbuf, buf, strlen (buf));
			xfree (tmp);
		}
	} else
		/*
		 *  Another way to do HTTP authentication.  If they give
		 *  http://user:pw@host:port/url-path
		 *  then use it.
		 */
	if (up->user || up->password) {
		char *xbuf = xmalloc (BUFSIZ);

		sprintf (xbuf, "%s:%s",
			 up->user ? up->user : "",
			 up->password ? up->password : "");
		sprintf (buf, "Authorization: Basic %s\r\n",
			 ht_uuencode (xbuf));
		xfree (xbuf);
		add_buffer (reqbuf, buf, strlen (buf));
	}
#endif

	add_buffer (reqbuf, "\r\n", 2);
	Debug (21, 1, ("http_get: Sending HTTP Request: %s\n", reqbuf->data));
	if (write (s, reqbuf->data, reqbuf->length) != reqbuf->length) {
		sprintf (buf, "HTTP write: %s:%d", up->host, up->port);
		log_errno (buf);
		close (s);
		free_buffer (reqbuf);
		return 4;
	}
	free_buffer (reqbuf);

	/* Now read the HTTP/1.0 response, and write data into a file */
	memset (buf, '\0', BUFSIZ);
	if ((up->fp = fopen (up->filename, "w+")) == NULL) {
		log_errno (up->filename);
		close (s);
		return 4;
	}
	nbytes = 0;
	while (1) {
		read_timeout = 0;
		n = do_read (s, buf, BUFSIZ - 9);	/* need a little extra at end */
		buf[n] = 0;
		Debug (21, 9, ("Read %d bytes: ---BEGIN---\n%s\n---END---\n",
			       n, buf));
		bufp = &buf[0];
		if (n == 0)
			break;	/* nothing left to do */
		if (n < 0) {
			if (read_timeout == 1) {
				char *t = NULL;
				int to = XFER_TIMEOUT;

				if ((t = getenv ("HARVEST_XFER_TIMEOUT")) != NULL)
					to = atol (t);
				errorlog ("HTTP timeout: %s:%d (%d seconds).\n",
					  up->host, up->port, to);
			} else
				errorlog ("HTTP read: %s:%d failed.\n",
					  up->host, up->port);
			close (s);
			fclose (up->fp);
			free_buffer (urlbuf);
			return 4;
		}
		if (in_http_code) {
			in_http_code = 0;
			while (isspace (*bufp) && n > 0)
				bufp++, n--;
			/*
			 *  The next line altered by Simon Wilkinson
			 *  <sxw@tardis.ed.ac.uk> so Harvest will work with
			 *  HTTP/1.1 compliant servers.
			 */
			if ((strlen (bufp) > 5) &&
			    (memcmp (bufp, "HTTP/1", 6) != 0)) {
				tmp = strchr (bufp, '\n');
				if (tmp != NULL)
					*tmp = '\0';	/* get one line */
				up->http_version = xstrdup ("UNKNOWN");
				Log ("WARNING: Invalid HTTP/1.0 response: %s: %s\n", up->url, bufp);
				in_http_header = 0;
				in_http_data = 1;
			} else {
				if ((strchr (bufp, '3') != NULL) &&
				    (memcmp (strchr (bufp, '3'), "304", 3) == 0)) {
					/*
					 *  We've done a successful If-Modified-Since get,
					 *  so we now need to write the file out of the
					 *  production db into where the data is expected.
					 */
#ifdef USE_MD5
					/* Pull the MD5 hash from the production db */
					up->md5 = urldb_getmd5 (urlbuf->data);
#endif
					up->http_version = xstrdup ("HTTP/1.0");
					up->http_reason_line =
					    xstrdup ("HTTP/1.0 200 OK\n");
					/*
					 *  Create a fake set of Mime headers for the
					 *  item.
					 */
					mimebuf = create_buffer (BUFSIZ);
					sprintf (buf, "HTTP/1.1 200 OK\n");
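The comment block above http_get() documents the return-code contract that callers of this routine are expected to branch on: -1 for a redirect, 0 for success, 1 through 9 for soft (possibly temporary) failures, and 10 or more for hard HTTP errors. As a minimal caller-side sketch only (not part of http.c; the wrapper name and retry policy below are assumptions for illustration, and the URL structure is assumed to have been populated by liburl elsewhere):

/*
 * Sketch, not part of http.c: interpreting http_get()'s documented
 * return-code ranges.  fetch_with_retry_hint() is a hypothetical
 * wrapper; only the numeric ranges come from the comments above.
 */
static int
fetch_with_retry_hint (URL *up, int *retry)
{
	int rc = http_get (up);

	*retry = 0;
	if (rc == 0)
		return 0;	/* success: object written to up->fp */
	if (rc < 0)
		return 0;	/* redirect: caller follows the new URL */
	if (rc < 10)
		*retry = 1;	/* soft error: DNS, socket, connect, or I/O */
	return rc;		/* 10 or more: hard HTTP error, drop the URL */
}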
