http.c
static char rcsid[] = "$Id: http.c,v 2.7 2000/01/21 17:37:33 sxw Exp $";

/*
 *  http.c - URL processing for http-specific URLs.  HTTP/1.0 compliant.
 *
 *  DEBUG: section 21, level 1, 5, 9     Common liburl HTTP routines
 *  AUTHOR: Harvest derived
 *
 *  Harvest Indexer http://harvest.sourceforge.net/
 *  -----------------------------------------------
 *
 *  The Harvest Indexer is a continued development of code developed by
 *  the Harvest Project.  Development is carried out by numerous individuals
 *  in the Internet community, and is not officially connected with the
 *  original Harvest Project or its funding sources.
 *
 *  Please mail lee@arco.de if you are interested in participating
 *  in the development effort.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <memory.h>
#include <string.h>
#include <signal.h>
#include <ctype.h>
#include <time.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>          /* inet_ntoa() */
#include <netdb.h>
#include <errno.h>

#include "url.h"
#include "util.h"

#ifdef _HARVEST_AIX_
#include <sys/select.h>
#endif

/* Local variables */
static int read_timeout = 0;
static int do_read ();

struct http_auth {
    char *type;
    char *realm;
    char *username;
    char *passwd;
    char *encoded;
    struct http_auth *next;
};
static struct http_auth *HTTPAuth;
static char *http_make_auth ();
static char *ht_uuencode ();

/*
 *  HTTP/1.0 Status Codes from:
 *      http://info.cern.ch/hypertext/WWW/Protocols/HTTP/HTRESP.html
 *
 *  HTTP/1.1 Status Codes from:
 *      RFC 2068 "Hypertext Transfer Protocol -- HTTP/1.1"
 */
#define HTTP_SUCCESS_STATUS(x) \
        ( \
        ((x) == 200) ||         /* Success: OK */ \
        ((x) == 201) ||         /* Success: CREATED */ \
        ((x) == 202) ||         /* Success: Accepted */ \
        ((x) == 203) ||         /* Success: Partial Information */ \
        ((x) == 204) ||         /* Success: No Response */ \
        ((x) == 205) ||         /* 1.1 Success: Reset Content */ \
        ((x) == 206)            /* 1.1 Success: Partial Content */ \
        )
#define HTTP_REDIRECTION_STATUS(x) \
        ( \
        ((x) == 301) ||         /* Redirection: Moved */ \
        ((x) == 302) ||         /* Redirection: Found */ \
        ((x) == 303) ||         /* Redirection: Method */ \
        ((x) == 304) ||         /* Redirection: Not Modified */ \
        ((x) == 305)            /* 1.1 Redirection: Use Proxy */ \
        )
#define HTTP_UNAUTHORIZED_STATUS(x) \
        ( \
        ((x) == 401) ||         /* Error: Unauthorized */ \
        ((x) == 407)            /* 1.1 Error: Proxy Authentication Required */ \
        )
#define HTTP_ERROR_STATUS(x) \
        ( \
        ((x) == 400) ||         /* Error: Bad request */ \
        ((x) == 402) ||         /* Error: Payment Required */ \
        ((x) == 403) ||         /* Error: Forbidden */ \
        ((x) == 404) ||         /* Error: Not found */ \
        ((x) == 405) ||         /* 1.1 Error: Method Not Allowed */ \
        ((x) == 406) ||         /* 1.1 Error: Not Acceptable */ \
        ((x) == 408) ||         /* 1.1 Error: Request Timeout */ \
        ((x) == 409) ||         /* 1.1 Error: Conflict */ \
        ((x) == 410) ||         /* 1.1 Error: Gone */ \
        ((x) == 411) ||         /* 1.1 Error: Length Required */ \
        ((x) == 412) ||         /* 1.1 Error: Precondition Failed */ \
        ((x) == 413) ||         /* 1.1 Error: Request Entity Too Large */ \
        ((x) == 414) ||         /* 1.1 Error: Request-URI Too Long */ \
        ((x) == 415) ||         /* 1.1 Error: Unsupported Media Type */ \
        ((x) == 500) ||         /* Error: Internal Error */ \
        ((x) == 501) ||         /* Error: Not implemented */ \
        ((x) == 502) ||         /* Error: Service temporarily overloaded */ \
        ((x) == 503) ||         /* Error: Gateway timeout */ \
        ((x) == 504) ||         /* 1.1 Error: Gateway Timeout */ \
        ((x) == 505)            /* 1.1 Error: HTTP Version Not Supported */ \
        )
#define HTTP_VALID_STATUS(x) \
        ( \
        HTTP_SUCCESS_STATUS(x) || \
        HTTP_REDIRECTION_STATUS(x) || \
        HTTP_ERROR_STATUS(x) || \
        HTTP_UNAUTHORIZED_STATUS(x) \
        )

/*
 *  get_sockaddr() - create a socket, bind an address to it
 *
 *  Return values:
 *
 *      0       Success
 *      1       DNS errors
 */
int
get_sockaddr (hostname, sa)
     char *hostname;
     struct sockaddr_in *sa;
{
    Host *H = NULL;

    if ((H = get_host (hostname)) == NULL) {
        errorlog ("Cannot resolve %s\n", hostname);
        return 1;
    }
    memcpy (&(sa->sin_addr.s_addr), H->ipaddr, H->addrlen);
    return 0;
}

/*
 *  http_get() - retrieves the URL and prints into the file up->fp.
 *  Returns non-zero on error; 0 on success.
 *
 *  Return code indicates severity of error (DW 6/9/95):
 *
 *      -1      Indicates HTTP Redirect - may use -1/-2 in future to
 *              denote 301 / 302
 *      1-9     'soft', maybe temporary errors.  Doesn't necessarily
 *              mean the object doesn't exist.
 *      10+     'hard' errors from the remote HTTPD.  The URL is invalid
 *              or no longer exists.
 *
 *  Return codes:
 *      -1      Redirect
 *       0      Success
 *       1      DNS errors (from get_sockaddr())
 *       2      socket()/bind() errors
 *       3      connect() errors
 *       4      network write/read errors
 *      10      HTTP errors
 *
 *  Uses the HTTP/1.0 protocol as described in:
 *      http://info.cern.ch/hypertext/WWW/Protocols/HTTP/HTTP2.html
 *
 *  Patched by Hrvoje Stipetic <hrvoje.stipetic@hck.hr> to support
 *  modification of the User-Agent: and From: HTTP headers.  Uses the
 *  environment variables HARVEST_USER_AGENT and HARVEST_MAINTAINER_ADDRESS.
 *
 *  Patched by Judith Pluemer <judith@mathematik.uni-osnabrueck.de>
 *  to support incremental gathering on HTTP servers.  Additionally
 *  the Perl scripts uptime.pl and extract.pl are used.
 *
 *  Patched by Simon Wilkinson <sxw@tardis.ed.ac.uk> to remove dependencies
 *  on Perl scripts for incremental gathering.
 */
int
http_get (up)
     URL *up;
{
    /*
     * The following fix was added by Paul Johnson, 27/9/96, to handle
     * the end-of-header detection properly.  The original routine
     * could not spot a double newline (end-of-header marker) if it
     * was split across two messages from the server.  Some HTTP
     * servers (notably NCSA 1.5.2) send each line as a separate
     * message, and hence break this code.
     *
     * The solution is to add a new flag variable "last_char_was_eol"
     * which is set when we see a \n and cleared when we see anything
     * else.  If last_char_was_eol and we have a second \n, then we
     * have found the end of the header.
     */
    Buffer *mimebuf = NULL;
    Buffer *reqbuf = NULL;
    Buffer *urlbuf = NULL;
    URL *new_up = NULL;
    char *bufp = NULL;
    char *host = NULL;
    char *http_proxy = getenv ("http_proxy");
    char *accept = getenv ("HARVEST_HTTP_ACCEPT");
    char *p = NULL;
    char *request = 0;
    char *t = NULL;
    char *tmp = NULL;
    char *u = NULL;
    int i;
    int in_http_code = 1;
    int in_http_data = 0;
    int in_http_header = 0;
    int last_char_was_eol = 0;
    int n;
    int nbytes;
    int nw;
    int port = 0;
    int proxy_port = 0;
    int s;
    int x;
    static char buf[BUFSIZ];
    static char junk[128];
    static char newURL[BUFSIZ];
    static char proxy_host[128];
    static char realm[128];
    static char type[128];
    struct sockaddr_in sa;

    Debug (21, 1, ("http_get: http_proxy=%s\n",
                   http_proxy ? http_proxy : "NULL"));
    if (http_proxy) {
        /* FIX: No error checking on sscanf */
        sscanf (http_proxy, "http://%[^:]:%d/", proxy_host, &proxy_port);
        host = proxy_host;
        port = proxy_port;
        request = up->url;
    } else {
        host = up->host;
        port = up->port;
        request = up->raw_pathname;     /* spec says need escapes */
    }

    if ((x = get_sockaddr (host, &sa)) > 0)
        return x;
    sa.sin_family = AF_INET;
    sa.sin_port = (unsigned short) htons (port);

    if ((s = socket (PF_INET, SOCK_STREAM, 0)) < 0) {
        sprintf (buf, "HTTP socket: %s", host);
        log_errno (buf);
        return 2;
    }
    if (connect (s, (struct sockaddr *) &sa, sizeof (sa)) < 0) {
        sprintf (buf, "HTTP connect: %s:%d [%s]", host, port,
                 inet_ntoa (sa.sin_addr));
        log_errno (buf);
        close (s);
        return 3;
    }

    reqbuf = create_buffer (BUFSIZ);
    sprintf (buf, "GET %s HTTP/1.0\r\n", request);
    add_buffer (reqbuf, buf, strlen (buf));
    if (getenv ("HARVEST_USER_AGENT") != NULL)
        sprintf (buf, "User-Agent: %s (Harvest/%s)\r\n",
                 getenv ("HARVEST_USER_AGENT"), HARVEST_VERSION);
    else
        sprintf (buf, "User-Agent: Harvest/%s\r\n", HARVEST_VERSION);
    add_buffer (reqbuf, buf, strlen (buf));
    if (getenv ("HARVEST_MAINTAINER_ADDRESS") != NULL)
        sprintf (buf, "From: %s\r\n", getenv ("HARVEST_MAINTAINER_ADDRESS"));
    else
        sprintf (buf, "From: %s@%s\r\n", getmylogin (), getfullhostname ());
    add_buffer (reqbuf, buf, strlen (buf));

    /* A minor HTTP/1.1 addition - make Harvest send the Host: header --JLa */
    sprintf (buf, "Host: %s\r\n", up->host);
    add_buffer (reqbuf, buf, strlen (buf));

    /* Send an Accept: header.  This is part of HTTP/1.0 */
    if (accept == NULL) {
        sprintf (buf, "Accept: */*\r\n");
        add_buffer (reqbuf, buf, strlen (buf));
    } else if (accept[0] != '\0') {
        sprintf (buf, "Accept: %s\r\n", accept);
        add_buffer (reqbuf, buf, strlen (buf));
    }

    urlbuf = create_buffer (BUFSIZ);
    sprintf (buf, "http://%s", up->host);
    add_buffer (urlbuf, buf, strlen (buf));
    if (up->port != 80) {
        sprintf (buf, ":%d", up->port);
        add_buffer (urlbuf, buf, strlen (buf));
    }
    /* FIXME: Is this right, or should we be using the escaped pathname? */
    add_buffer (urlbuf, up->raw_pathname, strlen (up->raw_pathname));

    up->lmt = urldb_getlmt (urlbuf->data);
    if (up->lmt > 0) {
        sprintf (buf, "If-Modified-Since: %s\r\n", mkrfc850 (&(up->lmt)));
        add_buffer (reqbuf, buf, strlen (buf));
    }
#ifdef HTTP_AUTHENTICATION
    if (up->auth_realm) {
        if ((tmp = http_make_auth (up->auth_realm))) {
            sprintf (buf, "Authorization: %s %s\r\n", up->auth_type, tmp);
            add_buffer (reqbuf, buf, strlen (buf));
            xfree (tmp);
        }
    } else
        /*
         * Another way to do HTTP authentication.  If they give
         *      http://user:pw@host:port/url-path
         * then use it.
         */
        if (up->user || up->password) {
            char *xbuf = xmalloc (BUFSIZ);
            sprintf (xbuf, "%s:%s", up->user ? up->user : "",
                     up->password ? up->password : "");
            sprintf (buf, "Authorization: Basic %s\r\n", ht_uuencode (xbuf));
            xfree (xbuf);
            add_buffer (reqbuf, buf, strlen (buf));
        }
#endif
    add_buffer (reqbuf, "\r\n", 2);

    Debug (21, 1, ("http_get: Sending HTTP Request: %s\n", reqbuf->data));
    if (write (s, reqbuf->data, reqbuf->length) != reqbuf->length) {
        sprintf (buf, "HTTP write: %s:%d", up->host, up->port);
        log_errno (buf);
        close (s);
        free_buffer (reqbuf);
        return 4;
    }
    free_buffer (reqbuf);

    /* Now read the HTTP/1.0 response, and write data into a file */
    memset (buf, '\0', BUFSIZ);
    if ((up->fp = fopen (up->filename, "w+")) == NULL) {
        log_errno (up->filename);
        close (s);
        return 4;
    }
    nbytes = 0;
    while (1) {
        read_timeout = 0;
        n = do_read (s, buf, BUFSIZ - 9);       /* need a little extra at end */
        if (n >= 0)
            buf[n] = 0;
        Debug (21, 9, ("Read %d bytes: ---BEGIN---\n%s\n---END---\n", n, buf));
        bufp = &buf[0];
        if (n == 0)
            break;              /* nothing left to do */
        if (n < 0) {
            if (read_timeout == 1) {
                char *t = NULL;
                int to = XFER_TIMEOUT;

                if ((t = getenv ("HARVEST_XFER_TIMEOUT")) != NULL)
                    to = atol (t);
                errorlog ("HTTP timeout: %s:%d (%d seconds).\n",
                          up->host, up->port, to);
            } else
                errorlog ("HTTP read: %s:%d failed.\n", up->host, up->port);
            close (s);
            fclose (up->fp);
            free_buffer (urlbuf);
            return 4;
        }
        if (in_http_code) {
            in_http_code = 0;
            while (isspace (*bufp) && n > 0)
                bufp++, n--;
            /*
             * The next line altered by Simon Wilkinson
             * <sxw@tardis.ed.ac.uk> so Harvest will work with
             * HTTP/1.1 compliant servers.
             */
            if ((strlen (bufp) > 5) && (memcmp (bufp, "HTTP/1", 6) != 0)) {
                tmp = strchr (bufp, '\n');
                if (tmp != NULL)
                    *tmp = '\0';        /* get one line */
                up->http_version = xstrdup ("UNKNOWN");
                Log ("WARNING: Invalid HTTP/1.0 response: %s: %s\n",
                     up->url, bufp);
                in_http_header = 0;
                in_http_data = 1;
            } else {
                if ((strchr (bufp, '3') != NULL) &&
                    (memcmp (strchr (bufp, '3'), "304", 3) == 0)) {
                    /*
                     * We've done a successful If-Modified-Since get,
                     * so we now need to write the file out of the
                     * production db into where the data is expected.
                     */
#ifdef USE_MD5
                    /* Pull the MD5 hash from the production db */
                    up->md5 = urldb_getmd5 (urlbuf->data);
#endif
                    up->http_version = xstrdup ("HTTP/1.0");
                    up->http_reason_line = xstrdup ("HTTP/1.0 200 OK\n");
                    /*
                     * Create a fake set of MIME headers for the item
                     */
                    mimebuf = create_buffer (BUFSIZ);
                    sprintf (buf, "HTTP/1.1 200 OK\n");