⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 url.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
static char rcsid[] = "$Id: url.c,v 2.3 2000/01/21 17:37:33 sxw Exp $";/* *  url.c - URL processing code * *  DEBUG: section  20, level 1         Common liburl URL processing *  AUTHOR: Harvest derived * *  Harvest Indexer http://harvest.sourceforge.net/ *  ----------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail lee@arco.de if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. 
*          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. 
* *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <string.h>#include <stdlib.h>#include <unistd.h>#include <ctype.h>#include <sys/socket.h>#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#include <netinet/in.h>#include <arpa/inet.h>#include <netdb.h>#include <errno.h>#include "util.h"#include "url.h"#ifdef USE_CCACHE#include "ccache.h"#endif#define BIG_BUFSIZ (BUFSIZ<<3)/* Global variables */int liburl_conform_rfc1738 = 0;int liburl_sleep_time;#ifdef USE_LOCAL_CACHEint use_local_cache = 1;#elseint use_local_cache = 0;#endif/* Local Functions */static void Tolower();static void remove_dot();static void remove_dotdot();static URL *url_parse();static char *shsafe_path();static void get_lmt();#ifdef OLD_CODEstatic int compare_fullhost();#endif/* NOTE these rely on the order of 'enum url_types' in ../include/url.h */struct _url_table url_table[] = {	{		"unknown", 0, 0	},			/* URL_UNKNOWN, */	{		"file", 0, 0	},			/* URL_FILE,    */	{		"ftp", 21, ftp_get	},			/* URL_FTP,     */	{		"gopher", 70, gopher_get	},			/* URL_GOPHER,  */	{		"http", 80, http_get	},			/* URL_HTTP,    */	{	
	"news", 119, news_get	},			/* URL_NEWS,    */	{		"nop", 0, 0	},			/* URL_NOP,     */	{		"telnet", 25, 0	},			/* URL_TELNET,  */	{		"wais", 0, 0	},			/* URL_WAIS,    */	{		"x-", 0, 0	},			/* URL_X,       */	{		"mailto", 0, 0	},			/* URL_MAILTO,  */};static int init_called = 0;struct local_trans_table {	char *from;	char *to;	struct local_trans_table *next;};static struct local_trans_table *LocalTransTable = NULL;void url_initLocalServers(){	FILE *fp = NULL;	char *from = NULL;	char *to = NULL;	char *t = NULL;	char *buf = NULL;	struct local_trans_table *x;	LocalTransTable = NULL;	if ((t = getenv("HARVEST_URL_LOCAL_MAPPINGS")) == NULL)		return;	Debug(20, 1, ("url_initLocalServers: OPEN URLTABLE: %s\n", t));	if ((fp = fopen(t, "r")) == NULL)		return;	from = xmalloc(BUFSIZ);	to = xmalloc(BUFSIZ);	buf = xmalloc(BUFSIZ);	while (fgets(buf, BUFSIZ, fp)) {		if ((t = strchr(buf, '\n')))			*t = '\0';		if (sscanf(buf, "%s %s", from, to) != 2)			continue;		Debug(20, 1, ("url_initLocalServers: READ URLTABLE: %s --> %s\n",			from, to));		x = (struct local_trans_table *)		    xmalloc(sizeof(struct local_trans_table));		x->from = xstrdup(from);		x->to = xstrdup(to);		x->next = LocalTransTable;		LocalTransTable = x;	}	fclose(fp);	xfree(from);	xfree(to);	xfree(buf);}void init_url(){	char *s;	if (init_called)		return;	init_called = 1;	liburl_sleep_time = 1;	/* hard-coded default */	if ((s = getenv("HARVEST_URL_DELAY")) != NULL)		liburl_sleep_time = atoi(s);	if (liburl_sleep_time < 0)		liburl_sleep_time = 1;	if ((s = getenv("HARVEST_GATHERER_DBS")) != NULL)		urldb_init(s);#ifdef USE_LOCAL_CACHE	if (use_local_cache)		init_cache();#endif#ifdef USE_CCACHE	url_initCache(10, 600);#endif	url_initLocalServers();}void url_purge(){	if (!init_called)		init_url();#ifdef USE_LOCAL_CACHE	if (use_local_cache)		expire_cache();#endif}void finish_url(){#ifdef USE_LOCAL_CACHE	if (use_local_cache)		finish_cache();#endif#ifdef USE_CCACHE	url_shutdowncache();#endif}/* *  url_open() - Parses and initializes 
the given url into a URL structure. *  Returns a pointer to the structure on success; or returns NULL if the *  URL is not parseable, or if the URL's host is not valid. */URL *url_open(url)     char *url;{	static URL *up = NULL;	static char buf[BUFSIZ];	struct local_trans_table *l;	char *s, *local_filename = NULL;	char local_filename_buf[1024];	struct stat sb;	Debug(20, 1, ("url_open: %s\n", url));	if (!init_called) {		init_url();	}	if ((up = url_parse(url)) == NULL) {		url_close(up);		return (NULL);	}	for (l = LocalTransTable; !local_filename && l; l = l->next) {		if (strchr(l->from, '*')) {			/* Do wildcard based mapping */			if (url_matchAndSub(l->from, up->url, l->to, local_filename_buf,				1024) == 0 ) {				Debug(20, 1, ("Local Mapping: '%s' matched '%s'\n", up->url,					l->from));				local_filename = (char *) xmalloc(strlen(local_filename_buf)+1);				strcpy(local_filename, local_filename_buf);				Debug(20, 1, ("Mapped to: '%s'\n", local_filename));			}		} else if (!strncasecmp(up->url, l->from, strlen(l->from))) {			Debug(20, 1, ("Local Mapping: '%s' matched '%s'\n",				up->url, l->from));			s = up->url + strlen(l->from);			local_filename = (char *) xmalloc(strlen(l->to) +				strlen(s) + 1);			sprintf(local_filename, "%s%s", l->to, s);		}		if (local_filename) {			int fd=-1;			/* no HTTP involved, so unescape URI */			rfc1738_unescape(local_filename);			/* expand tilde to homedir */			if (url_tildeExpand(local_filename, local_filename_buf,					    1024) == 0) {				xfree(local_filename);				local_filename = (char *)				  xmalloc(strlen(local_filename_buf)+1);				strcpy(local_filename, local_filename_buf);				Debug(20, 1, ("Tilde expanded to %s\n",					      local_filename));			}			/*			 *  Don't use the mapping if the file is unreadable,			 *  if fstat() fails, if it's a special file, or if			 *  it's executable.			 
*/			if (stat(local_filename, &sb) < 0 ||			    !S_ISREG(sb.st_mode) ||			    (sb.st_mode & S_IXUSR) ||			    (fd = open(local_filename, O_RDONLY, 0)) < 0) {				xfree(local_filename);				local_filename = NULL;			}			if (fd >= 0)				(void) close(fd);		}		/* Special hacks for news: URLs.  We want to change     */		/* news:comp.sex.html into                              */		/* /var/spool/nov/comp/sex/html/.overview               */		/* The local mapping should be:                         */		/*    news:overview     /var/spool/nov/                 */		if (!strncasecmp("news:overview", l->from, 13)) {			if (up->type == URL_NEWS && (strchr(up->url, '@') == 0)) {				int fd;				char *group_path;				local_filename = xmalloc(strlen(l->to) +				    strlen(up->pathname) + 20);				group_path = xstrdup(up->pathname);				for (s = group_path; *s; s++)	/* dots to slashes */					if (*s == '.')						*s = '/';				sprintf(local_filename, "%s%s/.overview",				    l->to, group_path);				xfree(group_path);				if ((fd = open(local_filename, O_RDONLY, 0)) < 0 ||				    fstat(fd, &sb) < 0 || !S_ISREG(sb.st_mode) ||				    (sb.st_mode & S_IXUSR)) {					xfree(local_filename);					local_filename = NULL;				}				if (fd >= 0)					(void) close(fd);			}		}	}	if (local_filename != NULL	    && (s = xstrdup(local_filename))) {		Debug(20, 1, ("url_open: Local Mapping succeeded: %s -> %s\n",			up->url, local_filename));		if (up->type == URL_HTTP)			up->http_mime_hdr = xstrdup("text/html");		up->filename = s;	/* point to the symlink */		up->shsafe_filename = shsafe_path(up->filename);		up->flags |= URL_FLAG_LOCAL_MAPPED;		xfree(local_filename);		local_filename = NULL;		return (up);	}	/* Type-specific additions to the URL */	Debug(20, 5, ("url_open: type=%d\n", up->type));	switch (up->type) {	case URL_FILE:		up->filename = xstrdup(up->pathname);		up->shsafe_filename = shsafe_path(up->filename);		break;	case URL_FTP:		/*      If no userinfo yet, see if we can get it from the		 * **   FTPAuth.cf file (which 
came from FTP-Auth: in		 * **   gatherer.cf.                                    */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -