📄 url.c
字号:
static char rcsid[] = "$Id: url.c,v 2.3 2000/01/21 17:37:33 sxw Exp $";/* * url.c - URL processing code * * DEBUG: section 20, level 1 Common liburl URL processing * AUTHOR: Harvest derived * * Harvest Indexer http://harvest.sourceforge.net/ * ----------------------------------------------- * * The Harvest Indexer is a continued development of code developed by * the Harvest Project. Development is carried out by numerous individuals * in the Internet community, and is not officially connected with the * original Harvest Project or its funding sources. * * Please mail lee@arco.de if you are interested in participating * in the development effort. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* ---------------------------------------------------------------------- * Copyright (c) 1994, 1995. All rights reserved. * * The Harvest software was developed by the Internet Research Task * Force Research Group on Resource Discovery (IRTF-RD): * * Mic Bowman of Transarc Corporation. * Peter Danzig of the University of Southern California. * Darren R. Hardy of the University of Colorado at Boulder. * Udi Manber of the University of Arizona. * Michael F. Schwartz of the University of Colorado at Boulder. * Duane Wessels of the University of Colorado at Boulder. * * This copyright notice applies to software in the Harvest * ``src/'' directory only. Users should consult the individual * copyright notices in the ``components/'' subdirectories for * copyright information about other software bundled with the * Harvest source code distribution. * * TERMS OF USE * * The Harvest software may be used and re-distributed without * charge, provided that the software origin and research team are * cited in any use of the system. Most commonly this is * accomplished by including a link to the Harvest Home Page * (http://harvest.cs.colorado.edu/) from the query page of any * Broker you deploy, as well as in the query result pages. These * links are generated automatically by the standard Broker * software distribution. * * The Harvest software is provided ``as is'', without express or * implied warranty, and with no support nor obligation to assist * in its use, correction, modification or enhancement. We assume * no liability with respect to the infringement of copyrights, * trade secrets, or any patents, and are not responsible for * consequential damages. Proper use of the Harvest software is * entirely the responsibility of the user. * * DERIVATIVE WORKS * * Users may make derivative works from the Harvest software, subject * to the following constraints: * * - You must include the above copyright notice and these * accompanying paragraphs in all forms of derivative works, * and any documentation and other materials related to such * distribution and use acknowledge that the software was * developed at the above institutions. * * - You must notify IRTF-RD regarding your distribution of * the derivative work. * * - You must clearly notify users that your are distributing * a modified version and not the original Harvest software. * * - Any derivative product is also subject to these copyright * and use restrictions. * * Note that the Harvest software is NOT in the public domain. We * retain copyright, as specified above. * * HISTORY OF FREE SOFTWARE STATUS * * Originally we required sites to license the software in cases * where they were going to build commercial products/services * around Harvest. In June 1995 we changed this policy. We now * allow people to use the core Harvest software (the code found in * the Harvest ``src/'' directory) for free. We made this change * in the interest of encouraging the widest possible deployment of * the technology. The Harvest software is really a reference * implementation of a set of protocols and formats, some of which * we intend to standardize. We encourage commercial * re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <string.h>#include <stdlib.h>#include <unistd.h>#include <ctype.h>#include <sys/socket.h>#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#include <netinet/in.h>#include <arpa/inet.h>#include <netdb.h>#include <errno.h>#include "util.h"#include "url.h"#ifdef USE_CCACHE#include "ccache.h"#endif#define BIG_BUFSIZ (BUFSIZ<<3)/* Global variables */int liburl_conform_rfc1738 = 0;int liburl_sleep_time;#ifdef USE_LOCAL_CACHEint use_local_cache = 1;#elseint use_local_cache = 0;#endif/* Local Functions */static void Tolower();static void remove_dot();static void remove_dotdot();static URL *url_parse();static char *shsafe_path();static void get_lmt();#ifdef OLD_CODEstatic int compare_fullhost();#endif/* NOTE these rely on the order of 'enum url_types' in ../include/url.h */struct _url_table url_table[] = { { "unknown", 0, 0 }, /* URL_UNKNOWN, */ { "file", 0, 0 }, /* URL_FILE, */ { "ftp", 21, ftp_get }, /* URL_FTP, */ { "gopher", 70, gopher_get }, /* URL_GOPHER, */ { "http", 80, http_get }, /* URL_HTTP, */ { "news", 119, news_get }, /* URL_NEWS, */ { "nop", 0, 0 }, /* URL_NOP, */ { "telnet", 25, 0 }, /* URL_TELNET, */ { "wais", 0, 0 }, /* URL_WAIS, */ { "x-", 0, 0 }, /* URL_X, */ { "mailto", 0, 0 }, /* URL_MAILTO, */};static int init_called = 0;struct local_trans_table { char *from; char *to; struct local_trans_table *next;};static struct local_trans_table *LocalTransTable = NULL;void url_initLocalServers(){ FILE *fp = NULL; char *from = NULL; char *to = NULL; char *t = NULL; char *buf = NULL; struct local_trans_table *x; LocalTransTable = NULL; if ((t = getenv("HARVEST_URL_LOCAL_MAPPINGS")) == NULL) return; Debug(20, 1, ("url_initLocalServers: OPEN URLTABLE: %s\n", t)); if ((fp = fopen(t, "r")) == NULL) return; from = xmalloc(BUFSIZ); to = xmalloc(BUFSIZ); buf = xmalloc(BUFSIZ); while (fgets(buf, BUFSIZ, fp)) { if ((t = strchr(buf, '\n'))) *t = '\0'; if (sscanf(buf, "%s %s", from, to) != 2) continue; Debug(20, 1, ("url_initLocalServers: READ URLTABLE: %s --> %s\n", from, to)); x = (struct local_trans_table *) xmalloc(sizeof(struct local_trans_table)); x->from = xstrdup(from); x->to = xstrdup(to); x->next = LocalTransTable; LocalTransTable = x; } fclose(fp); xfree(from); xfree(to); xfree(buf);}void init_url(){ char *s; if (init_called) return; init_called = 1; liburl_sleep_time = 1; /* hard-coded default */ if ((s = getenv("HARVEST_URL_DELAY")) != NULL) liburl_sleep_time = atoi(s); if (liburl_sleep_time < 0) liburl_sleep_time = 1; if ((s = getenv("HARVEST_GATHERER_DBS")) != NULL) urldb_init(s);#ifdef USE_LOCAL_CACHE if (use_local_cache) init_cache();#endif#ifdef USE_CCACHE url_initCache(10, 600);#endif url_initLocalServers();}void url_purge(){ if (!init_called) init_url();#ifdef USE_LOCAL_CACHE if (use_local_cache) expire_cache();#endif}void finish_url(){#ifdef USE_LOCAL_CACHE if (use_local_cache) finish_cache();#endif#ifdef USE_CCACHE url_shutdowncache();#endif}/* * url_open() - Parses and initializes the given url into a URL structure. * Returns a pointer to the structure on success; or returns NULL if the * URL is not parseable, or if the URL's host is not valid. */URL *url_open(url) char *url;{ static URL *up = NULL; static char buf[BUFSIZ]; struct local_trans_table *l; char *s, *local_filename = NULL; char local_filename_buf[1024]; struct stat sb; Debug(20, 1, ("url_open: %s\n", url)); if (!init_called) { init_url(); } if ((up = url_parse(url)) == NULL) { url_close(up); return (NULL); } for (l = LocalTransTable; !local_filename && l; l = l->next) { if (strchr(l->from, '*')) { /* Do wildcard based mapping */ if (url_matchAndSub(l->from, up->url, l->to, local_filename_buf, 1024) == 0 ) { Debug(20, 1, ("Local Mapping: '%s' matched '%s'\n", up->url, l->from)); local_filename = (char *) xmalloc(strlen(local_filename_buf)+1); strcpy(local_filename, local_filename_buf); Debug(20, 1, ("Mapped to: '%s'\n", local_filename)); } } else if (!strncasecmp(up->url, l->from, strlen(l->from))) { Debug(20, 1, ("Local Mapping: '%s' matched '%s'\n", up->url, l->from)); s = up->url + strlen(l->from); local_filename = (char *) xmalloc(strlen(l->to) + strlen(s) + 1); sprintf(local_filename, "%s%s", l->to, s); } if (local_filename) { int fd=-1; /* no HTTP involved, so unescape URI */ rfc1738_unescape(local_filename); /* expand tilde to homedir */ if (url_tildeExpand(local_filename, local_filename_buf, 1024) == 0) { xfree(local_filename); local_filename = (char *) xmalloc(strlen(local_filename_buf)+1); strcpy(local_filename, local_filename_buf); Debug(20, 1, ("Tilde expanded to %s\n", local_filename)); } /* * Don't use the mapping if the file is unreadable, * if fstat() fails, if it's a special file, or if * it's executable. */ if (stat(local_filename, &sb) < 0 || !S_ISREG(sb.st_mode) || (sb.st_mode & S_IXUSR) || (fd = open(local_filename, O_RDONLY, 0)) < 0) { xfree(local_filename); local_filename = NULL; } if (fd >= 0) (void) close(fd); } /* Special hacks for news: URLs. We want to change */ /* news:comp.sex.html into */ /* /var/spool/nov/comp/sex/html/.overview */ /* The local mapping should be: */ /* news:overview /var/spool/nov/ */ if (!strncasecmp("news:overview", l->from, 13)) { if (up->type == URL_NEWS && (strchr(up->url, '@') == 0)) { int fd; char *group_path; local_filename = xmalloc(strlen(l->to) + strlen(up->pathname) + 20); group_path = xstrdup(up->pathname); for (s = group_path; *s; s++) /* dots to slashes */ if (*s == '.') *s = '/'; sprintf(local_filename, "%s%s/.overview", l->to, group_path); xfree(group_path); if ((fd = open(local_filename, O_RDONLY, 0)) < 0 || fstat(fd, &sb) < 0 || !S_ISREG(sb.st_mode) || (sb.st_mode & S_IXUSR)) { xfree(local_filename); local_filename = NULL; } if (fd >= 0) (void) close(fd); } } } if (local_filename != NULL && (s = xstrdup(local_filename))) { Debug(20, 1, ("url_open: Local Mapping succeeded: %s -> %s\n", up->url, local_filename)); if (up->type == URL_HTTP) up->http_mime_hdr = xstrdup("text/html"); up->filename = s; /* point to the symlink */ up->shsafe_filename = shsafe_path(up->filename); up->flags |= URL_FLAG_LOCAL_MAPPED; xfree(local_filename); local_filename = NULL; return (up); } /* Type-specific additions to the URL */ Debug(20, 5, ("url_open: type=%d\n", up->type)); switch (up->type) { case URL_FILE: up->filename = xstrdup(up->pathname); up->shsafe_filename = shsafe_path(up->filename); break; case URL_FTP: /* If no userinfo yet, see if we can get it from the * ** FTPAuth.cf file (which came from FTP-Auth: in * ** gatherer.cf. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -