📄 filter.c
字号:
static char rcsid[] = "$Id: filter.c,v 2.3 2000/02/03 12:45:56 sxw Exp $";/* * filter.c - RootNode URL enumerator filter support * * DEBUG: section 44, level 1, 5 Gatherer enumeration filter routines * AUTHOR: Harvest derived * * Harvest Indexer http://www.tardis.ed.ac.uk/harvest/ * --------------------------------------------------- * * The Harvest Indexer is a continued development of code developed by * the Harvest Project. Development is carried out by numerous individuals * in the Internet community, and is not officially connected with the * original Harvest Project or its funding sources. * * Please mail harvest@tardis.ed.ac.uk if you are interested in participating * in the development effort. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* ---------------------------------------------------------------------- * Copyright (c) 1994, 1995. All rights reserved. * * The Harvest software was developed by the Internet Research Task * Force Research Group on Resource Discovery (IRTF-RD): * * Mic Bowman of Transarc Corporation. * Peter Danzig of the University of Southern California. * Darren R. Hardy of the University of Colorado at Boulder. * Udi Manber of the University of Arizona. * Michael F. Schwartz of the University of Colorado at Boulder. * Duane Wessels of the University of Colorado at Boulder. * * This copyright notice applies to software in the Harvest * ``src/'' directory only. Users should consult the individual * copyright notices in the ``components/'' subdirectories for * copyright information about other software bundled with the * Harvest source code distribution. * * TERMS OF USE * * The Harvest software may be used and re-distributed without * charge, provided that the software origin and research team are * cited in any use of the system. Most commonly this is * accomplished by including a link to the Harvest Home Page * (http://harvest.cs.colorado.edu/) from the query page of any * Broker you deploy, as well as in the query result pages. These * links are generated automatically by the standard Broker * software distribution. * * The Harvest software is provided ``as is'', without express or * implied warranty, and with no support nor obligation to assist * in its use, correction, modification or enhancement. We assume * no liability with respect to the infringement of copyrights, * trade secrets, or any patents, and are not responsible for * consequential damages. Proper use of the Harvest software is * entirely the responsibility of the user. * * DERIVATIVE WORKS * * Users may make derivative works from the Harvest software, subject * to the following constraints: * * - You must include the above copyright notice and these * accompanying paragraphs in all forms of derivative works, * and any documentation and other materials related to such * distribution and use acknowledge that the software was * developed at the above institutions. * * - You must notify IRTF-RD regarding your distribution of * the derivative work. * * - You must clearly notify users that your are distributing * a modified version and not the original Harvest software. * * - Any derivative product is also subject to these copyright * and use restrictions. * * Note that the Harvest software is NOT in the public domain. We * retain copyright, as specified above. * * HISTORY OF FREE SOFTWARE STATUS * * Originally we required sites to license the software in cases * where they were going to build commercial products/services * around Harvest. In June 1995 we changed this policy. We now * allow people to use the core Harvest software (the code found in * the Harvest ``src/'' directory) for free. We made this change * in the interest of encouraging the widest possible deployment of * the technology. The Harvest software is really a reference * implementation of a set of protocols and formats, some of which * we intend to standardize. We encourage commercial * re-implementations of code complying to this set of standards. * */#ifndef USE_POSIX_REGEX#define USE_POSIX_REGEX /* put before includes; always use POSIX it */#endif#include <stdio.h>#include <string.h>#include <stdlib.h>#include "util.h"#include "url.h"#include "filter.h"#define LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"#define NUMBERS "0123456789"static void init_filterre();static char *host_dotaddr = 0;char *Filter_Type_Name[] ={ "Allowed", "Denied", "Denied Host", "Denied URL", "Denied Scheme", "Unknown",};/* * filter_selection() - Returns non-zero if the enumerator should NOT * follow the URL; othwerwise returns 0; */int filter_selection(up) URL *up;{ int r = Filter_ALLOW; char *hostport = NULL; Host *h = NULL; if (host_filter != NULL && nhost_filter > 0) { if ((h = get_host(up->host))) {/* waw be sure we get dot address AND port! */ host_dotaddr = xmalloc(strlen(h->dotaddr) + 10); sprintf(host_dotaddr, "%s:%d", h->dotaddr, up->port); } else host_dotaddr = (char *) 0; if ((up->type == URL_HTTP) || (up->type == URL_FTP) || (up->type == URL_GOPHER)) { hostport = xmalloc(strlen(up->host) + 10); sprintf(hostport, "%s:%d", up->host, up->port); if (filter_match(hostport, host_filter, nhost_filter)) r = Filter_DENY_HOST; xfree(hostport); } else { if (filter_match(up->host, host_filter, nhost_filter)) r = Filter_DENY_HOST; } if (host_dotaddr) xfree(host_dotaddr); } if (r == Filter_ALLOW && url_filter != NULL && nurl_filter > 0) if (filter_match(up->pathname, url_filter, nurl_filter)) r = Filter_DENY_URL; if (r == Filter_ALLOW && access_mask != 0) if (!(1 << up->type & access_mask)) r = Filter_DENY_ACCESS; Debug(44, 1, ("filter_selection: returning '%s' for %s\n", r ? "DON'T FOLLOW" : "FOLLOW", up->url)); return (r);}/* * filter_match() - Returns non-zero if the enumerator should NOT * follow the URL; othwerwise returns 0; */int filter_match(data, tbl, ntbl) char *data; struct filter_regex *tbl; int ntbl;{ int i; char *olddata = data; if (tbl == NULL || ntbl < 1 || data == NULL) return 0; for (i = 0; i < ntbl; i++) { if (tbl[i].filtertype == Filter_UNKNOWN) continue; /* hack: match host dot address if this is the host filter */ /* and the pattern contains no letters, but some digits */ if ((tbl == host_filter) && (strpbrk(tbl[i].pattern, LETTERS) == (char *) 0) && (strpbrk(tbl[i].pattern, NUMBERS) != (char *) 0) && (host_dotaddr != (char *) NULL)) data = host_dotaddr; if (do_match(data, tbl[i].compiled_pattern)) { Debug(44, 5, ("filter_match: '%s' matches expression '%s'\n", data, tbl[i].pattern)); Debug(44, 5, ("filter_match: Returning '%s'\n", tbl[i].filtertype == Filter_DENY ? "DENY" : "ALLOW")); if (tbl[i].filtertype == Filter_DENY) return 1; if (tbl[i].filtertype == Filter_ALLOW) return 0; return 0; } data = olddata; } return 0;}/* * filter_initialize() - Initializes the RE-based candidate selection. */void filter_initialize(){ int i; char *t; char *s; host_filter = url_filter = NULL; nhost_filter = nurl_filter = 0; if (host_filterfile != NULL) { host_filter = xmalloc(MAX_FILTERS * sizeof(struct filter_regex)); for (i = 0; i < MAX_FILTERS; i++) host_filter[i].filtertype = Filter_UNKNOWN; init_filterre(host_filterfile, host_filter, &nhost_filter); } if (url_filterfile != NULL) { url_filter = xmalloc(MAX_FILTERS * sizeof(struct filter_regex)); for (i = 0; i < MAX_FILTERS; i++) url_filter[i].filtertype = Filter_UNKNOWN; init_filterre(url_filterfile, url_filter, &nurl_filter); } access_mask = 0; if (access_types != NULL) { t = strdup(access_types); for (s = strtok(t, "|"); s; s = strtok(0, "|")) { if (!strcasecmp(s, "FILE")) access_mask |= 1 << URL_FILE; if (!strcasecmp(s, "FTP")) access_mask |= 1 << URL_FTP; if (!strcasecmp(s, "GOPHER")) access_mask |= 1 << URL_GOPHER; if (!strcasecmp(s, "HTTP")) access_mask |= 1 << URL_HTTP; if (!strcasecmp(s, "NEWS")) access_mask |= 1 << URL_NEWS; if (!strcasecmp(s, "TELNET")) access_mask |= 1 << URL_TELNET; if (!strcasecmp(s, "WAIS")) access_mask |= 1 << URL_WAIS; } xfree(t); }}/* * init_filterre() - Initializes the given type_regex array with the regular * expressions from filename. Returns 0 on success; non-zero otherwise. * * File format looks like: * # comment * Allow Regular-Expression * Deny Regular-Expression */static void init_filterre(filename, t, nt) char *filename; struct filter_regex *t; int *nt;{ FILE *fp = NULL; char *s = NULL; int ret; static char buf[BUFSIZ]; static char pat[BUFSIZ]; static char what[BUFSIZ]; if ((fp = fopen(filename, "r")) == NULL) { log_errno(filename); return; } while (fgets(buf, BUFSIZ, fp) != NULL) { if (buf[0] == '#' || buf[0] == '\n') continue; if ((s = strrchr(buf, '\n')) != NULL) *s = '\0'; for (s = &buf[0]; isspace((int) *s); s++); if (sscanf(s, "%s %s", what, pat) != 2) { errorlog("Ignoring in %s: %s\n", filename, buf); continue; } if (!strcasecmp(what, "allow")) t[*nt].filtertype = Filter_ALLOW; else if (!strcasecmp(what, "deny")) t[*nt].filtertype = Filter_DENY; else { t[*nt].filtertype = Filter_UNKNOWN; errorlog("%s: Unknown filter directive: %s\n", filename, what); } t[*nt].pattern = strdup(pat); ret = regcomp(&t[*nt].compiled_pattern, t[*nt].pattern, USE_RE_SYNTAX); if (ret != 0) { errorlog("Could not compile regular expression: %s", t[*nt].pattern); xfree(t[*nt].pattern); t[*nt].pattern = NULL; t[*nt].filtertype = Filter_UNKNOWN; continue; } if (++(*nt) >= MAX_FILTERS) { Log("WARNING: %s has too many types.\n", filename); break; } } fclose(fp);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -