📄 filter.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "$Id: filter.c,v 2.3 2000/02/03 12:45:56 sxw Exp $";/* *  filter.c - RootNode URL enumerator filter support * *  DEBUG: section  44, level 1, 5      Gatherer enumeration filter routines *  AUTHOR: Harvest derived * *  Harvest Indexer http://www.tardis.ed.ac.uk/harvest/ *  --------------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail harvest@tardis.ed.ac.uk if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. 
*          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. 
* *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. * */#ifndef USE_POSIX_REGEX#define USE_POSIX_REGEX		/* put before includes; always use POSIX it */#endif#include <stdio.h>#include <string.h>#include <stdlib.h>#include "util.h"#include "url.h"#include "filter.h"#define LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"#define NUMBERS "0123456789"static void init_filterre();static char *host_dotaddr = 0;char *Filter_Type_Name[] ={    "Allowed",    "Denied",    "Denied Host",    "Denied URL",    "Denied Scheme",    "Unknown",};/* *  filter_selection() - Returns non-zero if the enumerator should NOT *  follow the URL; othwerwise returns 0; */int filter_selection(up)     URL *up;{    int r = Filter_ALLOW;    char *hostport = NULL;    Host *h = NULL;    if (host_filter != NULL && nhost_filter > 0) {        if ((h = get_host(up->host))) {/*    waw be sure we get dot address AND port! 
*/            host_dotaddr = xmalloc(strlen(h->dotaddr) + 10);            sprintf(host_dotaddr, "%s:%d", h->dotaddr, up->port);        }	else	    host_dotaddr = (char *) 0;	if ((up->type == URL_HTTP) || (up->type == URL_FTP) ||	    (up->type == URL_GOPHER)) {	    hostport = xmalloc(strlen(up->host) + 10);	    sprintf(hostport, "%s:%d", up->host, up->port);	    if (filter_match(hostport, host_filter, nhost_filter))		r = Filter_DENY_HOST;	    xfree(hostport);	} else {	    if (filter_match(up->host, host_filter, nhost_filter))		r = Filter_DENY_HOST;	}	if (host_dotaddr)	    xfree(host_dotaddr);    }    if (r == Filter_ALLOW && url_filter != NULL && nurl_filter > 0)	if (filter_match(up->pathname, url_filter, nurl_filter))	    r = Filter_DENY_URL;    if (r == Filter_ALLOW && access_mask != 0)	if (!(1 << up->type & access_mask))	    r = Filter_DENY_ACCESS;    Debug(44, 1, ("filter_selection: returning '%s' for %s\n",	    r ? "DON'T FOLLOW" : "FOLLOW", up->url));    return (r);}/* *  filter_match() - Returns non-zero if the enumerator should NOT *  follow the URL; othwerwise returns 0; */int filter_match(data, tbl, ntbl)     char *data;     struct filter_regex *tbl;     int ntbl;{    int i;    char *olddata = data;    if (tbl == NULL || ntbl < 1 || data == NULL)	return 0;    for (i = 0; i < ntbl; i++) {	if (tbl[i].filtertype == Filter_UNKNOWN)	    continue;	/* hack: match host dot address if this is the host filter */	/* and the pattern contains no letters, but some digits    */	if ((tbl == host_filter) &&	    (strpbrk(tbl[i].pattern, LETTERS) == (char *) 0) &&	    (strpbrk(tbl[i].pattern, NUMBERS) != (char *) 0) &&	    (host_dotaddr != (char *) NULL))	    data = host_dotaddr;	if (do_match(data, tbl[i].compiled_pattern)) {	    Debug(44, 5, ("filter_match: '%s' matches expression '%s'\n", data, tbl[i].pattern));	    Debug(44, 5, ("filter_match: Returning '%s'\n", tbl[i].filtertype == Filter_DENY ? 
"DENY" : "ALLOW"));	    if (tbl[i].filtertype == Filter_DENY)		return 1;	    if (tbl[i].filtertype == Filter_ALLOW)		return 0;	    return 0;	}	data = olddata;    }    return 0;}/* *  filter_initialize() - Initializes the RE-based candidate selection. */void filter_initialize(){    int i;    char *t;    char *s;    host_filter = url_filter = NULL;    nhost_filter = nurl_filter = 0;    if (host_filterfile != NULL) {	host_filter = xmalloc(MAX_FILTERS * sizeof(struct filter_regex));	for (i = 0; i < MAX_FILTERS; i++)	    host_filter[i].filtertype = Filter_UNKNOWN;	init_filterre(host_filterfile, host_filter, &nhost_filter);    }    if (url_filterfile != NULL) {	url_filter = xmalloc(MAX_FILTERS * sizeof(struct filter_regex));	for (i = 0; i < MAX_FILTERS; i++)	    url_filter[i].filtertype = Filter_UNKNOWN;	init_filterre(url_filterfile, url_filter, &nurl_filter);    }    access_mask = 0;    if (access_types != NULL) {	t = strdup(access_types);	for (s = strtok(t, "|"); s; s = strtok(0, "|")) {	    if (!strcasecmp(s, "FILE"))		access_mask |= 1 << URL_FILE;	    if (!strcasecmp(s, "FTP"))		access_mask |= 1 << URL_FTP;	    if (!strcasecmp(s, "GOPHER"))		access_mask |= 1 << URL_GOPHER;	    if (!strcasecmp(s, "HTTP"))		access_mask |= 1 << URL_HTTP;	    if (!strcasecmp(s, "NEWS"))		access_mask |= 1 << URL_NEWS;	    if (!strcasecmp(s, "TELNET"))		access_mask |= 1 << URL_TELNET;	    if (!strcasecmp(s, "WAIS"))		access_mask |= 1 << URL_WAIS;	}	xfree(t);    }}/* *  init_filterre() - Initializes the given type_regex array with the regular *  expressions from filename.  Returns 0 on success; non-zero otherwise. 
* *  File format looks like: *      # comment *      Allow   Regular-Expression *      Deny    Regular-Expression */static void init_filterre(filename, t, nt)     char *filename;     struct filter_regex *t;     int *nt;{    FILE *fp = NULL;    char *s = NULL;    int ret;    static char buf[BUFSIZ];    static char pat[BUFSIZ];    static char what[BUFSIZ];    if ((fp = fopen(filename, "r")) == NULL) {	log_errno(filename);	return;    }    while (fgets(buf, BUFSIZ, fp) != NULL) {	if (buf[0] == '#' || buf[0] == '\n')	    continue;	if ((s = strrchr(buf, '\n')) != NULL)	    *s = '\0';	for (s = &buf[0]; isspace((int) *s); s++);	if (sscanf(s, "%s %s", what, pat) != 2) {	    errorlog("Ignoring in %s: %s\n", filename, buf);	    continue;	}	if (!strcasecmp(what, "allow"))	    t[*nt].filtertype = Filter_ALLOW;	else if (!strcasecmp(what, "deny"))	    t[*nt].filtertype = Filter_DENY;	else {	    t[*nt].filtertype = Filter_UNKNOWN;	    errorlog("%s: Unknown filter directive: %s\n",		filename, what);	}	t[*nt].pattern = strdup(pat);	ret = regcomp(&t[*nt].compiled_pattern, t[*nt].pattern,	    USE_RE_SYNTAX);	if (ret != 0) {	    errorlog("Could not compile regular expression: %s",		t[*nt].pattern);	    xfree(t[*nt].pattern);	    t[*nt].pattern = NULL;	    t[*nt].filtertype = Filter_UNKNOWN;	    continue;	}	if (++(*nt) >= MAX_FILTERS) {	    Log("WARNING: %s has too many types.\n", filename);	    break;	}    }    fclose(fp);}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -