📄 condition.c
字号:
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#include <netdb.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#ifdef HAVE_FNMATCH
#include <fnmatch.h>
#else
#include "fnmatch.h"
#endif

#include "url.h"
#include "condition.h"
#include "tools.h"
#include "uexit.h"
#include "re.h"
#include "dns.h"
#include "debugl.h"
#include "jsbind.h"
#include "tag_pattern.h"
#include "dlhash_tools.h"

/* Individual condition checks; each one is referenced from the       */
/* cond_type_info[] table below and defined elsewhere in this file.   */
static int cond_unsupported(url *, cond_info_t *);
static int cond_lmax(url *, cond_info_t *);
static int cond_dmax(url *, cond_info_t *);
static int cond_noftp(url *, cond_info_t *);
static int cond_nhttp(url *, cond_info_t *);
static int cond_nossl(url *, cond_info_t *);
static int cond_nogopher(url *, cond_info_t *);
static int cond_noftps(url *, cond_info_t *);
static int cond_nocgi(url *, cond_info_t *);
static int cond_asite(url *, cond_info_t *);
static int cond_dsite(url *, cond_info_t *);
static int cond_adomain(url *, cond_info_t *);
static int cond_ddomain(url *, cond_info_t *);
static int cond_aprefix(url *, cond_info_t *);
static int cond_dprefix(url *, cond_info_t *);
static int cond_asfx(url *, cond_info_t *);
static int cond_dsfx(url *, cond_info_t *);
static int cond_pattern(url *, cond_info_t *);
static int cond_rpattern(url *, cond_info_t *);
static int cond_skip_pattern(url *, cond_info_t *);
static int cond_skip_rpattern(url *, cond_info_t *);
static int cond_url_pattern(url *, cond_info_t *);
static int cond_url_rpattern(url *, cond_info_t *);
static int cond_skip_url_pattern(url *, cond_info_t *);
static int cond_skip_url_rpattern(url *, cond_info_t *);
static int cond_dont_leave_site(url *, cond_info_t *);
static int cond_dont_leave_dir(url *, cond_info_t *);
static int cond_user_condition(url *, cond_info_t *);
static int cond_aip_pattern(url *, cond_info_t *);
static int cond_dip_pattern(url *, cond_info_t *);
static int cond_site_level(url *, cond_info_t *);
static int cond_dont_leave_site_enter_dir(url *, cond_info_t *);
static int cond_leave_level(url *, cond_info_t *);
static int cond_aport(url *, cond_info_t *);
static int cond_dport(url *, cond_info_t *);
static int cond_max_size(url *, cond_info_t *);
static int cond_min_size(url *, cond_info_t *);
static int cond_amime_type(url *, cond_info_t *);
static int cond_dmime_type(url *, cond_info_t *);
static int cond_newer_than(url *, cond_info_t *);
static int cond_older_than(url *, cond_info_t *);
static int cond_tag_pattern(url *, cond_info_t *);
static int cond_tag_rpattern(url *, cond_info_t *);

/* Bit flags naming the check levels a condition takes part in; the   */
/* level being evaluated is turned into one of these bits with        */
/* (1 << condp->level) in url_append_condition_default().             */
#define CL0 (1 << 0)
#define CL1 (1 << 1)
#define CL2 (1 << 2)
#define CL3 (1 << 3)
#define CLALL (CL0 | CL1 | CL2 | CL3)

/* Descriptor of one condition type. */
struct cond_type_info_t
{
  cond_type_t type;                        /* condition identifier */
  char *name;                              /* name used for lookup and in debug output */
  int (*cond_func) (url *, cond_info_t *); /* check function */
  bool_t standard;                         /* TRUE: run by the generic loop of */
                                           /* url_append_condition_default()   */
  int level;                               /* mask of CLx bits */
};

/* NOTE(review): entries are looked up by name in cond_type_info_find() */
/* but are also indexed directly with CONDT_* values (DLMSG(CONDT_...), */
/* cond_type_info[CONDT_PATTERN]), so the order here presumably has to  */
/* mirror the cond_type_t enum - confirm against condition.h.           */
static const struct cond_type_info_t cond_type_info[] = {
  {CONDT_UNSUP, "unsupported", cond_unsupported, TRUE, CLALL},
  {CONDT_NOFTP, "-noftp", cond_noftp, TRUE, CL0 | CL2},
  {CONDT_NOHTTP, "-nohttp", cond_nhttp, TRUE, CL0 | CL2},
  {CONDT_NOSSL, "-nossl", cond_nossl, TRUE, CL0 | CL2},
  {CONDT_NOGOPHER, "-nogopher", cond_nogopher, TRUE, CL0 | CL2},
  {CONDT_NOFTPS, "-noftps", cond_noftps, TRUE, CL0 | CL2},
  {CONDT_NOCGI, "-nocgi", cond_nocgi, TRUE, CL0 | CL2},
  {CONDT_LMAX, "-lmax", cond_lmax, TRUE, CL0 | CL2},
  {CONDT_DMAX, "-dmax", cond_dmax, TRUE, CL1 | CL2},
  {CONDT_ASITE, "-asite", cond_asite, TRUE, CL0 | CL2},
  {CONDT_DSITE, "-dsite", cond_dsite, TRUE, CL0 | CL2},
  {CONDT_ADOMAIN, "-adomain", cond_adomain, TRUE, CL0 | CL2},
  {CONDT_DDOMAIN, "-ddomain", cond_ddomain, TRUE, CL0 | CL2},
  {CONDT_APREFIX, "-aprefix", cond_aprefix, TRUE, CL0 | CL2},
  {CONDT_DPREFIX, "-dprefix", cond_dprefix, TRUE, CL0 | CL2},
  {CONDT_ASFX, "-asfx", cond_asfx, TRUE, CL0 | CL2},
  {CONDT_DSFX, "-dsfx", cond_dsfx, TRUE, CL0 | CL2},
  {CONDT_DONT_LEAVE_SITE, "-dont_leave_site", cond_dont_leave_site, TRUE,
    CL0 | CL2},
  {CONDT_DONT_LEAVE_DIR, "-dont_leave_dir", cond_dont_leave_dir, TRUE,
    CL0 | CL2},
  {CONDT_SITE_LEVEL, "-site_level", cond_site_level, TRUE, CL0 | CL2},
  {CONDT_LEAVE_LEVEL, "-leave_level", cond_leave_level, TRUE, CL0 | CL2},
  {CONDT_DONT_LEAVE_SITE_ENTER_DIR, "-dont_leave_site_enter_dir",
    cond_dont_leave_site_enter_dir, TRUE, CL0 | CL2},
  {CONDT_APORTS, "-aport", cond_aport, TRUE, CL0 | CL2},
  {CONDT_DPORTS, "-dport", cond_dport, TRUE, CL0 | CL2},
  {CONDT_MAX_SIZE, "-max_size", cond_max_size, TRUE, CL3},
  {CONDT_MIN_SIZE, "-min_size", cond_min_size, TRUE, CL3},
  {CONDT_AMIME_TYPE, "-amimet", cond_amime_type, TRUE, CL3},
  {CONDT_DMIME_TYPE, "-dmimet", cond_dmime_type, TRUE, CL3},
  {CONDT_NEWER_THAN, "-newer_than", cond_newer_than, TRUE, CL3},
  {CONDT_OLDER_THAN, "-older_than", cond_older_than, TRUE, CL3},
  {CONDT_AIP_PATTERN, "-aip_pattern", cond_aip_pattern, TRUE, CL0 | CL2},
  {CONDT_DIP_PATTERN, "-dip_pattern", cond_dip_pattern, TRUE, CL0 | CL2},
  {CONDT_PATTERN, "-pattern", cond_pattern, FALSE, CL0 | CL2},
  {CONDT_RPATTERN, "-rpattern", cond_rpattern, FALSE, CL0 | CL2},
  {CONDT_SKIP_PATTERN, "-skip_pattern", cond_skip_pattern, FALSE, CL0 | CL2},
  {CONDT_SKIP_RPATTERN, "-skip_rpattern", cond_skip_rpattern, FALSE,
    CL0 | CL2},
  {CONDT_URL_PATTERN, "-url_pattern", cond_url_pattern, FALSE, CL0 | CL2},
  {CONDT_URL_RPATTERN, "-url_rpattern", cond_url_rpattern, FALSE, CL0 | CL2},
  {CONDT_SKIP_URL_PATTERN, "-skip_url_pattern", cond_skip_url_pattern, FALSE,
    CL0 | CL2},
  {CONDT_SKIP_URL_RPATTERN, "-skip_url_rpattern", cond_skip_url_rpattern,
    FALSE, CL0 | CL2},
  {CONDT_TAG_PATTERN, "-tag_pattern", cond_tag_pattern, TRUE, CL0},
  {CONDT_TAG_RPATTERN, "-tag_rpattern", cond_tag_rpattern, TRUE, CL0},
  {CONDT_USER_CONDITION, "-user_condition", cond_user_condition, FALSE,
    CL1 | CL3},
};

/********************************************************/
/* find the table entry whose name equals NAME          */
/* (compared case-insensitively); NULL when not found   */
/********************************************************/
static const struct cond_type_info_t *cond_type_info_find(char *name)
{
  int i;

  for(i = 0; i < NUM_ELEM(cond_type_info); i++)
  {
    if(!strcasecmp(cond_type_info[i].name, name))
      return &(cond_type_info[i]);
  }

  return NULL;
}

/********************************************************/
/* run the single condition called NAME on URLP         */
/* condp->reason is set to the condition type first     */
/* returns 1 when the check function returned TRUE,     */
/* 0 when it returned anything else,                    */
/* -1 when NAME is not a known condition                */
/********************************************************/
int url_append_one_condition(char *name, url * urlp, cond_info_t * condp)
{
  const struct cond_type_info_t *cond;

  if((cond = cond_type_info_find(name)))
  {
    condp->reason = cond->type;
    return (cond->cond_func(urlp, condp) == TRUE);
  }
  else
    return -1;
}

/* Report the failure of condition cond_type_info[i] through          */
/* DEBUG_LIMITS and store its type in condp->reason. The macro        */
/* expands to a braced block and uses the variables `urlp' and        */
/* `condp' of the function in which it is expanded.                   */
#define DLMSG(i) \
  { \
    char *urlstr = url_to_urlstr(urlp, FALSE); \
    DEBUG_LIMITS("Failed URL condition (%s) -> %s\n", cond_type_info[i].name, \
      urlstr); \
    _free(urlstr); \
    condp->reason = cond_type_info[i].type; \
  }

#ifdef HAVE_REGEX
/********************************************************/
/* evaluate the wildcard / regular expression pattern   */
/* conditions in four pairs (pattern, skip pattern,     */
/* URL pattern, skip URL pattern)                       */
/* when both members of a pair are configured the URL   */
/* is rejected only if both checks fail; when only one  */
/* is configured that one alone decides                 */
/* returns FALSE after reporting the failed condition   */
/* with DLMSG, TRUE otherwise                           */
/********************************************************/
static int url_append_condition_patterns_default(url * urlp,
  cond_info_t * condp)
{
  int pm1, pm2, pm3, pm4;

  pm1 = cond_pattern(urlp, condp);
  pm3 = cond_rpattern(urlp, condp);

  if(priv_cfg.condition.pattern && priv_cfg.condition.rpattern)
  {
    if(!pm1 && !pm3)
    {
      /* NOTE(review): !pm1 always holds here, so the else branch */
      /* is unreachable; same shape in the three pairs below.     */
      if(!pm1)
      {
        DLMSG(CONDT_PATTERN);
      }
      else
      {
        DLMSG(CONDT_RPATTERN);
      }
      return FALSE;
    }
  }
  else if(priv_cfg.condition.pattern)
  {
    if(!pm1)
    {
      DLMSG(CONDT_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.rpattern)
  {
    if(!pm3)
    {
      DLMSG(CONDT_RPATTERN);
      return FALSE;
    }
  }

  pm2 = cond_skip_pattern(urlp, condp);
  pm4 = cond_skip_rpattern(urlp, condp);

  if(priv_cfg.condition.skip_pattern && priv_cfg.condition.rskip_pattern)
  {
    if(!pm2 && !pm4)
    {
      if(!pm2)
      {
        DLMSG(CONDT_SKIP_PATTERN);
      }
      else
      {
        DLMSG(CONDT_SKIP_RPATTERN);
      }
      return FALSE;
    }
  }
  else if(priv_cfg.condition.skip_pattern)
  {
    if(!pm2)
    {
      DLMSG(CONDT_SKIP_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.rskip_pattern)
  {
    if(!pm4)
    {
      DLMSG(CONDT_SKIP_RPATTERN);
      return FALSE;
    }
  }

  pm1 = cond_url_pattern(urlp, condp);
  pm3 = cond_url_rpattern(urlp, condp);

  if(priv_cfg.condition.url_pattern && priv_cfg.condition.rurl_pattern)
  {
    if(!pm1 && !pm3)
    {
      if(!pm1)
      {
        DLMSG(CONDT_URL_PATTERN);
      }
      else
      {
        DLMSG(CONDT_URL_RPATTERN);
      }
      return FALSE;
    }
  }
  else if(priv_cfg.condition.url_pattern)
  {
    if(!pm1)
    {
      DLMSG(CONDT_URL_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.rurl_pattern)
  {
    if(!pm3)
    {
      DLMSG(CONDT_URL_RPATTERN);
      return FALSE;
    }
  }

  pm2 = cond_skip_url_pattern(urlp, condp);
  pm4 = cond_skip_url_rpattern(urlp, condp);

  if(priv_cfg.condition.skip_url_pattern &&
    priv_cfg.condition.rskip_url_pattern)
  {
    if(!pm2 && !pm4)
    {
      if(!pm2)
      {
        DLMSG(CONDT_SKIP_URL_PATTERN);
      }
      else
      {
        DLMSG(CONDT_SKIP_URL_RPATTERN);
      }
      return FALSE;
    }
  }
  else if(priv_cfg.condition.skip_url_pattern)
  {
    if(!pm2)
    {
      DLMSG(CONDT_SKIP_URL_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.rskip_url_pattern)
  {
    if(!pm4)
    {
      DLMSG(CONDT_SKIP_URL_RPATTERN);
      return FALSE;
    }
  }

  return TRUE;
}
#else
/********************************************************/
/* variant used without regular expression support:     */
/* only the four wildcard pattern conditions are        */
/* evaluated, each rejecting the URL when it is         */
/* configured and its check fails                       */
/********************************************************/
/* NOTE(review): this variant tests cfg.condition.* while the         */
/* HAVE_REGEX variant tests priv_cfg.condition.* - confirm which      */
/* structure really holds these fields.                               */
static int url_append_condition_patterns_default(url * urlp,
  cond_info_t * condp)
{
  int pm1;

  pm1 = cond_pattern(urlp, condp);
  if(cfg.condition.pattern && !pm1)
  {
    DLMSG(CONDT_PATTERN);
    return FALSE;
  }

  pm1 = cond_skip_pattern(urlp, condp);
  if(cfg.condition.skip_pattern && !pm1)
  {
    DLMSG(CONDT_SKIP_PATTERN);
    return FALSE;
  }

  pm1 = cond_url_pattern(urlp, condp);
  if(cfg.condition.url_pattern && !pm1)
  {
    DLMSG(CONDT_URL_PATTERN);
    return FALSE;
  }

  pm1 = cond_skip_url_pattern(urlp, condp);
  if(cfg.condition.skip_url_pattern && !pm1)
  {
    DLMSG(CONDT_SKIP_URL_PATTERN);
    return FALSE;
  }

  return TRUE;
}
#endif

/********************************************************/
/* check whether URL matches given limiting conditions  */
/* following default pavuk rules                        */
/********************************************************/
static int url_append_condition_default(url * urlp, cond_info_t * condp)
{
  int i;
  int level;

  /* bit of the level being checked, compared with the CLx masks */
  level = (1 << condp->level);

  /* inline objects pass unchecked unless limit_inlines is set */
  if(!cfg.condition.limit_inlines && (urlp->status & URL_INLINE_OBJ))
    return TRUE;

  /* every "standard" condition enabled for this level must hold */
  for(i = 0; i < NUM_ELEM(cond_type_info); i++)
  {
    if(cond_type_info[i].standard &&
      (cond_type_info[i].level & level) &&
      !cond_type_info[i].cond_func(urlp, condp))
    {
      DLMSG(i);
      return FALSE;
    }
  }

  /* the pattern conditions are not "standard"; they are evaluated */
  /* together, gated by the level mask of the -pattern entry       */
  if((cond_type_info[CONDT_PATTERN].level & level))
  {
    if(!url_append_condition_patterns_default(urlp, condp))
      return FALSE;
  }

  if((cond_type_info[CONDT_USER_CONDITION].level & level))
  {
    if(!cond_user_condition(urlp, condp))
    {
      DLMSG(CONDT_USER_CONDITION);
      return FALSE;
    }
  }

  return TRUE;
}

/********************************************************/
/* check whether URL matches given limiting conditions  */
/* with HAVE_MOZJS the result of                        */
/* pjs_run_cond_check_func() is used unless it is       */
/* negative, in which case the default rules decide     */
/********************************************************/
int url_append_condition(url * urlp, cond_info_t * condp)
{
#ifdef HAVE_MOZJS
  int rv;

  if(!cond_unsupported(urlp, condp))
    return FALSE;

  rv = pjs_run_cond_check_func(urlp, condp);

  return (rv < 0) ? url_append_condition_default(urlp, condp) : rv;
#else
  return url_append_condition_default(urlp, condp);
#endif
}

/********************************************************/
/* check whether site ends with one of the domains from */
/* the NULL terminated array l (case-insensitive)       */
/* this is a plain string suffix comparison, no check   */
/* for a '.' boundary is made                           */
/********************************************************/
static bool_t domain_condition(char *site, char **l)
{
  char **p = l;
  int sl = strlen(site);

  while(*p)
  {
    int dl = strlen(*p);

    /* an entry longer than the site name can not be its suffix */
    if(dl <= sl)
    {
      if(!strcasecmp(*p, site + sl - dl))
        return TRUE;
    }
    p++;
  }

  return FALSE;
}

/********************************************************/
/* check whether up has suffix sfx (case-sensitive)     */
/********************************************************/
static bool_t cmp_sfx(char *up, char *sfx)
{
  int nlen, slen;

  slen = strlen(sfx);
  nlen = strlen(up);

  if(nlen < slen)
    return FALSE;

  return (!strcmp(sfx, up + nlen - slen));
}

/********************************************************/
/* check whether the full path of the URL ends with one */
/* of the suffixes from the NULL terminated array l     */
/********************************************************/
static bool_t sfx_condition(url * urlr, char **l)
{
  char **pp = l;
  char *p = url_get_full_path(urlr);
  int rv = FALSE;

  while(*pp)
  {
    if(cmp_sfx(p, *pp))
    {
      rv = TRUE;
      break;
    }
    pp++;
  }

  _free(p);

  return rv;
}

/********************************************************/
/* check whether the full path of the URL starts with   */
/* one of the prefixes from the NULL terminated array l */
/********************************************************/
static bool_t prefix_condition(url * urlr, char **l)
{
  char **pp = l;
  char *p = url_get_full_path(urlr);
  int rv = FALSE;

  while(*pp)
  {
    if(!strncmp(*pp, p, strlen(*pp)))
    {
      rv = TRUE;
      break;
    }
    pp++;
  }

  _free(p);

  return rv;
}

/********************************************************/
/* compare string with the wildcard patterns from the   */
/* NULL terminated list                                 */
/* returns FALSE as soon as one pattern matches         */
/* (fnmatch() == 0) and also when pattern is NULL;      */
/* returns TRUE when the list is not NULL and none of   */
/* its patterns matches                                 */
/********************************************************/
/* NOTE(review): the previous header comment described this as        */
/* "matched by at least one wildcard pattern", which is the opposite  */
/* of the returned value - confirm the intended polarity against the  */
/* callers before changing either the code or this description.       */
static bool_t cmp_pattern(char *str, char **pattern)
{
  char **pp;

  for(pp = pattern; pp && *pp; pp++)
  {
    if(!fnmatch(*pp, str, 0))
      return FALSE;
  }

  return (pattern != NULL);
}

static bool_t cmp_dlpattern(char *str, dllist * pattern)
{
  for(; pattern; pattern = pattern->next)
  {
    if(!fnmatch((char *) pattern->data, str, 0))
      return FALSE;
  }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -