📄 robots-txt.c
字号:
static char rcsid[]="$Id: robots-txt.c,v 2.5 1997/11/21 17:40:55 sxw Exp $";/* * DEBUG: section 48, level 1, 5 Gatherer enumeration robots.txt stuff * * Harvest Indexer http://www.tardis.ed.ac.uk/harvest/ * --------------------------------------------------- * * The Harvest Indexer is a continued development of code developed by * the Harvest Project. Development is carried out by numerous individuals * in the Internet community, and is not officially connected with the * original Harvest Project or its funding sources. * * Please mail harvest@tardis.ed.ac.uk if you are interested in participating * in the development effort. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */#include <stdlib.h>#include <unistd.h>#include "util.h"#include "url.h"#define ROBOTS_TXT_DISALLOW 0#define ROBOTS_TXT_ALLOW 1typedef struct _word_list { char *word; struct _word_list *next;} word_list;typedef struct _record { word_list *user_agent; word_list *disallow; struct _record *next;} record;typedef struct _robots_txt { char *server; record *record_list; struct _robots_txt *next;} robots_txt;robots_txt *RobotsTxtHead = NULL;robots_txt **RobotsTxtTail = &RobotsTxtHead;static robots_txt *RobotsTxtFindServer _PARAMS((char *));static record *RobotsTxtFindRecord _PARAMS((robots_txt *, char *));static robots_txt *RobotsTxtLoad _PARAMS((char *, int));int RobotsTxtCheck _PARAMS((URL * up));static char *UA_prefix = "Harvest ";static char *this_UA = NULL;/* Match the pattern from the robots.txt file against the URL we're trying * to access. According to the latest robots.txt spec case IS significant and * we shouldn't do RE's. */static int pattern_match(pattern, path) char *pattern; char *path;{ if (!strncmp(pattern, path, strlen(pattern))) return 1; return 0;}int RobotsTxtCheck(up) URL *up;{ robots_txt *R = NULL; record *Q = NULL; word_list *W = NULL; static char server[BUFSIZ]; char *user_UA; if (up == (URL *) NULL) { errorlog("RobotsTxtCheck: NULL URL\n"); return ROBOTS_TXT_DISALLOW; } Debug(48, 1, ("RobotsTxtCheck: URL %s\n", up->url)); if (up->port == url_table[up->type].port) { sprintf(server, "%s://%s", url_table[up->type].scheme, up->host); } else { sprintf(server, "%s://%s:%d", url_table[up->type].scheme, up->host, up->port); } R = RobotsTxtFindServer(server); if (R == NULL) R = RobotsTxtLoad(server, up->type); /* Can't find a robots.txt file for this server, assume its okay * to visit */ if (R == NULL) return ROBOTS_TXT_ALLOW; Debug(48, 5, ("RobotsTxtCheck: Found data for server %s\n", server)); /* Generate our UA string - if we haven't already done so * This ensures that "Harvest" always appears at the start of the UA * string - meaning that we're always identifiable */ if (this_UA==NULL) { user_UA=getenv("HARVEST_USER_AGENT"); if (user_UA==NULL) this_UA=UA_prefix; else { this_UA=xmalloc(strlen(UA_prefix)+strlen(user_UA)+1); strcpy(this_UA,UA_prefix); strcat(this_UA,user_UA); } } Q = RobotsTxtFindRecord(R, this_UA); if (Q == NULL) { Debug(48, 1, ("RobotsTxtCheck: No match for UA '%s'\n", this_UA)); Debug(48, 1, ("RobotsTxtCheck: Returning ROBOTS_TXT_ALLOW\n")); return ROBOTS_TXT_ALLOW; } Debug(48, 5, ("RobotsTxtCheck: Found record for UA '%s'\n", this_UA)); for (W = Q->disallow; W; W = W->next) { Debug(48, 5, ("RobotsTxtCheck: Pattern %s\n", W->word)); if ((W->word !=NULL) && (pattern_match(W->word, up->raw_pathname))) { Debug(48, 1, ("RobotsTxtCheck: Matched '%s'\n", W->word)); Debug(48, 1, ("RobotsTxtCheck: Returning ROBOTS_TXT_DISALLOW\n")); return ROBOTS_TXT_DISALLOW; } } Debug(48, 1, ("RobotsTxtCheck: No matches.\n")); Debug(48, 1, ("RobotsTxtCheck: Returning ROBOTS_TXT_ALLOW\n")); return ROBOTS_TXT_ALLOW;}static robots_txt *RobotsTxtLoad(server, type) char *server; int type;{ robots_txt *R = NULL; record *Q = NULL; record **QT = NULL; word_list *W = NULL; word_list **UAWLT = NULL; word_list **DAWLT = NULL; URL *up = NULL; static char url[BUFSIZ]; static char buf[256]; char *t = NULL; enum { none, user_agent, disallow } lastline = none; if (server == (char *) NULL) { errorlog("RobotsTxtLoad: NULL server!\n"); return NULL; } R = (robots_txt *) xmalloc(sizeof(robots_txt)); R->server = xstrdup(server); *RobotsTxtTail = R; RobotsTxtTail = &R->next; switch (type) { case URL_HTTP: case URL_FTP: sprintf(url, "%s/robots.txt", server); break; case URL_GOPHER: sprintf(url, "%s/00/robots.txt", server); break; default: return R; /* NOTREACHED */ } if ((up = url_open(url)) == (URL *) NULL) { Debug(48, 1, ("RobotsTxtLoad: Bad URL: %s\n", url)); return R; } if (url_retrieve(up)) { Debug(48, 1, ("RobotsTxtLoad: %s: cannot retrieve\n", url)); url_close(up); return R; } if ((up->fp = fopen(up->filename, "r")) == NULL) { log_errno2(__FILE__, __LINE__, up->filename); url_close(up); return R; } QT = &R->record_list; Debug(48, 1, ("RobotsTxtLoad: Reading %s\n", url)); while (fgets(buf, 256, up->fp)) { if ((t = strchr(buf, '\n'))) *t = '\0'; if ((t = strchr(buf, '\r'))) *t = '\0'; Debug(48, 5, ("%s: %s\n", url, buf)); if ((t = strchr(buf, '#'))) *t = '\0'; if (buf[0] == '\0') continue; if ((t = strtok(buf, ":\t ")) == NULL) continue; if (!strcasecmp(t, "Disallow")) { if (lastline == user_agent) { *QT = Q; QT = &Q->next; W=xmalloc(sizeof(word_list)); if ((t = strtok(NULL, " \t"))) W->word=xstrdup(t); *DAWLT = W; DAWLT = &W->next; lastline = disallow; } else if (lastline == disallow) { W = xmalloc(sizeof(word_list)); if ((t = strtok(NULL, " \t"))) W->word = xstrdup(t); *DAWLT = W; DAWLT = &W->next; } else { errorlog("Malformed robots.txt file from %s\n", server); } } else if (!strcasecmp(t, "User-Agent")) { if (lastline != user_agent) { Q = xmalloc(sizeof(record)); UAWLT = &Q->user_agent; DAWLT = &Q->disallow; } W = xmalloc(sizeof(word_list)); if ((t = strtok(NULL, " \t"))) W->word = xstrdup(t); *UAWLT = W; UAWLT = &W->next; lastline = user_agent; } else if (!strcasecmp(t, "User_Agent")) { errorlog("Malformed robots.txt file from %s\n", server); if (lastline != user_agent) { Q = xmalloc(sizeof(record)); UAWLT = &Q->user_agent; DAWLT = &Q->disallow; } W = xmalloc(sizeof(word_list)); if ((t = strtok(NULL, " \t"))) W->word = xstrdup(t); *UAWLT = W; UAWLT = &W->next; lastline = user_agent; } else if (!strcasecmp(t, "User")) { errorlog("Malformed robots.txt file from %s\n", server); if (lastline != user_agent) { Q = xmalloc(sizeof(record)); UAWLT = &Q->user_agent; DAWLT = &Q->disallow; } W = xmalloc(sizeof(word_list)); if ((t = strtok(NULL, " \t"))) { if (strcasecmp(t, "Agent")) { W->word = xstrdup(t); } else if ((t = strtok(NULL, " \t"))) { W->word = xstrdup(t); } } *UAWLT = W; UAWLT = &W->next; lastline = user_agent; } else if (!strcasecmp(t, "Agent")) { errorlog("Malformed robots.txt file from %s\n", server); if (lastline !=user_agent) { Q = xmalloc(sizeof(record)); UAWLT = &Q->user_agent; DAWLT = &Q->disallow; } W = xmalloc(sizeof(word_list)); if ((t = strtok(NULL, " \t"))) W->word = xstrdup(t); *UAWLT = W; UAWLT = &W->next; lastline = user_agent; } } fclose(up->fp); url_close(up); return R;}static robots_txt *RobotsTxtFindServer(server) char *server;{ robots_txt *R = NULL; if (server == (char *) NULL) { errorlog("RobotsTxtFind: NULL server!\n"); return NULL; } Debug(48, 1, ("RobotsTxtFind: %s: Finding\n", server)); for (R = RobotsTxtHead; R; R = R->next) { Debug(48, 1, ("RobotsTxtFind: Checking %s\n", R->server)); if (!strcasecmp(R->server, server)) return R; } Debug(48, 1, ("RobotsTxtFind: %s: Not Found\n", server)); return NULL;}static record *RobotsTxtFindRecord(R, UA) robots_txt *R; char *UA;{ record *Q = NULL; record *wildcard = NULL; word_list *W = NULL; char *copy; char *str; for (Q = R->record_list; Q; Q = Q->next) { for (W = Q->user_agent; W; W = W->next) { if (W->word !=NULL) { if (!strcmp(W->word, "*")) wildcard = Q; else { /* Check to see if any of the space seperated entries in the * records User-Agent field match with our UA */ copy=xstrdup(W->word); str=strtok(copy," \t"); while ((str!=NULL) && (strstr(UA,copy)==NULL)) str=strtok(NULL," \t"); xfree(copy); if (str!=NULL) return Q; /* Is our UA contained completely within their User-Agent */ if (strstr(W->word, UA)) return Q; if (!strncasecmp(W->word, UA, strlen(UA))) return Q; } } } } return wildcard;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -