
📄 robots-txt.c

📁 harvest: a robot that downloads HTML web pages
💻 C
static char rcsid[] = "$Id: robots-txt.c,v 2.5 1997/11/21 17:40:55 sxw Exp $";

/*
 *  DEBUG: section  48, level 1, 5      Gatherer enumeration robots.txt stuff
 *
 *  Harvest Indexer http://www.tardis.ed.ac.uk/harvest/
 *  ---------------------------------------------------
 *
 *  The Harvest Indexer is a continued development of code developed by
 *  the Harvest Project. Development is carried out by numerous individuals
 *  in the Internet community, and is not officially connected with the
 *  original Harvest Project or its funding sources.
 *
 *  Please mail harvest@tardis.ed.ac.uk if you are interested in participating
 *  in the development effort.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <stdlib.h>
#include <unistd.h>
#include "util.h"
#include "url.h"

#define ROBOTS_TXT_DISALLOW 0
#define ROBOTS_TXT_ALLOW 1

typedef struct _word_list {
    char *word;
    struct _word_list *next;
} word_list;

typedef struct _record {
    word_list *user_agent;
    word_list *disallow;
    struct _record *next;
} record;

typedef struct _robots_txt {
    char *server;
    record *record_list;
    struct _robots_txt *next;
} robots_txt;

robots_txt *RobotsTxtHead = NULL;
robots_txt **RobotsTxtTail = &RobotsTxtHead;

static robots_txt *RobotsTxtFindServer _PARAMS((char *));
static record *RobotsTxtFindRecord _PARAMS((robots_txt *, char *));
static robots_txt *RobotsTxtLoad _PARAMS((char *, int));
int RobotsTxtCheck _PARAMS((URL * up));

static char *UA_prefix = "Harvest ";
static char *this_UA = NULL;

/* Match the pattern from the robots.txt file against the URL we're trying
 * to access.  According to the latest robots.txt spec case IS significant
 * and we shouldn't do RE's.
 */
static int pattern_match(pattern, path)
     char *pattern;
     char *path;
{
    if (!strncmp(pattern, path, strlen(pattern)))
        return 1;
    return 0;
}

int RobotsTxtCheck(up)
     URL *up;
{
    robots_txt *R = NULL;
    record *Q = NULL;
    word_list *W = NULL;
    static char server[BUFSIZ];
    char *user_UA;

    if (up == (URL *) NULL) {
        errorlog("RobotsTxtCheck: NULL URL\n");
        return ROBOTS_TXT_DISALLOW;
    }
    Debug(48, 1, ("RobotsTxtCheck: URL %s\n", up->url));
    if (up->port == url_table[up->type].port) {
        sprintf(server, "%s://%s",
            url_table[up->type].scheme,
            up->host);
    } else {
        sprintf(server, "%s://%s:%d",
            url_table[up->type].scheme,
            up->host,
            up->port);
    }
    R = RobotsTxtFindServer(server);
    if (R == NULL)
        R = RobotsTxtLoad(server, up->type);
    /* Can't find a robots.txt file for this server, assume it's okay
     * to visit */
    if (R == NULL)
        return ROBOTS_TXT_ALLOW;
    Debug(48, 5, ("RobotsTxtCheck: Found data for server %s\n", server));
    /* Generate our UA string - if we haven't already done so.
     * This ensures that "Harvest" always appears at the start of the UA
     * string - meaning that we're always identifiable.
     */
    if (this_UA == NULL) {
        user_UA = getenv("HARVEST_USER_AGENT");
        if (user_UA == NULL)
            this_UA = UA_prefix;
        else {
            this_UA = xmalloc(strlen(UA_prefix) + strlen(user_UA) + 1);
            strcpy(this_UA, UA_prefix);
            strcat(this_UA, user_UA);
        }
    }
    Q = RobotsTxtFindRecord(R, this_UA);
    if (Q == NULL) {
        Debug(48, 1, ("RobotsTxtCheck: No match for UA '%s'\n", this_UA));
        Debug(48, 1, ("RobotsTxtCheck: Returning ROBOTS_TXT_ALLOW\n"));
        return ROBOTS_TXT_ALLOW;
    }
    Debug(48, 5, ("RobotsTxtCheck: Found record for UA '%s'\n", this_UA));
    for (W = Q->disallow; W; W = W->next) {
        Debug(48, 5, ("RobotsTxtCheck: Pattern %s\n", W->word));
        if ((W->word != NULL) && (pattern_match(W->word, up->raw_pathname))) {
            Debug(48, 1, ("RobotsTxtCheck: Matched '%s'\n", W->word));
            Debug(48, 1, ("RobotsTxtCheck: Returning ROBOTS_TXT_DISALLOW\n"));
            return ROBOTS_TXT_DISALLOW;
        }
    }

    Debug(48, 1, ("RobotsTxtCheck: No matches.\n"));
    Debug(48, 1, ("RobotsTxtCheck: Returning ROBOTS_TXT_ALLOW\n"));
    return ROBOTS_TXT_ALLOW;
}

static robots_txt *RobotsTxtLoad(server, type)
     char *server;
     int type;
{
    robots_txt *R = NULL;
    record *Q = NULL;
    record **QT = NULL;
    word_list *W = NULL;
    word_list **UAWLT = NULL;
    word_list **DAWLT = NULL;
    URL *up = NULL;
    static char url[BUFSIZ];
    static char buf[256];
    char *t = NULL;
    enum {
        none, user_agent, disallow
    } lastline = none;

    if (server == (char *) NULL) {
        errorlog("RobotsTxtLoad: NULL server!\n");
        return NULL;
    }
    R = (robots_txt *) xmalloc(sizeof(robots_txt));
    R->server = xstrdup(server);
    *RobotsTxtTail = R;
    RobotsTxtTail = &R->next;
    switch (type) {
    case URL_HTTP:
    case URL_FTP:
        sprintf(url, "%s/robots.txt", server);
        break;
    case URL_GOPHER:
        sprintf(url, "%s/00/robots.txt", server);
        break;
    default:
        return R;
        /* NOTREACHED */
    }
    if ((up = url_open(url)) == (URL *) NULL) {
        Debug(48, 1, ("RobotsTxtLoad: Bad URL: %s\n", url));
        return R;
    }
    if (url_retrieve(up)) {
        Debug(48, 1, ("RobotsTxtLoad: %s: cannot retrieve\n", url));
        url_close(up);
        return R;
    }
    if ((up->fp = fopen(up->filename, "r")) == NULL) {
        log_errno2(__FILE__, __LINE__, up->filename);
        url_close(up);
        return R;
    }
    QT = &R->record_list;
    Debug(48, 1, ("RobotsTxtLoad: Reading %s\n", url));
    while (fgets(buf, 256, up->fp)) {
        if ((t = strchr(buf, '\n')))
            *t = '\0';
        if ((t = strchr(buf, '\r')))
            *t = '\0';
        Debug(48, 5, ("%s: %s\n", url, buf));
        if ((t = strchr(buf, '#')))
            *t = '\0';
        if (buf[0] == '\0')
            continue;
        if ((t = strtok(buf, ":\t ")) == NULL)
            continue;
        if (!strcasecmp(t, "Disallow")) {
            if (lastline == user_agent) {
                *QT = Q;
                QT = &Q->next;
                W = xmalloc(sizeof(word_list));
                if ((t = strtok(NULL, " \t")))
                    W->word = xstrdup(t);
                *DAWLT = W;
                DAWLT = &W->next;
                lastline = disallow;
            } else if (lastline == disallow) {
                W = xmalloc(sizeof(word_list));
                if ((t = strtok(NULL, " \t")))
                    W->word = xstrdup(t);
                *DAWLT = W;
                DAWLT = &W->next;
            } else {
                errorlog("Malformed robots.txt file from %s\n", server);
            }
        } else if (!strcasecmp(t, "User-Agent")) {
            if (lastline != user_agent) {
                Q = xmalloc(sizeof(record));
                UAWLT = &Q->user_agent;
                DAWLT = &Q->disallow;
            }
            W = xmalloc(sizeof(word_list));
            if ((t = strtok(NULL, " \t")))
                W->word = xstrdup(t);
            *UAWLT = W;
            UAWLT = &W->next;
            lastline = user_agent;
        } else if (!strcasecmp(t, "User_Agent")) {
            errorlog("Malformed robots.txt file from %s\n", server);
            if (lastline != user_agent) {
                Q = xmalloc(sizeof(record));
                UAWLT = &Q->user_agent;
                DAWLT = &Q->disallow;
            }
            W = xmalloc(sizeof(word_list));
            if ((t = strtok(NULL, " \t")))
                W->word = xstrdup(t);
            *UAWLT = W;
            UAWLT = &W->next;
            lastline = user_agent;
        } else if (!strcasecmp(t, "User")) {
            errorlog("Malformed robots.txt file from %s\n", server);
            if (lastline != user_agent) {
                Q = xmalloc(sizeof(record));
                UAWLT = &Q->user_agent;
                DAWLT = &Q->disallow;
            }
            W = xmalloc(sizeof(word_list));
            if ((t = strtok(NULL, " \t"))) {
                if (strcasecmp(t, "Agent")) {
                    W->word = xstrdup(t);
                } else if ((t = strtok(NULL, " \t"))) {
                    W->word = xstrdup(t);
                }
            }
            *UAWLT = W;
            UAWLT = &W->next;
            lastline = user_agent;
        } else if (!strcasecmp(t, "Agent")) {
            errorlog("Malformed robots.txt file from %s\n", server);
            if (lastline != user_agent) {
                Q = xmalloc(sizeof(record));
                UAWLT = &Q->user_agent;
                DAWLT = &Q->disallow;
            }
            W = xmalloc(sizeof(word_list));
            if ((t = strtok(NULL, " \t")))
                W->word = xstrdup(t);
            *UAWLT = W;
            UAWLT = &W->next;
            lastline = user_agent;
        }
    }
    fclose(up->fp);
    url_close(up);
    return R;
}

static robots_txt *RobotsTxtFindServer(server)
     char *server;
{
    robots_txt *R = NULL;

    if (server == (char *) NULL) {
        errorlog("RobotsTxtFind: NULL server!\n");
        return NULL;
    }
    Debug(48, 1, ("RobotsTxtFind: %s: Finding\n", server));
    for (R = RobotsTxtHead; R; R = R->next) {
        Debug(48, 1, ("RobotsTxtFind: Checking %s\n", R->server));
        if (!strcasecmp(R->server, server))
            return R;
    }
    Debug(48, 1, ("RobotsTxtFind: %s: Not Found\n", server));
    return NULL;
}

static record *RobotsTxtFindRecord(R, UA)
     robots_txt *R;
     char *UA;
{
    record *Q = NULL;
    record *wildcard = NULL;
    word_list *W = NULL;
    char *copy;
    char *str;

    for (Q = R->record_list; Q; Q = Q->next) {
        for (W = Q->user_agent; W; W = W->next) {
            if (W->word != NULL) {
                if (!strcmp(W->word, "*"))
                    wildcard = Q;
                else {
                    /* Check to see if any of the space-separated entries in
                     * the record's User-Agent field match with our UA
                     */
                    copy = xstrdup(W->word);
                    str = strtok(copy, " \t");
                    while ((str != NULL) && (strstr(UA, copy) == NULL))
                        str = strtok(NULL, " \t");
                    xfree(copy);
                    if (str != NULL)
                        return Q;
                    /* Is our UA contained completely within their User-Agent */
                    if (strstr(W->word, UA))
                        return Q;
                    if (!strncasecmp(W->word, UA, strlen(UA)))
                        return Q;
                }
            }
        }
    }
    return wildcard;
}
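For context, a minimal caller sketch follows. It is not part of robots-txt.c; it only assumes the url.h interface already used above (url_open, url_retrieve, url_close, the Debug macro) and that the enclosing gatherer links against this module. The helper name fetch_if_allowed is hypothetical.

/* Hypothetical caller sketch (not from the Harvest sources): consult
 * RobotsTxtCheck() before retrieving a candidate URL.  url_open(),
 * url_retrieve() and url_close() are the url.h calls already used by
 * RobotsTxtLoad() above. */
#include "util.h"
#include "url.h"

extern int RobotsTxtCheck();    /* defined in robots-txt.c above */

static void fetch_if_allowed(urlstr)
     char *urlstr;
{
    URL *up;

    if ((up = url_open(urlstr)) == (URL *) NULL)
        return;
    /* RobotsTxtCheck() loads and caches <scheme>://<host>[:port]/robots.txt
     * the first time a server is seen, then answers from the in-memory
     * list; it returns ROBOTS_TXT_ALLOW (1) or ROBOTS_TXT_DISALLOW (0). */
    if (RobotsTxtCheck(up))
        (void) url_retrieve(up);
    else
        Debug(48, 1, ("Skipping %s: disallowed by robots.txt\n", up->url));
    url_close(up);
}

As the listing shows, the check fails closed on a NULL URL (ROBOTS_TXT_DISALLOW) but fails open when no robots.txt can be fetched (ROBOTS_TXT_ALLOW), and the User-Agent it matches against is always "Harvest ", optionally followed by the contents of the HARVEST_USER_AGENT environment variable.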
