⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 robottxt.c

📁 www工具包
💻 C
字号:
/*
**	@(#) $Id: RobotTxt.c,v 1.4 1999/02/22 22:10:12 frystyk Exp $
**
**	W3C Webbot can be found at "http://www.w3.org/Robot/"
**
**	Copyright © 1995-1998 World Wide Web Consortium, (Massachusetts
**	Institute of Technology, Institut National de Recherche en
**	Informatique et en Automatique, Keio University). All Rights
**	Reserved. This program is distributed under the W3C's Software
**	Intellectual Property License. This program is distributed in the hope
**	that it will be useful, but WITHOUT ANY WARRANTY; without even the
**	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
**	PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
**	details.
**
**  Authors:
**	JP		John Punin
**
**  History:
**	Oct 1998	Written
**
**  Purpose:
**	Parses the text of a robots.txt file into a list of UserAgent
**	records (name + list of disallowed path prefixes) and builds, for a
**	given robot name, a "a|b|c" alternation string of its disallowed
**	paths.
*/

#include "HTRobMan.h"
#include "RobotTxt.h"

/* Capacity of the scratch buffer used for one field value while parsing. */
#define ROBOT_FIELD_MAX 2000

/*
**  Copy one token from src into dst.
**
**  The token ends at the NUL terminator, at '#', and either at the first
**  whitespace character (stop_at_space != 0) or at '\n' (stop_at_space == 0).
**  At most dstsize-1 characters are stored and dst is always NUL-terminated
**  when dstsize > 0; characters that do not fit are dropped.
**
**  Returns the number of characters of src that make up the token, which may
**  exceed what was stored, so callers can advance their read pointer
**  correctly even after truncation.
*/
static size_t scan_token (const char * src, char * dst, size_t dstsize,
                          int stop_at_space)
{
    size_t consumed = 0;
    size_t stored = 0;

    if (src) {
        for (;;) {
            char c = src[consumed];
            if (c == '\0' || c == '#')
                break;
            if (stop_at_space ? isspace((unsigned char) c) : (c == '\n'))
                break;
            if (dst && stored + 1 < dstsize)
                dst[stored++] = c;
            consumed++;
        }
    }
    if (dst && dstsize > 0)
        dst[stored] = '\0';
    return consumed;
}

/* Skip whitespace on the current line only (never steps over '\n'). */
static char * skip_blanks (char * ptr)
{
    while (*ptr != '\n' && isspace((unsigned char) *ptr))
        ptr++;
    return ptr;
}

/* Remove trailing whitespace (including a CR left by CRLF files) in place. */
static void trim_trailing_space (char * str)
{
    size_t len = strlen(str);
    while (len > 0 && isspace((unsigned char) str[len - 1]))
        str[--len] = '\0';
}

/* Skip whitespace (including newlines) and any following comment lines. */
static char * skip_space_and_comments (char * ptr)
{
    while (isspace((unsigned char) *ptr))
        ptr++;
    return skip_comments(ptr);
}

/*
**  Allocate an empty UserAgent with a fresh, empty disallow list.
**  HT_OUTOFMEM is invoked if the allocation fails.
*/
PUBLIC UserAgent * new_user_agent (void)
{
    UserAgent * ua;
    if ((ua = (UserAgent *) HT_CALLOC(1, sizeof(UserAgent))) == NULL)
        HT_OUTOFMEM("new_user_agent");
    ua->disallow = HTList_new();
    return ua;
}

/* Return the name of the user agent, or NULL if ua is NULL. */
PUBLIC char * get_name_user_agent (UserAgent * ua)
{
    return ua ? ua->name : NULL;
}

/* Return the disallow list of the user agent, or NULL if ua is NULL. */
PUBLIC HTList * get_disallow_user_agent (UserAgent * ua)
{
    return ua ? ua->disallow : NULL;
}

/*
**  Store a copy of name in the user agent.
**  Returns YES on success, NO if either argument is NULL.
*/
PUBLIC BOOL set_name_user_agent (UserAgent * ua, char * name)
{
    if (ua && name) {
        StrAllocCopy(ua->name, name);
        return YES;
    }
    return NO;
}

/*
**  Add a copy of the disallow string to the user agent's disallow list.
**  Returns the result of HTList_addObject, or NO if an argument is NULL.
*/
PUBLIC BOOL add_disallow_user_agent (UserAgent * ua, char * disallow)
{
    if (ua && disallow) {
        char * da = NULL;
        StrAllocCopy(da, disallow);
        /* @@@ Should be an association list (HTAssocList) @@@ */
        return HTList_addObject(ua->disallow, da);
    }
    return NO;
}

/*
**  Release a user agent: its name, every disallow string, the disallow
**  list and the UserAgent structure itself.
**  Returns YES if ua was non-NULL, NO otherwise.
*/
PUBLIC BOOL delete_user_agent (UserAgent * ua)
{
    if (ua) {
        HT_FREE(ua->name);
        if (ua->disallow) {
            HTList * cur = ua->disallow;
            char * pres;
            while ((pres = (char *) HTList_nextObject(cur))) {
                HT_FREE(pres);
            }
            HTList_delete(ua->disallow);
        }
        /* The structure came from HT_CALLOC in new_user_agent(); the
        ** original code never released it, leaking one UserAgent per record. */
        HT_FREE(ua);
        return YES;
    }
    return NO;
}

/*
**  Release every user agent in the list, then the list itself.
**  Returns the result of HTList_delete, or NO if the list is NULL.
*/
PUBLIC BOOL delete_all_user_agents(HTList *user_agents)
{
    if (user_agents) {
        HTList * cur = user_agents;
        UserAgent * pres;
        while ((pres = (UserAgent *) HTList_nextObject(cur)))
            delete_user_agent(pres);
        return HTList_delete(user_agents);
    }
    return NO;
}

/*
**  Append every disallow string of ua to ch, separated by "|".
**  If need_sep is non-zero a "|" is also written before the first string,
**  so that several agents can be chained into one alternation.
**  Returns the number of strings appended.
*/
static int append_disallow_patterns (HTChunk * ch, UserAgent * ua,
                                     int need_sep)
{
    HTList * cur = get_disallow_user_agent(ua);
    char * pattern;
    int count = 0;

    while ((pattern = (char *) HTList_nextObject(cur))) {
        if (need_sep || count > 0)
            HTChunk_puts(ch, "|");
        HTChunk_puts(ch, pattern);
        count++;
    }
    return count;
}

/*
**  Build the "a|b|c" alternation of disallowed paths that applies to
**  name_robot: the rules of every record whose name equals name_robot, or,
**  if there is none, the rules of the "*" record.
**
**  Returns the string produced by HTChunk_toCString (the caller releases
**  it; the standalone driver below uses free()), or NULL if an argument
**  is NULL.
*/
PUBLIC char * get_regular_expression (HTList * user_agents, char * name_robot)
{
    if (user_agents && name_robot) {
        HTChunk * ch = HTChunk_new (1024);
        HTList * cur = user_agents;
        UserAgent * pres;
        UserAgent * ua_gen = NULL;
        int found = 0;
        int written = 0;

        while ((pres = (UserAgent *) HTList_nextObject(cur))) {
            char * name = get_name_user_agent(pres);
            if (!name)		/* record without a name: nothing to match */
                continue;
            if (!strcmp(name, "*"))
                ua_gen = pres;
            if (!strcmp(name, name_robot)) {
                /* Several records may name the same robot; keep the
                ** alternation well-formed by separating their rules. */
                written += append_disallow_patterns(ch, pres, written > 0);
                found = 1;
            }
        }
        if (!found && ua_gen)
            append_disallow_patterns(ch, ua_gen, written > 0);
        return (HTChunk_toCString (ch));
    }
    return NULL;
}

/*
**  Append the disallow strings of ua to ch, separated by "|".
**  Returns YES on success, NO if either argument is NULL.
*/
PUBLIC BOOL put_string_disallow (HTChunk * ch, UserAgent * ua)
{
    if (ch && ua) {
        append_disallow_patterns(ch, ua, 0);
        return YES;
    }
    return NO;
}

/* Trace the name and every disallow entry of one user agent. */
PUBLIC void print_user_agent(UserAgent *ua)
{
    HTList * cur;
    char * pres;

    if (!ua)
        return;
    cur = ua->disallow;
    HTTRACE(APP_TRACE, "User Agent : %s \n" _ ua->name);
    while ((pres = (char *) HTList_nextObject(cur))) {
        HTTRACE(APP_TRACE, "Disallow : %s \n" _ pres);
    }
}

/* Trace every user agent in the list. */
PUBLIC void print_all_user_agents(HTList * user_agents)
{
    HTList * cur = user_agents;
    UserAgent * pres;
    while ((pres = (UserAgent *) HTList_nextObject(cur))) {
        HTTRACE(APP_TRACE, "\nNew User Agent\n");
        print_user_agent(pres);
    }
}

/*
**  Parse the text of a robots.txt file.
**
**  Returns a new list of UserAgent records (possibly empty or partial if
**  the text is malformed; a trace message is emitted in that case), or
**  NULL if rob_str is NULL. Release it with delete_all_user_agents().
*/
PUBLIC HTList * get_all_user_agents(char * rob_str)
{
    if (rob_str) {
        HTList * user_agents = HTList_new();
        /* skip leading blank space and comment lines */
        char * ptr = skip_space_and_comments(rob_str);

        if (!get_user_agents(ptr, user_agents)) {
            HTTRACE(APP_TRACE, "Something is wrong in robots.txt\n");
        }
        return user_agents;
    }
    return NULL;
}

/*
**  If ptr is at a '#', skip that comment line, any whitespace after it and
**  any further comment lines. Returns the new position (ptr itself if it
**  is NULL or not at a comment). Never advances past the NUL terminator.
*/
PUBLIC char * skip_comments(char *ptr)
{
    if (ptr) {
        while (*ptr == '#') {
            /* A comment on the last line may have no trailing newline. */
            while (*ptr != '\0' && *ptr != '\n')
                ptr++;
            while (isspace((unsigned char) *ptr))
                ptr++;
        }
    }
    return ptr;
}

/*
**  Copy robot_str into name up to (not including) the first '\n', '#' or
**  the end of the string, and NUL-terminate name.
**  The copy is not bounded: name must have room for strlen(robot_str) + 1
**  characters.
*/
PUBLIC void scan_name_until_eoline(char *robot_str, char *name)
{
    if (name)
        scan_token(robot_str, name, (size_t) -1, 0);
}

/*
**  Copy robot_str into name up to (not including) the first whitespace
**  character, '#' or the end of the string, and NUL-terminate name.
**  The copy is not bounded: name must have room for strlen(robot_str) + 1
**  characters.
*/
PUBLIC void scan_name_until_space(char *robot_str, char *name)
{
    if (name)
        scan_token(robot_str, name, (size_t) -1, 1);
}

/*
**  Parse consecutive robots.txt records starting at ptr.
**
**  A record is one or more "User-agent" lines followed by one or more
**  "Disallow:" lines. Every agent of a record is appended to user_agents
**  and receives every non-empty disallow value of that record. Field
**  names are matched case-insensitively; values longer than
**  ROBOT_FIELD_MAX-1 characters are truncated.
**
**  Returns YES if parsing started at a "User-agent" line and every record
**  had at least one "Disallow:" line; NO otherwise (agents parsed so far
**  remain in user_agents).
*/
PUBLIC BOOL get_user_agents(char * ptr, HTList *user_agents)
{
    static const char ua_field[] = "user-agent";
    static const char dis_field[] = "disallow:";
    const int ua_len = (int) (sizeof ua_field - 1);
    const int dis_len = (int) (sizeof dis_field - 1);
    char value[ROBOT_FIELD_MAX];

    if (!ptr || !user_agents || strncasecomp(ptr, ua_field, ua_len))
        return NO;

    do {
        /* Agents of the current record. Only the list nodes belong to
        ** this list; the UserAgent objects are owned by user_agents. */
        HTList * group = HTList_new();

        do {
            UserAgent * ua = new_user_agent();
            HTList_appendObject(user_agents, (void *) ua);
            HTList_addObject(group, (void *) ua);

            ptr += ua_len;
            if (*ptr == ':')
                ptr++;
            /* Stay on this line so an empty name cannot swallow the
            ** following line. */
            ptr = skip_blanks(ptr);
            ptr += scan_token(ptr, value, sizeof value, 0);
            trim_trailing_space(value);
            set_name_user_agent(ua, value);
            ptr = skip_space_and_comments(ptr);
        } while (!strncasecomp(ptr, ua_field, ua_len));

        if (strncasecomp(ptr, dis_field, dis_len)) {
            HTList_delete(group);
            return NO;
        }

        do {
            ptr += dis_len;
            ptr = skip_blanks(ptr);
            ptr += scan_token(ptr, value, sizeof value, 1);
            ptr = skip_space_and_comments(ptr);

            /* An empty Disallow value disallows nothing; storing it would
            ** add an empty alternative to the generated expression. */
            if (*value) {
                HTList * cur = group;
                UserAgent * member;
                while ((member = (UserAgent *) HTList_nextObject(cur)))
                    add_disallow_user_agent(member, value);
            }
        } while (!strncasecomp(ptr, dis_field, dis_len));

        HTList_delete(group);
    } while (!strncasecomp(ptr, ua_field, ua_len));

    return YES;
}

/*
**  Parse rob_str as robots.txt and return the disallow alternation that
**  applies to name_robot (see get_regular_expression), or NULL.
*/
PUBLIC char * scan_robots_txt(char *rob_str, char *name_robot)
{
    char * reg_exp_exclude = NULL;
    HTList * user_agents = get_all_user_agents(rob_str);
    /*print_all_user_agents(user_agents);*/
    reg_exp_exclude = get_regular_expression(user_agents, name_robot);
    delete_all_user_agents(user_agents);
    return reg_exp_exclude;
}

#ifdef ROBOTS_TXT_STANDALONE
/*
**  Standalone driver: robottxt [file [robot-name]]
**  Reads the file (default "robots.txt") and, if a robot name is given,
**  traces the resulting disallow expression. Returns 0 on success, 1 on
**  any error.
*/
int main(int argc, char *argv[])
{
    char * text;
    char * reg_exp;
    char * filename = argc > 1 ? argv[1] : "robots.txt";
    FILE * fp;
    struct stat statb;
    size_t nread;

    /* make sure the file is a regular text file and open it */
    if (stat(filename, &statb) == -1) {
        perror(filename);
        return 1;
    }
    if ((statb.st_mode & S_IFMT) != S_IFREG) {
        HTTRACE(ALL_TRACE, "%s : not a regular file \n" _ filename);
        return 1;
    }
    if (!(fp = fopen(filename, "r"))) {
        perror(filename);
        return 1;
    }

    if (!(text = malloc((size_t) statb.st_size + 1))) {
        HTTRACE(ALL_TRACE, "Can't alloc enough space for %s" _ filename);
        fclose(fp);
        return 1;
    }

    nread = fread(text, sizeof(char), (size_t) statb.st_size, fp);
    if (nread < (size_t) statb.st_size) {
        HTTRACE(APP_TRACE, "Warning: may not have read entire file!\n");
    }
    /* Terminate after the bytes actually read so a short read never
    ** exposes uninitialized buffer contents to the parser. */
    text[nread] = '\0';
    fclose(fp);

    if (argc > 2) {
        reg_exp = scan_robots_txt(text, argv[2]);
        if (reg_exp) {
            HTTRACE(ALL_TRACE, "REG EXP : %s \n" _ reg_exp);
            free(reg_exp);
        }
    }
    free(text);
    return 0;
}
#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -