robots.c

来自「网络爬虫程序」· C语言 代码 · 共 413 行

C
413
字号
/***************************************************************************//*    This code is part of WWW grabber called pavuk                        *//*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          *//*    Distributed under GPL 2 or later                                     *//***************************************************************************/#include <unistd.h>#include <stdlib.h>#include <stdio.h>#include <string.h>#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#include <time.h>#include "config.h"#include "condition.h"#include "mime.h"#include "robots.h"#include "url.h"#include "tools.h"#include "doc.h"#include "abstract.h"#include "tools.h"#include "mode.h"#include "times.h"#include "errcode.h"#include "gcinfo.h"#include "gui_api.h"#ifdef HAVE_MT#define LOCK_ROBOTS_ENTRY(ent)  mt_pthread_mutex_lock(&(ent)->lock, "robots entry")#define UNLOCK_ROBOTS_ENTRY(ent) mt_pthread_mutex_unlock(&(ent)->lock, "robots entry")#else#define LOCK_ROBOTS_ENTRY(ent)#define UNLOCK_ROBOTS_ENTRY(ent)#endifstatic char *get_robots(url * urlp);static void parse_robots(char *, char *, char ***, char ***);static robotlim **robots = NULL;static char *get_max_match(char *str, char **pat){  char *rv = NULL;  int maxlen = -1;  while(pat && *pat)  {    if(!strncmp(*pat, str, strlen(*pat)))    {      int len = strlen(*pat);      if(len > maxlen)      {        rv = *pat;        maxlen = len;      }    }    pat++;  }  return rv;}/***************************************************//* kontrola ci URL splna podmienky pre WWW robotov *//* FIXME: Translate me!                            *//***************************************************/bool_t robots_check(url * urlp){  char *pom, *mdp, *map;  int i = 0;  int rv = TRUE;  robotlim *tmpr = NULL;  int dont_have = TRUE;  if((urlp->type != URLT_HTTP && urlp->type != URLT_HTTPS)    || !cfg.condition.allow_robots)    return TRUE;  LOCK_ROBOTS;  if(robots)  {    while(robots[i] && !(!strcmp(robots[i]->site, urlp->p.http.host) &&        (robots[i]->port == urlp->p.http.port)))      i++;    if(robots[i])      dont_have = FALSE;  }  if(dont_have)  {    tmpr = (robotlim *) _malloc(sizeof(robotlim));    tmpr->site = new_string(urlp->p.http.host);    tmpr->port = urlp->p.http.port;    tmpr->dpat = NULL;    tmpr->apat = NULL;#ifdef HAVE_MT    pthread_mutex_init(&tmpr->lock, NULL);#endif    robots = (robotlim **) _realloc(robots, (i + 2) * sizeof(robotlim *));    robots[i] = tmpr;    robots[i + 1] = NULL;    /* here is possible to cros enter/leave to critical   */    /* sections because no chance that anyone else holds  */    /* lock on robots[i]->lock when it is freshly created */    LOCK_ROBOTS_ENTRY(robots[i]);  }  UNLOCK_ROBOTS;  if(dont_have)  {    pom = get_robots(urlp);    if(pom)    {      parse_robots("pavuk", pom, &tmpr->dpat, &tmpr->apat);      _free(pom);    }  }  else  {    LOCK_ROBOTS_ENTRY(robots[i]);  }  mdp = get_max_match(urlp->p.http.document, robots[i]->dpat);  map = get_max_match(urlp->p.http.document, robots[i]->apat);  UNLOCK_ROBOTS_ENTRY(robots[i]);  if(map && mdp && (strlen(map) >= strlen(mdp)))    rv = TRUE;  else if(mdp)    rv = FALSE;  return rv;}/************************************************//* prenos suboru "robots.txt" pre dane URL      *//* FIXME: Translate me!                         *//************************************************/static char *get_robots(url * urlp){  url *purl = _malloc(sizeof(url));  doc docu;  int rstat;  char *ret = NULL;  char *pom;  int nredir = 0, nreget = 0;  struct stat estat;  char *pp;  int f;  global_connection_info con_info;#ifdef I_FACE  if(cfg.xi_face)  {    gui_set_status(gettext("transfering \"robots.txt\""));  }#endif  xprintf(1, gettext("transfering \"robots.txt\"\n"));  memset(purl, '\0', sizeof(url));  purl->type = urlp->type;  purl->parent_url = NULL;  purl->status = URL_INLINE_OBJ; /*** required if -store_name option used ***/  purl->extension = NULL;  purl->local_name = NULL;#ifdef HAVE_MT  pthread_mutex_init(&purl->lock, NULL);#endif#ifdef WITH_TREE#ifdef I_FACE  purl->prop = NULL;  purl->tree_nfo = NULL;#endif#endif  purl->level = 0;  purl->p.http.user = new_string(urlp->p.http.user);  purl->p.http.password = new_string(urlp->p.http.password);  purl->p.http.host = new_string(urlp->p.http.host);  purl->p.http.port = urlp->p.http.port;  purl->p.http.document = new_string("/robots.txt");  purl->p.http.anchor_name = NULL;  purl->p.http.searchstr = NULL;  doc_init(&docu, purl);  docu.is_robot = TRUE;  docu.save_online = FALSE;  docu.report_size = FALSE;  docu.check_limits = FALSE;  if(cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR)  {    pp = url_to_filename(purl, TRUE);    if(!stat(pp, &estat) && !S_ISDIR(estat.st_mode))    {      docu.dtime = estat.st_mtime;    }  }  init_global_connection_data(&con_info);  while((rstat = doc_download(&docu, TRUE, FALSE)) &&    ((nredir < cfg.nredir && docu.errcode == ERR_HTTP_REDIR) ||      (nreget < cfg.nreget && docu.errcode == ERR_HTTP_TRUNC)))  {    if(docu.errcode)      report_error(&docu, "robots.txt");    save_global_connection_data(&con_info, &docu);    nredir += docu.errcode == ERR_HTTP_REDIR;    nreget += docu.errcode == ERR_HTTP_TRUNC;    if(docu.errcode == ERR_HTTP_REDIR)    {      purl = docu.doc_url->moved_to;      pom = url_to_urlstr(purl, FALSE);      xprintf(1, gettext("Hmm: redirecting \"robots.txt\" to %s ???\n"), pom);      _free(pom);      free_deep_url(docu.doc_url);      _free(docu.doc_url) docu.doc_url = purl;    }    _free(docu.contents);    _free(docu.mime);    _free(docu.type_str);    doc_remove_lock(&docu);    if(cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR)    {      pp = url_to_filename(purl, TRUE);      if(!stat(pp, &estat) && !S_ISDIR(estat.st_mode))      {        docu.dtime = estat.st_mtime;      }    }    restore_global_connection_data(&con_info, &docu);  }  if(docu.errcode)    report_error(&docu, "robots.txt");  save_global_connection_data(&con_info, &docu);  kill_global_connection_data(&con_info);  if(!rstat)  {    if(cfg.dumpfd < 0)    {      doc_store(&docu, TRUE);    }    ret = docu.contents;  }  else if(docu.errcode == ERR_HTTP_NFOUND || docu.errcode == ERR_HTTP_GONE)  {    pp = url_to_filename(purl, TRUE);    if(cfg.dumpfd < 0)    {      if((f =          open(pp, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY,            S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR)) > 0)        close(f);    }  }  else  {    _free(docu.contents);  }  doc_remove_lock(&docu);  _free(docu.type_str);  _free(docu.mime);  if(purl && purl->moved_to)  {    free_deep_url(purl->moved_to);    free(purl->moved_to);  }  if(purl)  {    free_deep_url(purl);    free(purl);  }  return ret;}/*******************************//* analyza suboru "robots.txt" *//* FIXME: Translate me!        *//*******************************/static void parse_robots(char *agent, char *file, char ***dpat, char ***apat){  char *p, *p1, *p2;  bool_t is_me = FALSE;  int n_dret = 0, n_aret = 0;  bool_t last = 1;  int ilen;  *apat = NULL;  *dpat = NULL;  p = file;  while(*p)  {    ilen = strcspn(p, "\r\n");    if(*(p + ilen))      *(p + ilen) = '\0';    else      last = 0;    while(*p == ' ' || *p == '\t')      p++;    if(!*p)    {      is_me = FALSE;    }    else if(!strncasecmp("User-Agent: ", p, 12))    {      p2 = p + 12;      while(*p2 == ' ' || *p2 == '\t')        p2++;      p1 = p2 + strlen(p2);      while(*p1 == ' ' || *p1 == '\t')      {        *p1 = '\0';        p1--;      }      if(*p2 == '*')        is_me = TRUE;      else if(!strncmp(agent, p2, strlen(agent)))        is_me = TRUE;    }    else if(is_me && !strncasecmp("Disallow: ", p, 10))    {      p2 = p + 10;      while(*p2 == ' ' || *p2 == '\t')        p2++;      p1 = p2 + strlen(p2);      while(*p1 == ' ' || *p1 == '\t')      {        *p1 = '\0';        p1--;      }      if(*p2)      {        *dpat = (char **) _realloc(*dpat, (n_dret + 2) * sizeof(char *));        (*dpat)[n_dret + 1] = NULL;        (*dpat)[n_dret] = new_string(p2);        n_dret++;      }    }    else if(is_me && !strncasecmp("Allow: ", p, 7))    {      p2 = p + 7;      while(*p2 == ' ' || *p2 == '\t')        p2++;      p1 = p2 + strlen(p2);      while(*p1 == ' ' || *p1 == '\t')      {        *p1 = '\0';        p1--;      }      if(*p2)      {        *apat = (char **) _realloc(*apat, (n_aret + 2) * sizeof(char *));        (*apat)[n_aret + 1] = NULL;        (*apat)[n_aret] = new_string(p2);        n_aret++;      }    }    p += ilen + last;    p += strspn(p, "\n\r");  }}void robots_do_cleanup(void){  int i, j;  for(i = 0; robots && robots[i]; i++)  {    _free(robots[i]->site);    for(j = 0; robots[i]->apat && robots[i]->apat[j]; j++)      _free(robots[i]->apat[j]);    _free(robots[i]->apat);    for(j = 0; robots[i]->dpat && robots[i]->dpat[j]; j++)      _free(robots[i]->dpat[j]);    _free(robots[i]->dpat);#ifdef HAVE_MT    pthread_mutex_destroy(&(robots[i]->lock));#endif    _free(robots[i]);  }  _free(robots);}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?