⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 recurse.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 2 页
字号:
        bufio_write(fd, docu->contents, docu->size);        bufio_close(fd);        UNLOCK_DUMPFD;      }      else if((docu->doc_url->type != URLT_FILE) &&        !(docu->doc_url->status & URL_REDIRECT) &&        (docu->errcode != ERR_HTTP_ACTUAL) &&        (docu->errcode != ERR_FTP_ACTUAL) &&        (cfg.mode != MODE_NOSTORE) &&        (cfg.dumpfd < 0) && (cfg.mode != MODE_FTPDIR))      {        gui_set_status(gettext("Storing document"));        store_stat = doc_store(docu, TRUE);        if(store_stat)        {          xprintf(1, gettext("Store failed\n"));          urlr->status &= ~URL_ERR_REC;        }      }      _Xt_Serve;      if(priv_cfg.post_cmd)        run_post_command(docu);      doc_remove_lock(docu);      doc_update_parent_links(docu);    }    else    {      if(priv_cfg.post_cmd)        run_post_command(docu);      doc_remove_lock(docu);      doc_update_parent_links(docu);    }    urlr->status |= URL_DOWNLOADED;    urlr->status |= URL_PROCESSED;    SETNEXTURL;  }  return ERR_UNKNOWN;}#ifdef I_FACEint download_single_doc(url * urlp){  int rv;  doc docu;  global_connection_info con_info;#if defined(HAVE_MT) && defined(I_FACE)  _config_struct_priv_t privcfg;#if defined (__OSF__) || defined (__osf__)#define __builtin_try#define __builtin_finally#endif  privcfg_make_copy(&privcfg);  pthread_setspecific(cfg.privcfg_key, (void *) (&privcfg));  pthread_cleanup_push((void *) privcfg_free, (void *) (&privcfg));#endif  gui_start_download(FALSE);#ifdef HAVE_MT  {    sigset_t smask;    sigemptyset(&smask);    sigaddset(&smask, SIGINT);    sigaddset(&smask, SIGQUIT);    pthread_sigmask(SIG_UNBLOCK, &smask, NULL);    signal(SIGINT, _sigintthr);    signal(SIGQUIT, _sigquitthr);  }  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);  pthread_setspecific(cfg.thrnr_key, (void *) 0);  DEBUG_MTTHR("starting thread(%ld) %d\n", pthread_self(), 0);  cfg.allthreadsnr = 0;  gui_mt_thread_start(cfg.allthreadsnr);#endif  cfg.rbreak = FALSE;  cfg.stop = FALSE;  cfg.processing = TRUE;  doc_init(&docu, urlp);#ifdef HAVE_MT  docu.threadnr = 0;  pthread_setspecific(cfg.currdoc_key, (void *) NULL);  pthread_setspecific(cfg.herrno_key, (void *) (&(docu.__herrno)));#endif  rv = process_document(&docu, FALSE);  init_global_connection_data(&con_info);  save_global_connection_data(&con_info, &docu);  kill_global_connection_data(&con_info);  cfg.processing = FALSE;  cfg.rbreak = FALSE;  cfg.stop = FALSE;#if defined(HAVE_MT) && defined(I_FACE)  pthread_cleanup_pop(TRUE);#endif#ifdef HAVE_MT  doc_finish_processing(&docu);  cfg.allthreadsnr = 0;  gui_mt_thread_end(0);#endif  gui_beep();  gui_set_msg(gettext("Done"), 0);  return rv;}#endif/*********************************************//* rekurzivne prechadzanie stromu dokumentov *//* FIXME: Translate me!                      *//*********************************************/#ifdef HAVE_MTstatic void _recurse(int thnr)#elsevoid recurse(int thnr)#endif{  bool_t rbreaksave, stopsave;  global_connection_info con_info;  if(cfg.urlstack == NULL)    return;  init_global_connection_data(&con_info);/**** obsluzenie vsetkych URL v zozname ****//**** FIXME: Translate me!              ****/  while(cfg.urlstack && !cfg.stop)  {    doc docu;    url *urlp;    LOCK_CFG_URLSTACK;    if(cfg.urlstack)    {      urlp = (url *) cfg.urlstack->data;      cfg.urlstack = dllist_remove_entry(cfg.urlstack, cfg.urlstack);#ifdef HAVE_MT      mt_semaphore_decrement(&cfg.urlstack_sem);#endif      UNLOCK_CFG_URLSTACK;    }    else    {      UNLOCK_CFG_URLSTACK;      break;    }    doc_init(&docu, urlp);#ifdef HAVE_MT    docu.threadnr = thnr;    pthread_setspecific(cfg.currdoc_key, (void *) (&docu));    pthread_setspecific(cfg.herrno_key, (void *) (&(docu.__herrno)));#endif    LOCK_DCNT;    cfg.docnr++;    docu.doc_nr = cfg.docnr;    UNLOCK_DCNT;    restore_global_connection_data(&con_info, &docu);    process_document(&docu, TRUE);    save_global_connection_data(&con_info, &docu);#ifdef HAVE_MT    doc_finish_processing(&docu);#endif    if(docu.errcode == ERR_QUOTA_FS ||      docu.errcode == ERR_QUOTA_TRANS ||      docu.errcode == ERR_QUOTA_TIME || cfg.rbreak)    {      LOCK_CFG_URLSTACK;      cfg.docnr--;      cfg.urlstack = dllist_prepend(cfg.urlstack, (dllist_t) urlp);#ifdef HAVE_MT      mt_semaphore_up(&cfg.urlstack_sem);#endif      UNLOCK_CFG_URLSTACK;      break;    }  }#if defined(I_FACE) && !defined(HAVE_MT)  if(cfg.xi_face)  {    gui_set_status(gettext("Done"));  }#endif#ifdef I_FACE  if(cfg.xi_face)    gui_set_doccounter();#endif  stopsave = cfg.stop;  rbreaksave = cfg.rbreak;  cfg.stop = FALSE;  cfg.rbreak = FALSE;  kill_global_connection_data(&con_info);  if(cfg.update_cookies)  {    cookie_update_file(TRUE);  }  cfg.stop = stopsave;  cfg.rbreak = rbreaksave;  if(!cfg.rbreak && !cfg.stop && cfg.stats_file)  {    stats_fill_spage(cfg.stats_file, NULL);  }}#ifdef HAVE_MTstatic void _recurse_thrd(int thrnr){  bool_t init = (thrnr == 0);#ifdef I_FACE  _config_struct_priv_t privcfg;#endif  {    sigset_t smask;    sigemptyset(&smask);    sigaddset(&smask, SIGINT);    sigaddset(&smask, SIGQUIT);    pthread_sigmask(SIG_UNBLOCK, &smask, NULL);    signal(SIGINT, _sigintthr);    signal(SIGQUIT, _sigquitthr);  }  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);  pthread_setspecific(cfg.currdoc_key, (void *) NULL);  pthread_setspecific(cfg.thrnr_key, (void *) thrnr);  DEBUG_MTTHR("starting thread(%ld) %d\n", pthread_self(), thrnr);#ifdef I_FACE  privcfg_make_copy(&privcfg);  pthread_setspecific(cfg.privcfg_key, (void *) (&privcfg));  pthread_cleanup_push((void *) privcfg_free, (void *) (&privcfg));#endif  for(; !cfg.rbreak && !cfg.stop;)  {    int v;    pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);    DEBUG_MTTHR("thread %d awaking\n", thrnr);    _recurse(thrnr);    init = FALSE;    gui_clear_status();    DEBUG_MTTHR("thread %d sleeping\n", thrnr);    gui_set_status(gettext("Sleeping ..."));    pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);    mt_semaphore_up(&cfg.nrunning_sem);    /* UN-critical section */    while(!cfg.stop && !cfg.rbreak &&      (v = mt_semaphore_timed_wait(&cfg.urlstack_sem, 400)) < 0);    mt_semaphore_decrement(&cfg.nrunning_sem);  }#ifdef I_FACE  pthread_cleanup_pop(TRUE);#endif  DEBUG_MTTHR("thread %d exiting\n", thrnr);  gui_set_status(gettext("Exiting ..."));  pthread_exit(NULL);}void recurse(int dumb){  pthread_attr_t thrdattr;  int i;  int num = cfg.nthr;  sigset_t smask;  sigemptyset(&smask);  sigaddset(&smask, SIGINT);  sigaddset(&smask, SIGQUIT);  pthread_sigmask(SIG_UNBLOCK, &smask, NULL);  signal(SIGQUIT, _sigquitthr);  pthread_attr_init(&thrdattr);  pthread_attr_setscope(&thrdattr, PTHREAD_SCOPE_SYSTEM);  pthread_attr_setstacksize(&thrdattr, MT_STACK_SIZE);  mt_semaphore_init(&cfg.nrunning_sem);  if(num <= 0)    num = 1;  cfg.allthreadsnr = 0;  cfg.allthreads = _malloc(num * sizeof(pthread_t));  mt_semaphore_decrement(&cfg.urlstack_sem);  for(i = 0; i < num; i++)  {    if(!pthread_create(&(cfg.allthreads[cfg.allthreadsnr]),        &thrdattr, (void *) _recurse_thrd, (void *) cfg.allthreadsnr))    {      cfg.allthreadsnr++;      gui_mt_thread_start(cfg.allthreadsnr);      mt_semaphore_decrement(&cfg.nrunning_sem);    }    else    {      char pom[100];      sprintf(pom, "Create downloading thread %d", i);      xperror(pom);    }    if(cfg.rbreak || cfg.stop)      break;  }  while(!cfg.stop && !cfg.rbreak &&    mt_semaphore_timed_down(&cfg.nrunning_sem, 500) < 0);  cfg.stop = TRUE;  tl_msleep(300);  for(i = 0; i < cfg.allthreadsnr; i++)  {/*    pthread_cancel(cfg.allthreads[i]);    pthread_kill(cfg.allthreads[i], SIGQUIT);*/    pthread_join(cfg.allthreads[i], NULL);  }  mt_semaphore_destroy(&cfg.nrunning_sem);  _free(cfg.allthreads);  cfg.allthreadsnr = 0;  gui_mt_thread_end(0);}#endifstatic void dump_ftp_list(dllist * urllst){  dllist *ptr = urllst;  while(ptr)  {    url *urlp = (url *) ptr->data;    void *dupl;    dupl = dllist_find2(ptr->next, (dllist_t) urlp, dllist_url_compare);    if(!dupl && !(urlp->status & URL_INLINE_OBJ) &&      (urlp->type == URLT_FTP || urlp->type == URLT_FTPS))    {      char *p, *pp;      p = url_get_path(urlp);      pp = strrchr(p, '/');      if(pp)      {        pp++;        if(!*pp)        {          pp -= 2;          while(pp > p && *pp != '/')            pp--;          pp++;        }        if(urlp->extension)        {          ftp_url_extension *fe = urlp->extension;          if(fe->type == FTP_TYPE_F)            xprintf(1, gettext("\t%s    (%d bytes)\n"), pp, fe->size);          else if(fe->type == FTP_TYPE_L)            xprintf(1, "\t%s    -> %s\n", pp, fe->slink);          else if(fe->type == FTP_TYPE_D)            xprintf(1, "\t%s/\n", pp, fe->slink);        }        else          xprintf(1, "\t%s\n", pp);      }    }    free_deep_url(urlp);    free(urlp);    ptr = dllist_remove_entry(ptr, ptr);  }}static void dump_urls_list(dllist * urls){  dllist *ptr;  LOCK_DUMPURLS;  for(ptr = urls; ptr; ptr = ptr->next)  {    void *dupl;    dupl = dllist_find2(ptr->next, (dllist_t) ptr->data, dllist_url_compare);    if(!dupl)    {      char *ustr = url_to_urlstr((url *) ptr->data, FALSE);      if(ustr)      {        write(cfg.dump_urlfd, ustr, strlen(ustr));        write(cfg.dump_urlfd, "\n", 1);        free(ustr);      }    }  }  UNLOCK_DUMPURLS;}void get_urls_to_resume(char *dirname){  DIR *dir;  struct dirent *dent;  char next_dir[PATH_MAX];  struct stat estat;  url *purl;  if(!(dir = opendir(dirname)))  {    xperror(dirname);    return;  }  gui_set_msg(gettext("Searching for files to resume"), 0);  while((dent = readdir(dir)))  {    _Xt_Serve;    snprintf(next_dir, sizeof(next_dir), "%s/%s", dirname, dent->d_name);    if(!strcmp(dent->d_name, "."))      continue;    if(!strcmp(dent->d_name, ".."))      continue;    if(lstat(next_dir, &estat))    {      xperror(next_dir);      continue;    }    if(S_ISDIR(estat.st_mode))    {      if(!strcmp(dent->d_name, ".pavuk_info") && cfg.enable_info)        continue;      get_urls_to_resume(next_dir);    }    else if(!strncmp(".in_", dent->d_name, 4))    {      snprintf(next_dir, sizeof(next_dir), "%s/%s", dirname, dent->d_name + 4);      if((purl = filename_to_url(next_dir)))      {        if(cfg.mode != MODE_MIRROR)        {          xprintf(1, gettext("Adding %s to resume list\n"), next_dir);        }        purl->status |= URL_ISSTARTING;        url_set_filename(purl, tl_strdup(next_dir));        append_url_to_list(purl);      }    }#ifdef I_FACE    if(cfg.xi_face && (cfg.rbreak || cfg.stop))      break;#endif  }  closedir(dir);}void get_urls_to_synchronize(char *dirname, dllist ** list){  DIR *dir;  struct dirent *dent;  char next_dir[PATH_MAX];  struct stat estat;  url *purl;  if(!(dir = opendir(dirname)))  {    xperror(dirname);    return;  }  gui_set_msg(gettext("Searching for documents to synchronize"), 0);  while((dent = readdir(dir)))  {    _Xt_Serve;    snprintf(next_dir, sizeof(next_dir), "%s/%s", dirname, dent->d_name);    if(!strcmp(dent->d_name, "."))      continue;    if(!strcmp(dent->d_name, ".."))      continue;    if(lstat(next_dir, &estat))    {      xperror(next_dir);      continue;    }    if(S_ISDIR(estat.st_mode))    {      if(!strcmp(dent->d_name, ".pavuk_info") && cfg.enable_info)        continue;      strcat(next_dir, "/");      if((purl = filename_to_url(next_dir)) &&        purl->type == URLT_FTP && !cfg.store_index)      {        purl->status |= URL_ISSTARTING;        purl->extension = ftp_url_ext_new(FTP_TYPE_D, -1, -1, NULL, 0);        url_set_filename(purl,          tl_str_concat(NULL, next_dir, priv_cfg.index_name, NULL));        *list = dllist_prepend(*list, (dllist_t) purl);      }      else if(purl)      {        free_deep_url(purl);        _free(purl);      }      next_dir[strlen(next_dir) - 1] = '\0';      get_urls_to_synchronize(next_dir, list);    }    else if(cfg.enable_info && !strcmp(dent->d_name, ".lock"))    {      /* do nothing */      continue;    }    else if(!strncmp(".in_", dent->d_name, 4))    {      snprintf(next_dir, sizeof(next_dir), "%s/%s", dirname, dent->d_name + 4);      if((purl = filename_to_url(next_dir)))      {        char *ustr;        ustr = url_to_urlstr(purl, FALSE);        if(cfg.mode != MODE_MIRROR)        {          xprintf(1, gettext("Adding file %s to sync list as URL %s\n"),            next_dir, ustr);        }        _free(ustr);        if(purl->type == URLT_FTP)        {          int tp;          if(purl->p.ftp.dir)            tp = FTP_TYPE_D;          else            tp = FTP_TYPE_F;          purl->extension = ftp_url_ext_new(tp, -1, -1, NULL, 0);        }        purl->status |= URL_ISSTARTING;        url_set_filename(purl, tl_strdup(next_dir));        *list = dllist_prepend(*list, (dllist_t) purl);      }    }    else    {      if((purl = filename_to_url(next_dir)))      {        char *ustr;        ustr = url_to_urlstr(purl, FALSE);        if(cfg.mode != MODE_MIRROR)        {          xprintf(1, gettext("Adding file %s to sync list as URL %s\n"),            next_dir, ustr);        }        _free(ustr);        if(purl->type == URLT_FTP)        {          int tp;          if(purl->p.ftp.dir)            tp = FTP_TYPE_D;#ifdef S_ISLNK          else if(S_ISLNK(estat.st_mode))            tp = FTP_TYPE_L;#endif          else            tp = FTP_TYPE_F;          purl->extension = ftp_url_ext_new(tp, -1, -1, NULL, 0);        }        purl->status |= URL_ISSTARTING;        url_set_filename(purl, tl_strdup(next_dir));        *list = dllist_prepend(*list, (dllist_t) purl);      }    }#ifdef I_FACE    if(cfg.xi_face && (cfg.rbreak || cfg.stop))      break;#endif  }  closedir(dir);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -