⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 odidx.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
/*************************************************************************************************
 * Utility for indexing document files into a database of Odeum
 *                                                      Copyright (C) 2000-2003 Mikio Hirabayashi
 * This file is part of QDBM, Quick Database Manager.
 * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU
 * Lesser General Public License as published by the Free Software Foundation; either version
 * 2.1 of the License or any later version.  QDBM is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 * details.
 * You should have received a copy of the GNU Lesser General Public License along with QDBM; if
 * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 * 02111-1307 USA.
 *************************************************************************************************/


#include <depot.h>
#include <cabin.h>
#include <odeum.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
#include <time.h>
#include <signal.h>

#undef TRUE
#define TRUE           1                 /* boolean true */
#undef FALSE
#define FALSE          0                 /* boolean false */

#define PATHCHR        '/'               /* delimiter character of path */
#define EXTCHR         '.'               /* delimiter character of extension */
#define CDIRSTR        "."               /* string of current directory */
#define PDIRSTR        ".."              /* string of parent directory */
#define MTDBNAME       "_mtime"          /* name of the database for last modified times */
#define MTDBLRM        81                /* records in a leaf node of time database */
#define MTDBNIM        192               /* records in a non-leaf node of time database */
#define MTDBLCN        64                /* number of leaf cache of time database */
#define MTDBNCN        32                /* number of non-leaf cache of time database */
#define SCDBNAME       "_score"          /* name of the database for scores */
#define SCDBBNUM       32749             /* bucket number of score database */
#define PATHBUFSIZ     2048              /* size of a path buffer */
#define MAXLOAD        0.85              /* max ratio of bucket loading */
#define KEYNUM         32                /* number of keywords to store */


/* for Win32 and RISC OS */
#if defined(_WIN32)
#undef PATHCHR
#define PATHCHR        '\\'
#undef EXTCHR
#define EXTCHR         '.'
#undef CDIRSTR
#define CDIRSTR        "."
#undef PDIRSTR
#define PDIRSTR        ".."
#elif defined(__riscos__) || defined(__riscos)
#include <unixlib/local.h>
int __riscosify_control = __RISCOSIFY_NO_PROCESS;
#undef PATHCHR
#define PATHCHR        '.'
#undef EXTCHR
#define EXTCHR         '/'
#undef CDIRSTR
#define CDIRSTR        "@"
#undef PDIRSTR
#define PDIRSTR        "^"
#endif


/* global variables */
const char *progname;                    /* program name */
volatile sig_atomic_t sigterm;           /* flag for termination signal; written by the handler */


/* function prototypes */
int main(int argc, char **argv);
void setsignals(void);
void sigtermhandler(int num);
void usage(void);
int runregister(int argc, char **argv);
int runrelate(int argc, char **argv);
int runpurge(int argc, char **argv);
int fwimatch(const char *str, const char *key);
int bwimatch(const char *str, const char *key);
int bwimatchlist(const char *str, const CBLIST *keys);
char *fgetl(FILE *IN);
void pdperror(const char *name);
void printferror(const char *format, ...);
void printfinfo(const char *format, ...);
const char *datestr(int t);
int proclist(const char *name, const char *lfile, int wmax, int ft,
             const CBLIST *tsuflist, const CBLIST *hsuflist);
int procdir(const char *name, const char *dir, int wmax, int ft,
            const CBLIST *tsuflist, const CBLIST *hsuflist);
int indexdir(ODEUM *odeum, VILLA *mtdb, const char *name, const char *dir, int wmax, int ft,
             const CBLIST *tsuflist, const CBLIST *hsuflist);
int indexfile(ODEUM *odeum, VILLA *mtdb, const char *name, const char *file, int wmax, int ft,
              const CBLIST *tsuflist, const CBLIST *hsuflist);
char *filetouri(const char *file);
ODDOC *makedocplain(const char *uri, const char *text, const char *date);
ODDOC *makedochtml(const char *uri, const char *html, const char *date);
CBLIST *htmllist(const char *html);
int procrelate(const char *name);
int procpurge(const char *name);


/* main routine.
   Dispatches on the sub-command in `argv[1]' ("register", "relate" or "purge") and returns the
   value of the selected runner.  Any other invocation prints the usage and exits. */
int main(int argc, char **argv){
  int rv;
  progname = argv[0];
  sigterm = FALSE;
  setsignals();
  if(argc < 2) usage();                  /* usage() does not return */
  rv = 0;
  if(!strcmp(argv[1], "register")){
    rv = runregister(argc, argv);
  } else if(!strcmp(argv[1], "relate")){
    rv = runrelate(argc, argv);
  } else if(!strcmp(argv[1], "purge")){
    rv = runpurge(argc, argv);
  } else {
    usage();
  }
  return rv;
}


/* set signal handlers.
   The handler is installed for the signal numbers 1, 2, 3, 13 and 15.  On POSIX systems these
   conventionally are SIGHUP, SIGINT, SIGQUIT, SIGPIPE and SIGTERM; presumably the literal
   numbers are used because not every target platform defines all of those macros. */
void setsignals(void){
  signal(1, sigtermhandler);
  signal(2, sigtermhandler);
  signal(3, sigtermhandler);
  signal(13, sigtermhandler);
  signal(15, sigtermhandler);
}


/* handler of termination signal.
   Restores the default disposition of the signal (so that a second delivery is handled by the
   default action), raises the `sigterm' flag polled by the processing loops, and reports it.
   NOTE(review): printfinfo() uses stdio, which is not async-signal-safe.  The message is kept
   because it is part of the observable output; moving it out of the handler would require
   changes in the code that polls `sigterm'. */
void sigtermhandler(int num){
  signal(num, SIG_DFL);
  sigterm = TRUE;
  printfinfo("the termination signal %d catched", num);
}


/* print the usage and exit.
   Writes the help text to the standard error output and terminates with the status 1. */
void usage(void){
  fprintf(stderr, "%s: indexer of document files\n", progname);
  fprintf(stderr, "\n");
  fprintf(stderr, "usage:\n");
  fprintf(stderr, "  %s register [-l file] [-wmax num] [-tsuf sufs] [-hsuf sufs] [-ft]"
          " name [dir]\n", progname);
  fprintf(stderr, "  %s relate name\n", progname);
  fprintf(stderr, "  %s purge name\n", progname);
  exit(1);
}


/* parse arguments of register command.
   Options are recognized only before the database name.  `-tsuf' and `-hsuf' take
   comma-separated suffix lists and default to ".txt,.text" and ".html,.htm".  `-wmax' defaults
   to -1.  If `dir' is omitted, the current directory is used; one trailing path delimiter is
   stripped from it.  With `-l', the files are taken from the list file and `dir' is not used.
   The return value is the exit status: that of proclist() or procdir(), or 1 if the directory
   path does not fit into the path buffer. */
int runregister(int argc, char **argv){
  char *name, *dir, *lfile, *tsuf, *hsuf, path[PATHBUFSIZ];
  int i, wmax, ft, plen, rv;
  CBLIST *tsuflist, *hsuflist;
  name = NULL;
  dir = NULL;
  lfile = NULL;
  tsuf = NULL;
  hsuf = NULL;
  wmax = -1;
  ft = FALSE;
  for(i = 2; i < argc; i++){
    if(!name && argv[i][0] == '-'){
      if(!strcmp(argv[i], "-l")){
        if(++i >= argc) usage();
        lfile = argv[i];
      } else if(!strcmp(argv[i], "-wmax")){
        if(++i >= argc) usage();
        wmax = atoi(argv[i]);
      } else if(!strcmp(argv[i], "-tsuf")){
        if(++i >= argc) usage();
        tsuf = argv[i];
      } else if(!strcmp(argv[i], "-hsuf")){
        if(++i >= argc) usage();
        hsuf = argv[i];
      } else if(!strcmp(argv[i], "-ft")){
        ft = TRUE;
      } else {
        usage();
      }
    } else if(!name){
      name = argv[i];
    } else if(!dir){
      dir = argv[i];
    } else {
      usage();
    }
  }
  if(!name) usage();
  if(!dir) dir = CDIRSTR;
  /* the directory comes from the command line: refuse it rather than overflow `path' */
  if(strlen(dir) >= (size_t)PATHBUFSIZ){
    printferror("%s: path is too long", dir);
    return 1;
  }
  plen = sprintf(path, "%s", dir);
  if(plen > 1 && path[plen-1] == PATHCHR) path[plen-1] = '\0';
  tsuflist = cbsplit(tsuf ? tsuf : ".txt,.text", -1, ",");
  hsuflist = cbsplit(hsuf ? hsuf : ".html,.htm", -1, ",");
  if(lfile){
    rv = proclist(name, lfile, wmax, ft, tsuflist, hsuflist);
  } else {
    rv = procdir(name, path, wmax, ft, tsuflist, hsuflist);
  }
  cblistclose(hsuflist);
  cblistclose(tsuflist);
  return rv;
}


/* parse arguments of relate command.
   Exactly one argument, the database name, is accepted and no option is recognized.
   The return value is that of procrelate(). */
int runrelate(int argc, char **argv){
  char *name;
  int i, rv;
  name = NULL;
  for(i = 2; i < argc; i++){
    if(!name && argv[i][0] == '-'){
      usage();
    } else if(!name){
      name = argv[i];
    } else {
      usage();
    }
  }
  if(!name) usage();
  rv = procrelate(name);
  return rv;
}


/* parse arguments of purge command.
   Exactly one argument, the database name, is accepted and no option is recognized.
   The return value is that of procpurge(). */
int runpurge(int argc, char **argv){
  char *name;
  int i, rv;
  name = NULL;
  for(i = 2; i < argc; i++){
    if(!name && argv[i][0] == '-'){
      usage();
    } else if(!name){
      name = argv[i];
    } else {
      usage();
    }
  }
  if(!name) usage();
  rv = procpurge(name);
  return rv;
}


/* case insensitive forward matching.
   `str' specifies the string to be tested.
   `key' specifies the prefix to look for.
   The return value is true if `str' begins with `key' regardless of case, else it is false. */
int fwimatch(const char *str, const char *key){
  int len, i;
  len = strlen(key);
  for(i = 0; i < len; i++){
    if(str[i] == '\0') return FALSE;     /* `str' is shorter than `key' */
    /* tolower() requires a value representable as unsigned char */
    if(tolower((unsigned char)str[i]) != tolower((unsigned char)key[i])) return FALSE;
  }
  return TRUE;
}


/* case insensitive backward matching.
   `str' specifies the string to be tested.
   `key' specifies the suffix to look for.
   The return value is true if `str' ends with `key' regardless of case, else it is false. */
int bwimatch(const char *str, const char *key){
  int slen, klen, i;
  slen = strlen(str);
  klen = strlen(key);
  /* a key longer than the string cannot match; checking first keeps str[slen-i] in bounds */
  if(klen > slen) return FALSE;
  for(i = 1; i <= klen; i++){
    /* tolower() requires a value representable as unsigned char */
    if(tolower((unsigned char)str[slen-i]) != tolower((unsigned char)key[klen-i])) return FALSE;
  }
  return TRUE;
}


/* case insensitive backward matching with a list.
   `str' specifies the string to be tested.
   `keys' specifies a list of suffixes.
   The return value is true if `str' ends with any element of `keys', else it is false. */
int bwimatchlist(const char *str, const CBLIST *keys){
  int i;
  for(i = 0; i < cblistnum(keys); i++){
    if(bwimatch(str, cblistval(keys, i, NULL))) return TRUE;
  }
  return FALSE;
}


/* read a line.
   `IN' specifies the input stream.
   The return value is a newly allocated string without the trailing line feed, or NULL if the
   end of the stream was reached before any character was read.  A NUL byte in the input ends
   the line as well.  The caller releases the string with free().
   The buffer starts at 256 bytes and doubles when full.  No NULL check follows cbrealloc(),
   as in the original; presumably it handles allocation failure itself (to be confirmed). */
char *fgetl(FILE *IN){
  char *buf;
  int c, len, blen;
  buf = NULL;
  len = 0;
  blen = 256;
  while((c = fgetc(IN)) != EOF){
    /* allocate on the first character and grow only when the buffer is full */
    if(!buf || blen <= len){
      if(buf) blen *= 2;
      buf = cbrealloc(buf, blen + 1);
    }
    if(c == '\n') c = '\0';
    buf[len++] = c;
    if(c == '\0') break;
  }
  if(!buf) return NULL;
  buf[len] = '\0';
  return buf;
}


/* print an error message.
   `name' specifies the name of the database concerned.
   The message of the last error code `dpecode' is written to the standard output. */
void pdperror(const char *name){
  printf("%s: ERROR: %s: %s\n", progname, name, dperrmsg(dpecode));
  fflush(stdout);
}


/* print formatted error string and flush the buffer.
   `format' specifies a printf-style format, followed by its arguments.
   The line is prefixed with the program name and "ERROR:" and goes to the standard output. */
void printferror(const char *format, ...){
  va_list ap;
  va_start(ap, format);
  printf("%s: ERROR: ", progname);
  vprintf(format, ap);
  putchar('\n');
  fflush(stdout);
  va_end(ap);
}


/* print formatted information string and flush the buffer.
   `format' specifies a printf-style format, followed by its arguments.
   The line is prefixed with the program name and "INFO:" and goes to the standard output. */
void printfinfo(const char *format, ...){
  va_list ap;
  va_start(ap, format);
  printf("%s: INFO: ", progname);
  vprintf(format, ap);
  putchar('\n');
  fflush(stdout);
  va_end(ap);
}


/* get static string of the date.
   `t' specifies a time value, converted to `time_t' and broken down as local time.
   The return value is a string in the form "YYYY/MM/DD hh:mm:ss", or the all-zero date string
   if the conversion fails.  It points to a static buffer which is overwritten by the next
   call, so it must not be freed. */
const char *datestr(int t){
  static char buf[32];
  struct tm *stp;
  time_t tt;
  tt = (time_t)t;
  if(!(stp = localtime(&tt))) return "0000/00/00 00:00:00";
  sprintf(buf, "%04d/%02d/%02d %02d:%02d:%02d",
          stp->tm_year + 1900, stp->tm_mon + 1, stp->tm_mday,
          stp->tm_hour, stp->tm_min, stp->tm_sec);
  return buf;
}


/* processing with finding files in a list file.
   `name' specifies the name of the index database.
   `lfile' specifies the path of a file listing one file path per line; "-" means the standard
   input.
   `wmax', `ft', `tsuflist' and `hsuflist' are passed through to indexfile() for every line.
   The return value is 0 on success, or 1 if the database name is too long, anything could not
   be opened or closed, a file failed to be indexed, or a termination signal was caught. */
int proclist(const char *name, const char *lfile, int wmax, int ft,
             const CBLIST *tsuflist, const CBLIST *hsuflist){
  ODEUM *odeum;
  VILLA *mtdb;
  FILE *IN;
  char *line, path[PATHBUFSIZ];
  int err, fatal;
  /* `path' receives name + delimiter + MTDBNAME + NUL: reject names that would overflow it */
  if(strlen(name) + strlen(MTDBNAME) + 2 > (size_t)PATHBUFSIZ){
    printferror("%s: database name is too long", name);
    return 1;
  }
  if(!strcmp(lfile, "-")){
    IN = stdin;
  } else {
    if(!(IN = fopen(lfile, "rb"))){
      printferror("%s: file cannot be opened", lfile);
      return 1;
    }
  }
  printfinfo("%s: registration started", name);
  if(!(odeum = odopen(name, OD_OWRITER | OD_OCREAT))){
    pdperror(name);
    if(IN != stdin) fclose(IN);
    return 1;
  }
  sprintf(path, "%s%c%s", name, PATHCHR, MTDBNAME);
  if(!(mtdb = vlopen(path, VL_OWRITER | VL_OCREAT, VL_CMPLEX))){
    pdperror(name);
    odclose(odeum);
    if(IN != stdin) fclose(IN);
    return 1;
  }
  vlsettuning(mtdb, MTDBLRM, MTDBNIM, MTDBLCN, MTDBNCN);
  printfinfo("%s: database opened: fsiz=%d dnum=%d wnum=%d bnum=%d",
             name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
  err = FALSE;
  while((line = fgetl(IN)) != NULL){
    if(sigterm){
      printferror("aborting due to a termination signal");
      free(line);
      err = TRUE;
      break;
    }
    /* a failure on one file is recorded but does not stop the remaining lines */
    if(!indexfile(odeum, mtdb, name, line, wmax, ft, tsuflist, hsuflist)) err = TRUE;
    free(line);
  }
  /* the fatal state must be sampled before the handle is closed */
  fatal = odfatalerror(odeum);
  printfinfo("%s: database closing: fsiz=%d dnum=%d wnum=%d bnum=%d",
             name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
  if(!vlclose(mtdb)){
    pdperror(name);
    err = TRUE;
  }
  if(!odclose(odeum)){
    pdperror(name);
    err = TRUE;
  }
  if(IN != stdin) fclose(IN);
  if(err){
    printfinfo("%s: registration was over%s", name, fatal ? " with fatal error" : "");
  } else {
    printfinfo("%s: registration completed successfully", name);
  }
  return err ? 1 : 0;
}


/* processing with finding files in a directory.
   The remainder of this function lies beyond this part of the listing; the statements below
   are reproduced unchanged.
   NOTE(review): the sprintf() into `path' is not bounded by PATHBUFSIZ; a length check on
   `name' like the one in proclist() should be considered once the whole function is at hand. */
int procdir(const char *name, const char *dir, int wmax, int ft,
            const CBLIST *tsuflist, const CBLIST *hsuflist){
  ODEUM *odeum;
  VILLA *mtdb;
  char path[PATHBUFSIZ];
  int err, fatal;
  printfinfo("%s: registration started", name);
  if(!(odeum = odopen(name, OD_OWRITER | OD_OCREAT))){
    pdperror(name);
    return 1;
  }
  sprintf(path, "%s%c%s", name, PATHCHR, MTDBNAME);
  if(!(mtdb = vlopen(path, VL_OWRITER | VL_OCREAT, VL_CMPLEX))){
    pdperror(name);
    odclose(odeum);
    return 1;
  }
  vlsettuning(mtdb, MTDBLRM, MTDBNIM, MTDBLCN, MTDBNCN);
  printfinfo("%s: database opened: fsiz=%d dnum=%d wnum=%d bnum=%d",
             name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
  err = FALSE;
  if(!indexdir(odeum, mtdb, name, dir, wmax, ft, tsuflist, hsuflist)) err = TRUE;
  fatal = odfatalerror(odeum);
  printfinfo("%s: database closing: fsiz=%d dnum=%d wnum=%d bnum=%d",
             name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
  if(!vlclose(mtdb)){
    pdperror(name);
    err = TRUE;
  }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -