⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 odidx.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
  if(!odclose(odeum)){    pdperror(name);    err = TRUE;  }  if(err){    printfinfo("%s: registration was over%s", name, fatal ? " with fatal error" : "");  } else {    printfinfo("%s: registration completed successfully", name);  }  return err ? 1 : 0;}/* find and index files in a directory */int indexdir(ODEUM *odeum, VILLA *mtdb, const char *name, const char *dir, int wmax, int ft,             const CBLIST *tsuflist, const CBLIST *hsuflist){  CBLIST *files;  const char *file;  char path[PATHBUFSIZ];  int i, isroot, isdir, err;  if(!(files = cbdirlist(dir))){    printferror("%s: directory cannot be opened", dir);    return FALSE;  }  isroot = dir[0] == PATHCHR && dir[1] == '\0';  err = FALSE;  for(i = 0; i < cblistnum(files); i++){    if(sigterm){      printferror("aborting due to a termination signal");      cblistclose(files);      return FALSE;    }    file = cblistval(files, i, NULL);    if(!strcmp(file, CDIRSTR) || !strcmp(file, PDIRSTR)) continue;    if(isroot){      sprintf(path, "%s%s", dir, file);    } else {      sprintf(path, "%s%c%s", dir, PATHCHR, file);    }    if(!cbfilestat(path, &isdir, NULL, NULL)){      printferror("%s: file does not exist", file);      err = TRUE;      continue;    }    if(isdir){      if(!indexdir(odeum, mtdb, name, path, wmax, ft, tsuflist, hsuflist)) err = TRUE;    } else {      if(!indexfile(odeum, mtdb, name, path, wmax, ft, tsuflist, hsuflist)) err = TRUE;    }  }  cblistclose(files);  return err ? 
FALSE : TRUE;}/* index a file into the database */int indexfile(ODEUM *odeum, VILLA *mtdb, const char *name, const char *file, int wmax, int ft,              const CBLIST *tsuflist, const CBLIST *hsuflist){  static int cnt = 0;  char *vbuf, *buf, *uri;  const char *title;  int size, mtime, hot, vsiz, wnum, bnum;  ODDOC *doc;  if(!cbfilestat(file, NULL, &size, &mtime)){    printferror("%s: file does not exist", file);    return FALSE;  }  hot = TRUE;  if((vbuf = vlget(mtdb, file, -1, &vsiz)) != NULL){    if(vsiz == sizeof(int) && mtime <= *(int *)vbuf) hot = FALSE;    free(vbuf);  }  if(!hot){    printfinfo("%s: passed", file);    return TRUE;  }  doc = NULL;  uri = filetouri(file);  if(bwimatchlist(file, tsuflist)){    if(!(buf = cbreadfile(file, NULL))){      printferror("%s: file cannot be opened", file);      return FALSE;    }    doc = makedocplain(uri, buf, datestr(mtime));    free(buf);  } else if(bwimatchlist(file, hsuflist)){    if(!(buf = cbreadfile(file, NULL))){      printferror("%s: file cannot be opened", file);      return FALSE;    }    doc = makedochtml(uri, buf, datestr(mtime));    free(buf);  }  free(uri);  if(doc){    if(ft && (!(title = oddocgetattr(doc, "title")) || strlen(title) < 1)){      if((title = strrchr(file, PATHCHR)) != NULL){        title++;      }  else {        title = file;      }      oddocaddattr(doc, "title", title);    }    if(odput(odeum, doc, wmax, TRUE) &&       vlput(mtdb, file, -1, (char *)&mtime, sizeof(int), VL_DOVER)){      printfinfo("%s: registered: id=%d wnum=%d",                 file, oddocid(doc), cblistnum(oddocnwords(doc)));      cnt++;    } else {      pdperror(file);    }    oddocclose(doc);  }  wnum = odwnum(odeum);  bnum = odbnum(odeum);  if(wnum != -1 && bnum != -1 && (double)wnum / (double)bnum > MAXLOAD){    printfinfo("%s: optimizing started: fsiz=%d dnum=%d wnum=%d bnum=%d",               name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));    if(!odoptimize(odeum)){      pdperror(file);   
   return FALSE;    }    printfinfo("%s: optimizing completed: fsiz=%d dnum=%d wnum=%d bnum=%d",               name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));  }  if(cnt >= 256){    printfinfo("%s: database status: fsiz=%d dnum=%d wnum=%d bnum=%d",               name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));    cnt = 0;  }  return TRUE;}/* make the url from file path */char *filetouri(const char *file){  CBLIST *list;  char str[PATHBUFSIZ], *wp, *enc;  const char *name;  int i, nsiz;  sprintf(str, "%c", PATHCHR);  list = cbsplit(file, -1, str);  wp = str;  for(i = 0; i < cblistnum(list); i++){    if(i > 0) *(wp++) = '/';    name = cblistval(list, i, &nsiz);    enc = cburlencode(name, nsiz);    wp += sprintf(wp, "%s", enc);    free(enc);  }  cblistclose(list);  *wp = '\0';  return cbmemdup(str, -1);}/* make a document of plain text */ODDOC *makedocplain(const char *uri, const char *text, const char *date){  ODDOC *doc;  CBLIST *awords;  const char *asis;  char *normal;  int i;  doc = oddocopen(uri);  if(date) oddocaddattr(doc, "date", date);  awords = odbreaktext(text);  for(i = 0; i < cblistnum(awords); i++){    asis = cblistval(awords, i, NULL);    normal = odnormalizeword(asis);    oddocaddword(doc, normal, asis);    free(normal);  }  cblistclose(awords);  return doc;}/* make a document of HTML */ODDOC *makedochtml(const char *uri, const char *html, const char *date){  static CBMAP *pairs = NULL;  ODDOC *doc;  CBLIST *elems, *awords;  const char *text, *asis;  char kbuf[8], vbuf[8], *rtext, *normal;  int i, j, body;  if(!pairs){    pairs = cbmapopen();    cbglobalgc(pairs, (void (*)(void *))cbmapclose);    cbmapput(pairs, "&amp;", -1, "&", 1, TRUE);    cbmapput(pairs, "&lt;", -1, "<", 1, TRUE);    cbmapput(pairs, "&gt;", -1, ">", 1, TRUE);    cbmapput(pairs, "&quot;", -1, "\"", 1, TRUE);    cbmapput(pairs, "&apos;", -1, "'", 1, TRUE);    cbmapput(pairs, "&nbsp;", -1, " ", 1, TRUE);    cbmapput(pairs, "&copy;", -1, "(C)", -1, 
TRUE);    cbmapput(pairs, "&reg;", -1, "(R)", -1, TRUE);    cbmapput(pairs, "&trade;", -1, "(TM)", -1, TRUE);    for(i = 1; i <= 127; i++){      sprintf(vbuf, "%c", i);      sprintf(kbuf, "&#%d;", i);      cbmapput(pairs, kbuf, -1, vbuf, 1, TRUE);    }  }  doc = oddocopen(uri);  if(date) oddocaddattr(doc, "date", date);  elems = htmllist(html);  body = FALSE;  for(i = 0; i < cblistnum(elems); i++){    text = cblistval(elems, i, NULL);    if(fwimatch(text, "<title")){      i++;      if(i < cblistnum(elems)){        text = cblistval(elems, i, NULL);        if(text[0] == '<') text = "";        rtext = cbreplace(text, pairs);        for(j = 0; rtext[j] != '\0'; j++){          if(strchr("\t\n\v\f\r", rtext[j])) rtext[j] = ' ';        }        while(--j >= 0){          if(rtext[j] != ' ') break;          rtext[j] = '\0';        }        for(j = 0; rtext[j] != '\0'; j++){          if(rtext[j] != ' ') break;        }        oddocaddattr(doc, "title", rtext + j);        awords = odbreaktext(rtext);        for(j = 0; j < cblistnum(awords); j++){          asis = cblistval(awords, j, NULL);          normal = odnormalizeword(asis);          oddocaddword(doc, normal, "");          free(normal);        }        cblistclose(awords);        free(rtext);      }    } else if(fwimatch(text, "<body")){      body = TRUE;    } else if(body && text[0] != '<'){      rtext = cbreplace(text, pairs);      awords = odbreaktext(rtext);      for(j = 0; j < cblistnum(awords); j++){        asis = cblistval(awords, j, NULL);        normal = odnormalizeword(asis);        oddocaddword(doc, normal, asis);        free(normal);      }      cblistclose(awords);      free(rtext);    }  }  if(!body){    for(i = 0; i < cblistnum(elems); i++){      text = cblistval(elems, i, NULL);      if(fwimatch(text, "<title")){        i++;      } else if(text[0] != '<'){        rtext = cbreplace(text, pairs);        awords = odbreaktext(rtext);        for(j = 0; j < cblistnum(awords); j++){          asis = 
cblistval(awords, j, NULL);          normal = odnormalizeword(asis);          oddocaddword(doc, normal, asis);          free(normal);        }        cblistclose(awords);        free(rtext);      }    }  }  cblistclose(elems);  return doc;}/* break HTML into elements */CBLIST *htmllist(const char *html){  CBLIST *list;  int i, pv, tag;  char *ep;  list = cblistopen();  i = 0;  pv = 0;  tag = FALSE;  while(TRUE){    if(html[i] == '\0'){      if(i > pv) cblistpush(list, html + pv, i - pv);      break;    } else if(fwimatch(html + i, "<!--")){      if(i > pv) cblistpush(list, html + pv, i - pv);      if((ep = strstr(html + i, "-->")) != NULL){        i = ep - html + 2;        pv = i + 1;      }    } else if(!tag && html[i] == '<'){      if(i > pv) cblistpush(list, html + pv, i - pv);      tag = TRUE;      pv = i;    } else if(tag && html[i] == '>'){      if(i > pv) cblistpush(list, html + pv, i - pv + 1);      tag = FALSE;      pv = i + 1;    }    i++;  }  return list;}/* register scores of documents */int procrelate(const char *name){  ODEUM *odeum;  DEPOT *scdb;  ODDOC *doc;  CBMAP *scores;  const char *file;  char path[PATHBUFSIZ], *mbuf;  int err, fatal, id, msiz;  printfinfo("%s: relating started", name);  if(!(odeum = odopen(name, OD_OWRITER))){    pdperror(name);    return 1;  }  sprintf(path, "%s%c%s", name, PATHCHR, SCDBNAME);  if(!(scdb = dpopen(path, OD_OWRITER | OD_OCREAT, SCDBBNUM))){    pdperror(name);    odclose(odeum);    return 1;  }  printfinfo("%s: database opened: fsiz=%d dnum=%d wnum=%d bnum=%d",             name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));  err = FALSE;  if(!oditerinit(odeum)){    pdperror(name);    err = TRUE;  } else {    while(TRUE){      if(sigterm){        printferror("aborting due to a termination signal");        err = TRUE;        break;      }      if(!(doc = oditernext(odeum))){        if(dpecode != DP_ENOITEM){          pdperror(name);          err = TRUE;        }        break;      }      file = 
oddocuri(doc);      id = oddocid(doc);      scores = oddocscores(doc, KEYNUM, odeum);      mbuf = cbmapdump(scores, &msiz);      if(!dpput(scdb, (char *)&id, sizeof(int), mbuf, msiz, DP_DOVER)){        pdperror(name);        err = TRUE;      } else {        printfinfo("%s: related", file);      }      free(mbuf);      cbmapclose(scores);      oddocclose(doc);      if(err) break;    }  }  fatal = odfatalerror(odeum);  printfinfo("%s: database closing: fsiz=%d dnum=%d wnum=%d bnum=%d",             name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));  if(!dpclose(scdb)){    pdperror(name);    err = TRUE;  }  if(!odclose(odeum)){    pdperror(name);    err = TRUE;  }  if(err){    printfinfo("%s: relating was over%s", name, fatal ? " with fatal error" : "");  } else {    printfinfo("%s: relating completed successfully", name);  }  return err ? 1 : 0;}/* purge documents which is not existing. */int procpurge(const char *name){  ODEUM *odeum;  ODDOC *doc;  const char *file;  int err, fatal;  printfinfo("%s: purging started", name);  if(!(odeum = odopen(name, OD_OWRITER))){    pdperror(name);    return 1;  }  printfinfo("%s: database opened: fsiz=%d dnum=%d wnum=%d bnum=%d",             name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));  err = FALSE;  if(!oditerinit(odeum)){    pdperror(name);    err = TRUE;  } else {    while(TRUE){      if(sigterm){        printferror("aborting due to a termination signal");        err = TRUE;        break;      }      if(!(doc = oditernext(odeum))){        if(dpecode != DP_ENOITEM){          pdperror(name);          err = TRUE;        }        break;      }      file = oddocuri(doc);      if(cbfilestat(file, NULL, NULL, NULL)){        printfinfo("%s: passed", file);      } else {        if(!odout(odeum, file)){          pdperror(file);          err = TRUE;        }        printfinfo("%s: purged", file);      }      oddocclose(doc);    }  }  fatal = odfatalerror(odeum);  printfinfo("%s: database closing: 
fsiz=%d dnum=%d wnum=%d bnum=%d",             name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));  if(!odclose(odeum)){    pdperror(name);    err = TRUE;  }  if(err){    printfinfo("%s: purging was over%s", name, fatal ? " with fatal error" : "");  } else {    printfinfo("%s: purging completed successfully", name);  }  return err ? 1 : 0;}/* END OF FILE */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -