⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 odeum.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
/*************************************************************************************************
 * Implementation of Odeum
 *                                                      Copyright (C) 2000-2003 Mikio Hirabayashi
 * This file is part of QDBM, Quick Database Manager.
 * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU
 * Lesser General Public License as published by the Free Software Foundation; either version
 * 2.1 of the License or any later version.  QDBM is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 * details.
 * You should have received a copy of the GNU Lesser General Public License along with QDBM; if
 * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 * 02111-1307 USA.
 *************************************************************************************************/


#include "odeum.h"
#include "myconf.h"

#define OD_NAMEMAX     256               /* max size of a database name */
#define OD_DIRMODE     00755             /* permission of a creating directory */
#define OD_PATHBUFSIZ  1024              /* size of a path buffer */
#define OD_NUMBUFSIZ   32                /* size of a buffer for a number */
#define OD_DOCSNAME    "docs"            /* name of the database for documents */
#define OD_INDEXNAME   "index"           /* name of the database for inverted index */
#define OD_RDOCSNAME   "rdocs"           /* name of the database for reverse dictionary */
#define OD_DOCSBNUM    509               /* initial bucket number of document database */
#define OD_DOCSDNUM    9                 /* division number of document database */
#define OD_DOCSALIGN   -4                /* alignment of document database */
#define OD_INDEXBNUM   8191              /* initial bucket number of inverted index */
#define OD_INDEXDNUM   5                 /* division number of inverted index */
#define OD_INDEXALIGN  -3                /* alignment of inverted index */
#define OD_RDOCSLRM    81                /* records in a leaf node of reverse dictionary */
#define OD_RDOCSNIM    192               /* records in a non-leaf node of reverse dictionary */
#define OD_RDOCSLCN    128               /* number of leaf cache of reverse dictionary */
#define OD_RDOCSNCN    32                /* number of non-leaf cache of reverse dictionary */
#define OD_SORTMAPMAX  262144            /* max number of records of sortmap */
#define OD_DMAXEXPR    "dmax"            /* key of max number of the document ID */
#define OD_DNUMEXPR    "dnum"            /* key of number of the documents */
#define OD_URIEXPR     "1"               /* map key of URI */
#define OD_ATTRSEXPR   "2"               /* map key of attributes */
#define OD_NWORDSEXPR  "3"               /* map key of normal words */
#define OD_AWORDSEXPR  "4"               /* map key of as-is words */
#define OD_WTOPRATE    0.1               /* ratio of top words */
#define OD_WTOPBONUS   5000              /* bonus points of top words */
#define OD_WOCCRPOINT  10000             /* points per occurrence */
#define OD_KEYCANDS    128               /* max number of candidates for keywords */
#define OD_SPACECHARS  "\t\n\v\f\r "     /* space characters */
#define OD_DELIMCHARS  "!\"#$%&'()*,./:;<=>?@[\\]^`{|}~"  /* delimiter characters */
#define OD_MAXWORDLEN  48                /* max length of a word */

typedef struct {                         /* type of structure for word counting */
  const char *word;                      /* pointer to the word */
  int num;                               /* frequency of the word */
} ODWORD;


/* private function prototypes */
static int odsortindex(ODEUM *odeum);
static int odsortcompare(const void *a, const void *b);
static int odpurgeindex(ODEUM *odeum);
static CBMAP *odpairsmap(const ODPAIR *pairs, int num);
static int odwordcompare(const void *a, const void *b);



/*************************************************************************************************
 * public objects
 *************************************************************************************************/


/* Get a database handle.
   `name' specifies the name of the database directory.  A name longer than `OD_NAMEMAX'
   characters is rejected with `DP_EMISC'.
   `omode' specifies the connection mode as a bitwise OR of `OD_OWRITER', `OD_OCREAT',
   `OD_OTRUNC' and `OD_ONOLCK'.  Without `OD_OWRITER' the databases are opened as a reader and
   `OD_OCREAT' and `OD_OTRUNC' have no effect.  With both `OD_OWRITER' and `OD_OCREAT' the
   directory is created if it does not exist yet.
   The return value is a new handle, or `NULL' on failure.  This function itself sets `dpecode'
   to `DP_EMISC', `DP_EMKDIR', `DP_ESTAT' or `DP_EBROKEN'; when one of the underlying databases
   cannot be opened, `dpecode' is left as the failing call set it (presumably; confirm against
   Curia and Villa).  Every database opened so far is closed again before `NULL' is returned.
   A handle obtained here is released by `odclose'. */
ODEUM *odopen(const char *name, int omode){
  int cromode, vlomode, inode, dmax, dnum;
  char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ], *tmp;
  struct stat sbuf;
  CURIA *docsdb, *indexdb;
  VILLA *rdocsdb;
  CBMAP *sortmap;
  ODEUM *odeum;
  assert(name);
  /* the length limit also guarantees that the three paths below fit in OD_PATHBUFSIZ */
  if(strlen(name) > OD_NAMEMAX){
    dpecode = DP_EMISC;
    return NULL;
  }
  /* translate the Odeum open mode into the modes of Curia and Villa */
  cromode = CR_OREADER;
  vlomode = VL_OREADER;
  if(omode & OD_OWRITER){
    cromode = CR_OWRITER;
    vlomode = VL_OWRITER;
    if(omode & OD_OCREAT){
      cromode |= CR_OCREAT;
      vlomode |= VL_OCREAT;
    }
    if(omode & OD_OTRUNC){
      cromode |= CR_OTRUNC;
      vlomode |= VL_OTRUNC;
    }
  }
  if(omode & OD_ONOLCK){
    cromode |= CR_ONOLCK;
    vlomode |= VL_ONOLCK;
  }
  sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME);
  sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME);
  sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME);
  docsdb = NULL;
  indexdb = NULL;
  rdocsdb = NULL;
  if((omode & OD_OWRITER) && (omode & OD_OCREAT)){
    /* an already existing directory is not an error */
    if(mkdir(name, OD_DIRMODE) == -1 && errno != EEXIST){
      dpecode = DP_EMKDIR;
      return NULL;
    }
  }
  if(stat(name, &sbuf) == -1){
    dpecode = DP_ESTAT;
    return NULL;
  }
  inode = sbuf.st_ino;
  if(!(docsdb = cropen(docsname, cromode, OD_DOCSBNUM, OD_DOCSDNUM))) return NULL;
  if(!(indexdb = cropen(indexname, cromode, OD_INDEXBNUM, OD_INDEXDNUM))){
    crclose(docsdb);
    return NULL;
  }
  if(omode & OD_OWRITER){
    if(!crsetalign(docsdb, OD_DOCSALIGN) || !crsetalign(indexdb, OD_INDEXALIGN)){
      crclose(indexdb);
      crclose(docsdb);
      return NULL;
    }
  }
  if(!(rdocsdb = vlopen(rdocsname, vlomode, VL_CMPLEX))){
    crclose(indexdb);
    crclose(docsdb);
    return NULL;
  }
  vlsettuning(rdocsdb, OD_RDOCSLRM, OD_RDOCSNIM, OD_RDOCSLCN, OD_RDOCSNCN);
  if(vlrnum(rdocsdb) > 0){
    /* a non-empty reverse dictionary must carry both counters; their keys include the
       terminating NUL (sizeof), whereas URI keys are stored without it */
    dmax = -1;
    dnum = -1;
    if((tmp = vlget(rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), NULL)) != NULL){
      dmax = atoi(tmp);
      free(tmp);
    }
    if((tmp = vlget(rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), NULL)) != NULL){
      dnum = atoi(tmp);
      free(tmp);
    }
    if(dmax < 0 || dnum < 0){
      vlclose(rdocsdb);
      crclose(indexdb);
      crclose(docsdb);
      dpecode = DP_EBROKEN;
      return NULL;
    }
  } else {
    dmax = 0;
    dnum = 0;
  }
  /* the sort map is created only after the last failure exit, so that no error path has to
     release it (it used to be leaked when the counters above were missing or negative) */
  if(omode & OD_OWRITER){
    sortmap = cbmapopen();
  } else {
    sortmap = NULL;
  }
  odeum = cbmalloc(sizeof(ODEUM));
  odeum->name = cbmemdup(name, -1);
  odeum->wmode = omode & OD_OWRITER;
  odeum->fatal = FALSE;
  odeum->inode = inode;
  odeum->docsdb = docsdb;
  odeum->indexdb = indexdb;
  odeum->rdocsdb = rdocsdb;
  odeum->sortmap = sortmap;
  odeum->dmax = dmax;
  odeum->dnum = dnum;
  return odeum;
}


/* Close a database handle.
   `odeum' specifies a database handle.
   For a writer, the counters `dmax' and `dnum' are stored into the reverse dictionary as
   decimal strings, the index is sorted by `odsortindex' and the sort map is released.
   The three underlying databases are closed and the handle is freed in any case, so the
   handle must not be used after this call.
   The return value is `TRUE' if every step succeeded, else `FALSE'. */
int odclose(ODEUM *odeum){
  char numbuf[OD_NUMBUFSIZ];
  int err;
  assert(odeum);
  err = FALSE;
  if(odeum->wmode){
    sprintf(numbuf, "%d", odeum->dmax);
    if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)) err = TRUE;
    sprintf(numbuf, "%d", odeum->dnum);
    if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)) err = TRUE;
    if(!odsortindex(odeum)) err = TRUE;
    cbmapclose(odeum->sortmap);
  }
  /* keep closing even after a failure so that nothing stays open */
  if(!vlclose(odeum->rdocsdb)) err = TRUE;
  if(!crclose(odeum->indexdb)) err = TRUE;
  if(!crclose(odeum->docsdb)) err = TRUE;
  free(odeum->name);
  free(odeum);
  return err ? FALSE : TRUE;
}


/* Store a document. 
*/
/* `odeum' specifies a database handle connected as a writer.
   `doc' specifies the document to store.  Its `id' member is set to the newly assigned ID on
   success.
   `wmax' limits the number of words kept in the stored document record: if it is not negative
   and less than the number of normal words, only the first `wmax' normal and as-is words are
   stored.  The scores written to the inverted index are computed from all normal words
   regardless of `wmax'.
   `over' specifies whether a document already registered under the same URI is replaced.  If
   it is false and such a document exists, nothing is stored and `dpecode' is `DP_EKEEP'.
   The return value is `TRUE' on success, else `FALSE'.  A handle whose `fatal' flag is set is
   refused with `DP_EFATAL', and a reader with `DP_EMODE'.  Failures while writing set the
   `fatal' flag of the handle. */
int odput(ODEUM *odeum, ODDOC *doc, int wmax, int over){
  CBMAP *docmap, *scores;
  CBLIST *lists[2], *head;
  ODPAIR pair;
  const char *keys[2], *word, *val;
  char *buf, *packed;
  double wlog;
  int ksizs[2], pass, idx, clip, stored, docid, bsiz, psiz, wsiz, wnum, tmax, points;
  assert(odeum);
  if(odeum->fatal){
    dpecode = DP_EFATAL;
    return FALSE;
  }
  if(!odeum->wmode){
    dpecode = DP_EMODE;
    return FALSE;
  }
  /* a document registered under the same URI is either kept or removed first */
  buf = vlget(odeum->rdocsdb, doc->uri, -1, &bsiz);
  if(buf){
    if(!over){
      free(buf);
      dpecode = DP_EKEEP;
      return FALSE;
    }
    if(bsiz != sizeof(int) || !odoutbyid(odeum, *(int *)buf)){
      free(buf);
      dpecode = DP_EBROKEN;
      odeum->fatal = TRUE;
      return FALSE;
    }
    free(buf);
  }
  /* assign the next document ID */
  docid = ++odeum->dmax;
  odeum->dnum++;
  /* serialize the URI, the attributes and both word lists into one record */
  docmap = cbmapopen();
  cbmapput(docmap, OD_URIEXPR, sizeof(OD_URIEXPR), doc->uri, -1, TRUE);
  buf = cbmapdump(doc->attrs, &bsiz);
  cbmapput(docmap, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), buf, bsiz, TRUE);
  free(buf);
  lists[0] = doc->nwords;
  keys[0] = OD_NWORDSEXPR;
  ksizs[0] = sizeof(OD_NWORDSEXPR);
  lists[1] = doc->awords;
  keys[1] = OD_AWORDSEXPR;
  ksizs[1] = sizeof(OD_AWORDSEXPR);
  clip = wmax >= 0 && wmax < cblistnum(doc->nwords);
  for(pass = 0; pass < 2; pass++){
    if(clip){
      /* keep only the leading `wmax' words of the list */
      head = cblistopen();
      for(idx = 0; idx < wmax; idx++){
        val = cblistval(lists[pass], idx, &wsiz);
        cblistpush(head, val, wsiz);
      }
      buf = cblistdump(head, &bsiz);
      cblistclose(head);
    } else {
      buf = cblistdump(lists[pass], &bsiz);
    }
    cbmapput(docmap, keys[pass], ksizs[pass], buf, bsiz, TRUE);
    free(buf);
  }
  buf = cbmapdump(docmap, &bsiz);
  cbmapclose(docmap);
  /* compress the record if a deflate function is installed */
  if(_qdbm_deflate){
    packed = _qdbm_deflate(buf, bsiz, &psiz);
    free(buf);
    if(!packed){
      dpecode = DP_EMISC;
      odeum->fatal = TRUE;
      return FALSE;
    }
    buf = packed;
    bsiz = psiz;
  }
  /* the ID is new, so an existing record under it means the database is broken */
  stored = crput(odeum->docsdb, (char *)&docid, sizeof(int), buf, bsiz, CR_DKEEP);
  free(buf);
  if(!stored){
    if(dpecode == DP_EKEEP) dpecode = DP_EBROKEN;
    odeum->fatal = TRUE;
    return FALSE;
  }
  if(!vlput(odeum->rdocsdb, doc->uri, -1, (char *)&docid, sizeof(int), VL_DOVER)){
    odeum->fatal = TRUE;
    return FALSE;
  }
  /* accumulate points per word: every occurrence is worth OD_WOCCRPOINT, and a word first
     seen at a position not beyond `tmax' additionally receives OD_WTOPBONUS */
  scores = cbmapopen();
  wnum = cblistnum(doc->nwords);
  tmax = wnum * OD_WTOPRATE;
  for(idx = 0; idx < wnum; idx++){
    word = cblistval(doc->nwords, idx, &wsiz);
    if(wsiz < 1) continue;
    val = cbmapget(scores, word, wsiz, NULL);
    if(val){
      points = *(int *)val + OD_WOCCRPOINT;
    } else if(idx <= tmax){
      points = OD_WTOPBONUS + OD_WOCCRPOINT;
    } else {
      points = OD_WOCCRPOINT;
    }
    cbmapput(scores, word, wsiz, (char *)&points, sizeof(int), TRUE);
  }
  /* the divisor grows with the number of words but is never less than 4 */
  wlog = odlogarithm(wnum);
  wlog = (wlog * wlog) / 4.0;
  if(wlog < 4.0) wlog = 4.0;
  /* append one posting per distinct word and remember the word for later sorting */
  pair.id = docid;
  cbmapiterinit(scores);
  for(word = cbmapiternext(scores, &wsiz); word != NULL; word = cbmapiternext(scores, &wsiz)){
    pair.score = *(int *)cbmapget(scores, word, wsiz, NULL) / wlog;
    if(!crput(odeum->indexdb, word, wsiz, (char *)&pair, sizeof(pair), CR_DCAT)){
      cbmapclose(scores);
      odeum->fatal = TRUE;
      return FALSE;
    }
    cbmapput(odeum->sortmap, word, wsiz, "", 0, FALSE);
  }
  cbmapclose(scores);
  /* sort the index once too many words are pending */
  if(cbmaprnum(odeum->sortmap) > OD_SORTMAPMAX && !odsortindex(odeum)){
    odeum->fatal = TRUE;
    return FALSE;
  }
  doc->id = docid;
  return TRUE;
}


/* Delete a document by a URL. 
*/
/* `odeum' specifies a database handle connected as a writer.
   `uri' specifies the URI of the document to delete.
   The ID registered for the URI is looked up in the reverse dictionary and the removal itself
   is delegated to `odoutbyid'.
   The return value is `TRUE' on success, else `FALSE'.  If the lookup fails with `dpecode'
   other than `DP_ENOITEM', or the stored ID has an unexpected size, the `fatal' flag of the
   handle is set. */
int odout(ODEUM *odeum, const char *uri){
  char *idbuf;
  int idsiz, docid;
  assert(odeum && uri);
  if(odeum->fatal){
    dpecode = DP_EFATAL;
    return FALSE;
  }
  if(!odeum->wmode){
    dpecode = DP_EMODE;
    return FALSE;
  }
  idbuf = vlget(odeum->rdocsdb, uri, -1, &idsiz);
  if(!idbuf){
    if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
    return FALSE;
  }
  if(idsiz != sizeof(int)){
    free(idbuf);
    dpecode = DP_EBROKEN;
    odeum->fatal = TRUE;
    return FALSE;
  }
  docid = *(int *)idbuf;
  free(idbuf);
  return odoutbyid(odeum, docid);
}


/* Delete a document specified by an ID number.
   `odeum' specifies a database handle connected as a writer.
   `id' specifies the ID number of the document; it must be positive.
   The stored record is loaded (and inflated if an inflate function is installed) to find the
   URI, whose entry is removed from the reverse dictionary before the record itself is removed
   from the document database.  The inverted index is not touched here.
   The return value is `TRUE' on success, else `FALSE'.  Any failure other than a missing
   record (`DP_ENOITEM') sets the `fatal' flag of the handle. */
int odoutbyid(ODEUM *odeum, int id){
  CBMAP *rec;
  const char *uri;
  char *buf, *plain;
  int bsiz, psiz, usiz, unlinked;
  assert(odeum && id > 0);
  if(odeum->fatal){
    dpecode = DP_EFATAL;
    return FALSE;
  }
  if(!odeum->wmode){
    dpecode = DP_EMODE;
    return FALSE;
  }
  buf = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &bsiz);
  if(!buf){
    if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
    return FALSE;
  }
  if(_qdbm_inflate){
    plain = _qdbm_inflate(buf, bsiz, &psiz);
    free(buf);
    if(!plain){
      dpecode = DP_EBROKEN;
      odeum->fatal = TRUE;
      return FALSE;
    }
    buf = plain;
    bsiz = psiz;
  }
  rec = cbmapload(buf, bsiz);
  free(buf);
  /* the URI points into the map, so the map is closed only after the URI was used */
  uri = cbmapget(rec, OD_URIEXPR, sizeof(OD_URIEXPR), &usiz);
  unlinked = uri && vlout(odeum->rdocsdb, uri, usiz);
  cbmapclose(rec);
  if(!unlinked){
    dpecode = DP_EBROKEN;
    odeum->fatal = TRUE;
    return FALSE;
  }
  if(!crout(odeum->docsdb, (char *)&id, sizeof(int))){
    odeum->fatal = TRUE;
    return FALSE;
  }
  odeum->dnum--;
  return TRUE;
}


/* Retrieve a document by a URL. */
ODDOC *odget(ODEUM *odeum, const char *uri){
  char *tmp;
  int tsiz, docid;
  assert(odeum && uri);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -