📄 odeum.c
字号:
/*************************************************************************************************
 * Implementation of Odeum
 *                                                      Copyright (C) 2000-2003 Mikio Hirabayashi
 * This file is part of QDBM, Quick Database Manager.
 * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU
 * Lesser General Public License as published by the Free Software Foundation; either version
 * 2.1 of the License or any later version.  QDBM is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 * details.
 * You should have received a copy of the GNU Lesser General Public License along with QDBM; if
 * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 * 02111-1307 USA.
 *************************************************************************************************/


#include "odeum.h"
#include "myconf.h"

/* Limits and buffer sizes.  `odopen' rejects names longer than OD_NAMEMAX before it formats
   the three sub-database paths into buffers of OD_PATHBUFSIZ bytes. */
#define OD_NAMEMAX     256               /* max size of a database name */
#define OD_DIRMODE     00755             /* permission of a directory to be created */
#define OD_PATHBUFSIZ  1024              /* size of a path buffer */
#define OD_NUMBUFSIZ   32                /* size of a buffer for a number */

/* Names of the three component databases, created inside the database directory. */
#define OD_DOCSNAME    "docs"            /* name of the database for documents */
#define OD_INDEXNAME   "index"           /* name of the database for inverted index */
#define OD_RDOCSNAME   "rdocs"           /* name of the database for reverse dictionary */

/* Tuning parameters handed to `cropen', `crsetalign' and `vlsettuning' in `odopen'. */
#define OD_DOCSBNUM    509               /* initial bucket number of document database */
#define OD_DOCSDNUM    9                 /* division number of document database */
#define OD_DOCSALIGN   -4                /* alignment of document database */
#define OD_INDEXBNUM   8191              /* initial bucket number of inverted index */
#define OD_INDEXDNUM   5                 /* division number of inverted index */
#define OD_INDEXALIGN  -3                /* alignment of inverted index */
#define OD_RDOCSLRM    81                /* records in a leaf node of reverse dictionary */
#define OD_RDOCSNIM    192               /* records in a non-leaf node of reverse dictionary */
#define OD_RDOCSLCN    128               /* number of leaf cache of reverse dictionary */
#define OD_RDOCSNCN    32                /* number of non-leaf cache of reverse dictionary */
#define OD_SORTMAPMAX  262144            /* max number of records of sortmap */

/* Keys of the two counters kept in the reverse dictionary.  They are read and written with
   `sizeof', so the stored key includes the terminating NUL. */
#define OD_DMAXEXPR    "dmax"            /* key of max number of the document ID */
#define OD_DNUMEXPR    "dnum"            /* key of number of the documents */

/* Keys of the fields of the map serialized as one record of the document database. */
#define OD_URIEXPR     "1"               /* map key of URI */
#define OD_ATTRSEXPR   "2"               /* map key of attributes */
#define OD_NWORDSEXPR  "3"               /* map key of normal words */
#define OD_AWORDSEXPR  "4"               /* map key of as-is words */

/* Scoring parameters used when `odput' builds the index entries of a document. */
#define OD_WTOPRATE    0.1               /* ratio of top words */
#define OD_WTOPBONUS   5000              /* bonus points of top words */
#define OD_WOCCRPOINT  10000             /* points per occurrence */
#define OD_KEYCANDS    128               /* max number of candidates for keywords */

/* Text analysis parameters. */
#define OD_SPACECHARS  "\t\n\v\f\r "     /* space characters */
#define OD_DELIMCHARS  "!\"#$%&'()*,./:;<=>?@[\\]^`{|}~"  /* delimiter characters */
#define OD_MAXWORDLEN  48                /* max length of a word */

typedef struct {                         /* type of structure for word counting */
  const char *word;                      /* pointer to the word */
  int num;                               /* frequency of the word */
} ODWORD;


/* private function prototypes */
static int odsortindex(ODEUM *odeum);
static int odsortcompare(const void *a, const void *b);
static int odpurgeindex(ODEUM *odeum);
static CBMAP *odpairsmap(const ODPAIR *pairs, int num);
static int odwordcompare(const void *a, const void *b);



/*************************************************************************************************
 * public objects
 *************************************************************************************************/


/* Get a database handle.
*/
/* Open the database directory `name' and build a handle over its three component databases
   (documents, inverted index, reverse dictionary).
   `name' specifies the directory of the database; at most OD_NAMEMAX bytes long.
   `omode' specifies the connection mode: OD_OWRITER opens as a writer (otherwise as a reader);
   with OD_OWRITER, OD_OCREAT creates the directory and the databases if needed and OD_OTRUNC
   truncates them; OD_ONOLCK is passed through to the component databases.
   The return value is the new handle, or NULL on failure.  `dpecode' is set here to DP_EMISC
   (name too long), DP_EMKDIR, DP_ESTAT or DP_EBROKEN (counters missing from a non-empty
   reverse dictionary); failures of `cropen', `crsetalign' and `vlopen' are returned as they
   are, presumably with `dpecode' set by the callee.
   Every failure path releases whatever was acquired before it. */
ODEUM *odopen(const char *name, int omode){
  int cromode, vlomode, inode, dmax, dnum;
  char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ], *tmp;
  struct stat sbuf;
  CURIA *docsdb, *indexdb;
  VILLA *rdocsdb;
  CBMAP *sortmap;
  ODEUM *odeum;
  assert(name);
  /* this bound is what keeps the sprintf calls below inside OD_PATHBUFSIZ */
  if(strlen(name) > OD_NAMEMAX){
    dpecode = DP_EMISC;
    return NULL;
  }
  /* translate the Odeum open mode into the modes of Curia and Villa */
  cromode = CR_OREADER;
  vlomode = VL_OREADER;
  if(omode & OD_OWRITER){
    cromode = CR_OWRITER;
    vlomode = VL_OWRITER;
    if(omode & OD_OCREAT){
      cromode |= CR_OCREAT;
      vlomode |= VL_OCREAT;
    }
    if(omode & OD_OTRUNC){
      cromode |= CR_OTRUNC;
      vlomode |= VL_OTRUNC;
    }
  }
  if(omode & OD_ONOLCK){
    cromode |= CR_ONOLCK;
    vlomode |= VL_ONOLCK;
  }
  sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME);
  sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME);
  sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME);
  docsdb = NULL;
  indexdb = NULL;
  rdocsdb = NULL;
  if((omode & OD_OWRITER) && (omode & OD_OCREAT)){
    /* an already existing directory is not an error */
    if(mkdir(name, OD_DIRMODE) == -1 && errno != EEXIST){
      dpecode = DP_EMKDIR;
      return NULL;
    }
  }
  if(stat(name, &sbuf) == -1){
    dpecode = DP_ESTAT;
    return NULL;
  }
  inode = sbuf.st_ino;
  if(!(docsdb = cropen(docsname, cromode, OD_DOCSBNUM, OD_DOCSDNUM))) return NULL;
  if(!(indexdb = cropen(indexname, cromode, OD_INDEXBNUM, OD_INDEXDNUM))){
    crclose(docsdb);
    return NULL;
  }
  if(omode & OD_OWRITER){
    if(!crsetalign(docsdb, OD_DOCSALIGN) || !crsetalign(indexdb, OD_INDEXALIGN)){
      crclose(indexdb);
      crclose(docsdb);
      return NULL;
    }
  }
  if(!(rdocsdb = vlopen(rdocsname, vlomode, VL_CMPLEX))){
    crclose(indexdb);
    crclose(docsdb);
    return NULL;
  }
  vlsettuning(rdocsdb, OD_RDOCSLRM, OD_RDOCSNIM, OD_RDOCSLCN, OD_RDOCSNCN);
  /* restore the counters saved by `odclose'; a non-empty reverse dictionary must hold both */
  if(vlrnum(rdocsdb) > 0){
    dmax = -1;
    dnum = -1;
    if((tmp = vlget(rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), NULL)) != NULL){
      dmax = atoi(tmp);
      free(tmp);
    }
    if((tmp = vlget(rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), NULL)) != NULL){
      dnum = atoi(tmp);
      free(tmp);
    }
    if(dmax < 0 || dnum < 0){
      vlclose(rdocsdb);
      crclose(indexdb);
      crclose(docsdb);
      dpecode = DP_EBROKEN;
      return NULL;
    }
  } else {
    dmax = 0;
    dnum = 0;
  }
  /* The sort map is created only after the last failure path, so that the DP_EBROKEN return
     above cannot leak it.  Only a writer needs one. */
  sortmap = (omode & OD_OWRITER) ? cbmapopen() : NULL;
  odeum = cbmalloc(sizeof(ODEUM));
  odeum->name = cbmemdup(name, -1);
  odeum->wmode = omode & OD_OWRITER;
  odeum->fatal = FALSE;
  odeum->inode = inode;
  odeum->docsdb = docsdb;
  odeum->indexdb = indexdb;
  odeum->rdocsdb = rdocsdb;
  odeum->sortmap = sortmap;
  odeum->dmax = dmax;
  odeum->dnum = dnum;
  return odeum;
}


/* Close a database handle.
   `odeum' specifies the handle; it is released and must not be used afterwards, whatever the
   return value is.
   For a writer, the counters `dmax' and `dnum' are first saved as decimal strings in the
   reverse dictionary, the pending index records are sorted, and the sort map is closed.
   The three component databases are then closed in every case.
   The return value is TRUE if every step succeeded, FALSE if any of them failed. */
int odclose(ODEUM *odeum){
  char numbuf[OD_NUMBUFSIZ];
  int err;
  assert(odeum);
  err = FALSE;
  if(odeum->wmode){
    /* the keys are stored with `sizeof', i.e. including the terminating NUL, which matches
       the lookups performed by `odopen' */
    sprintf(numbuf, "%d", odeum->dmax);
    if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER))
      err = TRUE;
    sprintf(numbuf, "%d", odeum->dnum);
    if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER))
      err = TRUE;
    if(!odsortindex(odeum)) err = TRUE;
    cbmapclose(odeum->sortmap);
  }
  /* keep closing after a failure so that no component database is left open */
  if(!vlclose(odeum->rdocsdb)) err = TRUE;
  if(!crclose(odeum->indexdb)) err = TRUE;
  if(!crclose(odeum->docsdb)) err = TRUE;
  free(odeum->name);
  free(odeum);
  return err ? FALSE : TRUE;
}


/* Store a document.
*/
/* Register the document `doc' in the database and index its normalized words.
   `odeum' specifies a handle opened as a writer.
   `doc' specifies the document; on success its `id' member is set to the newly assigned ID.
   `wmax' specifies the maximum number of words kept in the document database; if it is
   negative, or not less than the number of normalized words, all words are kept.  The
   inverted index always covers every normalized word, whatever `wmax' is.
   `over' specifies what to do when a document with the same URI exists: if true, the old
   document is deleted first; if false, the call fails with DP_EKEEP.
   The return value is TRUE on success, FALSE on failure.  Failures other than DP_EFATAL,
   DP_EMODE and DP_EKEEP mark the handle as fatal. */
int odput(ODEUM *odeum, ODDOC *doc, int wmax, int over){
  char *tmp, *zbuf;
  const char *word, *ctmp;
  int i, docid, tsiz, wsiz, wnum, anum, tmax, num, zsiz;
  double wlog;
  ODPAIR pair;
  CBMAP *map;
  CBLIST *tlist;
  assert(odeum && doc);
  if(odeum->fatal){
    dpecode = DP_EFATAL;
    return FALSE;
  }
  if(!odeum->wmode){
    dpecode = DP_EMODE;
    return FALSE;
  }
  /* the reverse dictionary maps a URI to the ID of the document registered under it */
  if((tmp = vlget(odeum->rdocsdb, doc->uri, -1, &tsiz)) != NULL){
    if(!over){
      free(tmp);
      dpecode = DP_EKEEP;
      return FALSE;
    }
    if(tsiz != sizeof(int) || !odoutbyid(odeum, *(int *)tmp)){
      free(tmp);
      dpecode = DP_EBROKEN;
      odeum->fatal = TRUE;
      return FALSE;
    }
    free(tmp);
  }
  odeum->dmax++;
  odeum->dnum++;
  docid = odeum->dmax;
  /* serialize the document as a map of URI, attributes, normal words and as-is words */
  map = cbmapopen();
  cbmapput(map, OD_URIEXPR, sizeof(OD_URIEXPR), doc->uri, -1, TRUE);
  tmp = cbmapdump(doc->attrs, &tsiz);
  cbmapput(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), tmp, tsiz, TRUE);
  free(tmp);
  if(wmax >= 0 && wmax < cblistnum(doc->nwords)){
    /* keep only the first `wmax' words of each list */
    tlist = cblistopen();
    for(i = 0; i < wmax; i++){
      ctmp = cblistval(doc->nwords, i, &wsiz);
      cblistpush(tlist, ctmp, wsiz);
    }
    tmp = cblistdump(tlist, &tsiz);
    cbmapput(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), tmp, tsiz, TRUE);
    free(tmp);
    cblistclose(tlist);
    tlist = cblistopen();
    /* `wmax' was checked against `nwords' only; bound the loop by the length of `awords'
       too, so that a shorter as-is list never yields a NULL element to push */
    anum = cblistnum(doc->awords);
    for(i = 0; i < wmax && i < anum; i++){
      ctmp = cblistval(doc->awords, i, &wsiz);
      cblistpush(tlist, ctmp, wsiz);
    }
    tmp = cblistdump(tlist, &tsiz);
    cbmapput(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), tmp, tsiz, TRUE);
    free(tmp);
    cblistclose(tlist);
  } else {
    tmp = cblistdump(doc->nwords, &tsiz);
    cbmapput(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), tmp, tsiz, TRUE);
    free(tmp);
    tmp = cblistdump(doc->awords, &tsiz);
    cbmapput(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), tmp, tsiz, TRUE);
    free(tmp);
  }
  tmp = cbmapdump(map, &tsiz);
  cbmapclose(map);
  /* compress the record when a deflate function is installed */
  if(_qdbm_deflate){
    if(!(zbuf = _qdbm_deflate(tmp, tsiz, &zsiz))){
      free(tmp);
      dpecode = DP_EMISC;
      odeum->fatal = TRUE;
      return FALSE;
    }
    free(tmp);
    tmp = zbuf;
    tsiz = zsiz;
  }
  if(!crput(odeum->docsdb, (char *)&docid, sizeof(int), tmp, tsiz, CR_DKEEP)){
    free(tmp);
    /* the ID is brand new, so an existing record under it means the database is broken */
    if(dpecode == DP_EKEEP) dpecode = DP_EBROKEN;
    odeum->fatal = TRUE;
    return FALSE;
  }
  free(tmp);
  if(!vlput(odeum->rdocsdb, doc->uri, -1, (char *)&docid, sizeof(int), VL_DOVER)){
    odeum->fatal = TRUE;
    return FALSE;
  }
  /* accumulate the points of each distinct word: OD_WOCCRPOINT per occurrence, plus
     OD_WTOPBONUS when the first occurrence lies within the leading OD_WTOPRATE of the text */
  map = cbmapopen();
  wnum = cblistnum(doc->nwords);
  tmax = wnum * OD_WTOPRATE;
  for(i = 0; i < wnum; i++){
    word = cblistval(doc->nwords, i, &wsiz);
    if(wsiz < 1) continue;
    if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){
      num = *(int *)ctmp + OD_WOCCRPOINT;
    } else {
      num = i <= tmax ? OD_WTOPBONUS + OD_WOCCRPOINT : OD_WOCCRPOINT;
    }
    cbmapput(map, word, wsiz, (char *)&num, sizeof(int), TRUE);
  }
  /* the points are divided by a factor that grows with the document length, never below 4 */
  wlog = odlogarithm(wnum);
  wlog = (wlog * wlog) / 4.0;
  if(wlog < 4.0) wlog = 4.0;
  cbmapiterinit(map);
  while((word = cbmapiternext(map, &wsiz)) != NULL){
    pair.id = docid;
    pair.score = *(int *)cbmapget(map, word, wsiz, NULL) / wlog;
    if(!crput(odeum->indexdb, word, wsiz, (char *)&pair, sizeof(pair), CR_DCAT)){
      cbmapclose(map);
      odeum->fatal = TRUE;
      return FALSE;
    }
    /* remember the word so that its index record gets sorted by `odsortindex' */
    cbmapput(odeum->sortmap, word, wsiz, "", 0, FALSE);
  }
  cbmapclose(map);
  if(cbmaprnum(odeum->sortmap) > OD_SORTMAPMAX){
    if(!odsortindex(odeum)){
      odeum->fatal = TRUE;
      return FALSE;
    }
  }
  doc->id = docid;
  return TRUE;
}


/* Delete a document by a URL.
   `odeum' specifies a handle opened as a writer.
   `uri' specifies the URI of the document to delete.
   The URI is resolved to a document ID through the reverse dictionary and the deletion is
   delegated to `odoutbyid'.
   The return value is TRUE on success, FALSE on failure.  A failed lookup marks the handle
   as fatal unless the error is DP_ENOITEM; a stored ID of unexpected size is reported as
   DP_EBROKEN and marks the handle as fatal. */
int odout(ODEUM *odeum, const char *uri){
  char *tmp;
  int tsiz, docid;
  assert(odeum && uri);
  if(odeum->fatal){
    dpecode = DP_EFATAL;
    return FALSE;
  }
  if(!odeum->wmode){
    dpecode = DP_EMODE;
    return FALSE;
  }
  if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){
    if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
    return FALSE;
  }
  if(tsiz != sizeof(int)){
    free(tmp);
    dpecode = DP_EBROKEN;
    odeum->fatal = TRUE;
    return FALSE;
  }
  docid = *(int *)tmp;
  free(tmp);
  return odoutbyid(odeum, docid);
}


/* Delete a document specified by an ID number.
*/
/* Remove the document whose ID is `id' from the document database, together with its URI
   entry in the reverse dictionary.
   `odeum' specifies a handle opened as a writer.
   `id' specifies the ID of the document; it must be positive.
   The return value is TRUE on success, FALSE on failure.  Apart from DP_EFATAL, DP_EMODE and
   a lookup failing with DP_ENOITEM, every failure marks the handle as fatal. */
int odoutbyid(ODEUM *odeum, int id){
  char *packed, *raw;
  const char *uri;
  int psiz, rsiz, urisiz, removed;
  CBMAP *fields;
  assert(odeum && id > 0);
  if(odeum->fatal){
    dpecode = DP_EFATAL;
    return FALSE;
  }
  if(!odeum->wmode){
    dpecode = DP_EMODE;
    return FALSE;
  }
  /* fetch the stored record of the document */
  packed = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &psiz);
  if(packed == NULL){
    if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
    return FALSE;
  }
  /* expand the record when an inflate function is installed */
  if(_qdbm_inflate){
    raw = _qdbm_inflate(packed, psiz, &rsiz);
    free(packed);
    if(raw == NULL){
      dpecode = DP_EBROKEN;
      odeum->fatal = TRUE;
      return FALSE;
    }
    packed = raw;
    psiz = rsiz;
  }
  fields = cbmapload(packed, psiz);
  free(packed);
  /* the URI recorded in the document is the key of its reverse dictionary entry */
  uri = cbmapget(fields, OD_URIEXPR, sizeof(OD_URIEXPR), &urisiz);
  removed = (uri != NULL) && vlout(odeum->rdocsdb, uri, urisiz);
  cbmapclose(fields);
  if(!removed){
    dpecode = DP_EBROKEN;
    odeum->fatal = TRUE;
    return FALSE;
  }
  if(!crout(odeum->docsdb, (char *)&id, sizeof(int))){
    odeum->fatal = TRUE;
    return FALSE;
  }
  odeum->dnum--;
  return TRUE;
}


/* Retrieve a document by a URL. */ODDOC *odget(ODEUM *odeum, const char *uri){ char *tmp; int tsiz, docid; assert(odeum && uri);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -