📄 odeum.c
字号:
const char *oddocgetattr(const ODDOC *doc, const char *name){ assert(doc && name); return cbmapget(doc->attrs, name, -1, NULL);}/* Get the list handle contains words in normalized form of a document. */const CBLIST *oddocnwords(const ODDOC *doc){ assert(doc); return doc->nwords;}/* Get the list handle contains words in appearance form of a document. */const CBLIST *oddocawords(const ODDOC *doc){ assert(doc); return doc->awords;}/* Get the map handle contains keywords in normalized form and their scores. */CBMAP *oddocscores(const ODDOC *doc, int max, ODEUM *odeum){ const CBLIST *nwords; CBMAP *map, *kwmap; const char *word, *ctmp; char numbuf[OD_NUMBUFSIZ]; ODWORD *owords; int i, wsiz, wnum, hnum, mnum, nbsiz; double ival; assert(doc && max >= 0); map = cbmapopen(); nwords = oddocnwords(doc); for(i = 0; i < cblistnum(nwords); i++){ word = cblistval(nwords, i, &wsiz); if(wsiz < 1) continue; if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){ wnum = *(int *)ctmp + OD_WOCCRPOINT; } else { wnum = OD_WOCCRPOINT; } cbmapput(map, word, wsiz, (char *)&wnum, sizeof(int), TRUE); } mnum = cbmaprnum(map); owords = cbmalloc(mnum * sizeof(ODWORD) + 1); cbmapiterinit(map); for(i = 0; (word = cbmapiternext(map, &wsiz)) != NULL; i++){ owords[i].word = word; owords[i].num = *(int *)cbmapget(map, word, wsiz, NULL); } cbqsort(owords, mnum, sizeof(ODWORD), odwordcompare); if(odeum){ if(mnum > OD_KEYCANDS) mnum = OD_KEYCANDS; for(i = 0; i < mnum; i++){ if((hnum = odsearchdnum(odeum, owords[i].word)) < 0) hnum = 0; ival = odlogarithm(hnum); ival = (ival * ival * ival) / 8.0; if(ival < 8.0) ival = 8.0; owords[i].num = owords[i].num / ival; } cbqsort(owords, mnum, sizeof(ODWORD), odwordcompare); } if(mnum > max) mnum = max; kwmap = cbmapopen(); for(i = 0; i < mnum; i++){ nbsiz = sprintf(numbuf, "%d", owords[i].num); cbmapput(kwmap, owords[i].word, -1, numbuf, nbsiz, TRUE); } free(owords); cbmapclose(map); return kwmap;}/* Break a text into words in appearance form. 
*/CBLIST *odbreaktext(const char *text){ const char *word; CBLIST *elems, *words; int i, j, dif, wsiz, pv, delim; assert(text); words = cblistopen(); elems = cbsplit(text, -1, OD_SPACECHARS); for(i = 0; i < cblistnum(elems); i++){ word = cblistval(elems, i, &wsiz); delim = FALSE; j = 0; pv = 0; while(TRUE){ dif = j - pv; if(j >= wsiz){ if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); break; } if(delim){ if(!strchr(OD_DELIMCHARS, word[j])){ if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); pv = j; delim = FALSE; } } else { if(strchr(OD_DELIMCHARS, word[j])){ if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); pv = j; delim = TRUE; } } j++; } } cblistclose(elems); return words;}/* Make the normalized form of a word. */char *odnormalizeword(const char *asis){ char *nword; int i; assert(asis); for(i = 0; asis[i] != '\0'; i++){ if(!strchr(OD_DELIMCHARS, asis[i])) break; } if(asis[i] == '\0') return cbmemdup("", 0); nword = cbmemdup(asis, -1); for(i = 0; nword[i] != '\0'; i++){ if(nword[i] >= 'A' && nword[i] <= 'Z') nword[i] += 'a' - 'A'; } return nword;}/* Get the common elements of two sets of documents. */ODPAIR *odpairsand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ CBMAP *map; ODPAIR *result; const char *tmp; int i, rnum; assert(apairs && anum >= 0 && bpairs && bnum >= 0); map = odpairsmap(bpairs, bnum); result = cbmalloc(sizeof(ODPAIR) * anum + 1); rnum = 0; for(i = 0; i < anum; i++){ if(!(tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL))) continue; result[rnum].id = apairs[i].id; result[rnum].score = apairs[i].score + *(int *)tmp; rnum++; } cbmapclose(map); cbqsort(result, rnum, sizeof(ODPAIR), odsortcompare); *np = rnum; return result;}/* Get the sum of elements of two sets of documents. 
*/ODPAIR *odpairsor(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ CBMAP *map; ODPAIR *result; const char *tmp; int i, score, rnum; assert(apairs && anum >= 0 && bpairs && bnum >= 0); map = odpairsmap(bpairs, bnum); for(i = 0; i < anum; i++){ score = 0; if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) score = *(int *)tmp; score += apairs[i].score; cbmapput(map, (char *)&(apairs[i].id), sizeof(int), (char *)&score, sizeof(int), TRUE); } rnum = cbmaprnum(map); result = cbmalloc(rnum * sizeof(ODPAIR) + 1); cbmapiterinit(map); for(i = 0; (tmp = cbmapiternext(map, NULL)) != NULL; i++){ result[i].id = *(int *)tmp; result[i].score = *(int *)cbmapget(map, tmp, sizeof(int), NULL); } cbmapclose(map); cbqsort(result, rnum, sizeof(ODPAIR), odsortcompare); *np = rnum; return result;}/* Get the difference set of documents. */ODPAIR *odpairsnotand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ CBMAP *map; ODPAIR *result; const char *tmp; int i, rnum; assert(apairs && anum >= 0 && bpairs && bnum >= 0); map = odpairsmap(bpairs, bnum); result = cbmalloc(sizeof(ODPAIR) * anum + 1); rnum = 0; for(i = 0; i < anum; i++){ if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) continue; result[rnum].id = apairs[i].id; result[rnum].score = apairs[i].score; rnum++; } cbmapclose(map); cbqsort(result, rnum, sizeof(ODPAIR), odsortcompare); *np = rnum; return result;}/* Sort a set of documents in descending order of scores. */void odpairssort(ODPAIR *pairs, int pnum){ assert(pairs && pnum >= 0); cbqsort(pairs, pnum, sizeof(ODPAIR), odsortcompare);}/* Get the natural logarithm of a number. */double odlogarithm(double x){ int i; if(x <= 1.0) return 0.0; x = x * x * x * x * x * x * x * x * x * x; for(i = 0; x > 1.0; i++){ x /= 2.718281828459; } return (double)i / 10.0;}/* Get the cosine of the angle of two vectors. 
*/
/* `avec' specifies the pointer to the array of numbers of one vector.
   `bvec' specifies the pointer to the array of numbers of the other vector.
   `vnum' specifies the number of elements of each vector.
   The return value is the inner product divided by the product of both absolute values.
   It is 0.0 if that quotient is not more than 0.0. */
double odvectorcosine(const int *avec, const int *bvec, int vnum){
  double dot, norm, cosine;
  assert(avec && bvec && vnum >= 0);
  dot = odvecinnerproduct(avec, bvec, vnum);
  norm = odvecabsolute(avec, vnum) * odvecabsolute(bvec, vnum);
  cosine = dot / norm;
  if(cosine > 0.0) return cosine;
  return 0.0;
}



/*************************************************************************************************
 * Functions for Experts
 *************************************************************************************************/


/* Get the positive one of square roots of a number.
   `x' specifies a number.
   The return value is found by repeated averaging of an estimate and the number divided by
   the estimate, starting from the larger of the number and 1.0, until the estimate stops
   decreasing.  If `x' is not more than 0.0, the return value is 0.0. */
double odsquareroot(double x){
  double prev, next;
  if(x <= 0.0) return 0.0;
  next = 1;
  if(x > 1.0) next = x;
  for(;;){
    prev = next;
    next = (x / prev + prev) * 0.5;
    if(!(next < prev)) break;
  }
  return prev;
}


/* Get the absolute of a vector.
   `vec' specifies the pointer to the array of numbers of a vector.
   `vnum' specifies the number of elements of the vector.
   The return value is `odsquareroot' of the sum of the squares of the elements. */
double odvecabsolute(const int *vec, int vnum){
  double sum, elem;
  int i;
  assert(vec && vnum >= 0);
  sum = 0;
  for(i = 0; i < vnum; i++){
    elem = (double)vec[i];
    sum += elem * elem;
  }
  return odsquareroot(sum);
}


/* Get the inner product of two vectors.
   `avec' specifies the pointer to the array of numbers of one vector.
   `bvec' specifies the pointer to the array of numbers of the other vector.
   `vnum' specifies the number of elements of each vector.
   The return value is the sum of the products of the corresponding elements. */
double odvecinnerproduct(const int *avec, const int *bvec, int vnum){
  double sum;
  int i;
  assert(avec && bvec && vnum >= 0);
  sum = 0;
  for(i = 0; i < vnum; i++){
    sum += (double)avec[i] * (double)bvec[i];
  }
  return sum;
}



/*************************************************************************************************
 * private objects
 *************************************************************************************************/


/* Sort the records of inverted index. `odeum' specifies a database handle. If successful, the return value is true, else, it is false. 
*/static int odsortindex(ODEUM *odeum){ const char *word; char *tmp; int wsiz, tsiz; ODPAIR *pairs; assert(odeum); cbmapiterinit(odeum->sortmap); while((word = cbmapiternext(odeum->sortmap, &wsiz)) != NULL){ if((tmp = crget(odeum->indexdb, word, wsiz, 0, -1, &tsiz)) != NULL){ pairs = (ODPAIR *)tmp; cbqsort(pairs, tsiz / sizeof(ODPAIR), sizeof(ODPAIR), odsortcompare); if(!crput(odeum->indexdb, word, wsiz, tmp, tsiz, CR_DOVER)){ free(tmp); return FALSE; } free(tmp); } else if(dpecode != DP_ENOITEM){ return FALSE; } } cbmapclose(odeum->sortmap); odeum->sortmap = cbmapopen(); return TRUE;}/* Compare two pairs of structures of a search result. `a' specifies the pointer to the region of one pair. `b' specifies the pointer to the region of the other pair. The return value is positive if the former is big, negative if the latter is big, 0 if both are equivalent. */static int odsortcompare(const void *a, const void *b){ ODPAIR *ap, *bp; int rv; assert(a && b); ap = (ODPAIR *)a; bp = (ODPAIR *)b; rv = bp->score - ap->score; if(rv != 0) return rv; return ap->id - bp->id;}/* Purge the elements of the deleted documents from the inverted index. `odeum' specifies a database handle. If successful, the return value is true, else, it is false. 
*/static int odpurgeindex(ODEUM *odeum){ ODPAIR *pairs; char *kbuf, *vbuf; int i, ksiz, vsiz, pnum, wi; assert(odeum); if(!criterinit(odeum->indexdb)) return FALSE; while(TRUE){ if(!(kbuf = criternext(odeum->indexdb, &ksiz))){ if(dpecode != DP_ENOITEM) return FALSE; break; } if(!(vbuf = crget(odeum->indexdb, kbuf, ksiz, 0, -1, &vsiz))){ dpecode = DP_EBROKEN; free(kbuf); return FALSE; } pairs = (ODPAIR *)vbuf; pnum = vsiz / sizeof(ODPAIR); wi = 0; for(i = 0; i < pnum; i++){ if(crvsiz(odeum->docsdb, (char *)&(pairs[i].id), sizeof(int)) != -1){ pairs[wi++] = pairs[i]; } } if(wi > 0){ if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, wi * sizeof(ODPAIR), CR_DOVER)){ free(vbuf); free(kbuf); return FALSE; } } else { if(!crout(odeum->indexdb, kbuf, ksiz)){ free(vbuf); free(kbuf); return FALSE; } } free(vbuf); free(kbuf); } return TRUE;}/* Create a map of a document array. `pairs' specifies the pointer to a document array. `num' specifies the number of elements of the array. The return value is a map of the document array. */static CBMAP *odpairsmap(const ODPAIR *pairs, int num){ CBMAP *map; int i; assert(pairs && num >= 0); map = cbmapopen(); for(i = 0; i < num; i++){ cbmapput(map, (char *)&(pairs[i].id), sizeof(int), (char *)&(pairs[i].score), sizeof(int), TRUE); } return map;}/* compare two pairs of structures of words in a document. `a' specifies the pointer to the region of one word. `b' specifies the pointer to the region of the other word. The return value is positive if the former is big, negative if the latter is big, 0 if both are equivalent. */static int odwordcompare(const void *a, const void *b){ ODWORD *ap, *bp; int rv; assert(a && b); ap = (ODWORD *)a; bp = (ODWORD *)b; if((rv = bp->num - ap->num) != 0) return rv; if((rv = strlen(bp->word) - strlen(ap->word)) != 0) return rv; return strcmp(ap->word, bp->word);}/* END OF FILE */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -