📄 odeum.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
const char *oddocgetattr(const ODDOC *doc, const char *name){  assert(doc && name);  return cbmapget(doc->attrs, name, -1, NULL);}/* Get the list handle contains words in normalized form of a document. */const CBLIST *oddocnwords(const ODDOC *doc){  assert(doc);  return doc->nwords;}/* Get the list handle contains words in appearance form of a document. */const CBLIST *oddocawords(const ODDOC *doc){  assert(doc);  return doc->awords;}/* Get the map handle contains keywords in normalized form and their scores. */CBMAP *oddocscores(const ODDOC *doc, int max, ODEUM *odeum){  const CBLIST *nwords;  CBMAP *map, *kwmap;  const char *word, *ctmp;  char numbuf[OD_NUMBUFSIZ];  ODWORD *owords;  int i, wsiz, wnum, hnum, mnum, nbsiz;  double ival;  assert(doc && max >= 0);  map = cbmapopen();  nwords = oddocnwords(doc);  for(i = 0; i < cblistnum(nwords); i++){    word = cblistval(nwords, i, &wsiz);    if(wsiz < 1) continue;    if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){      wnum = *(int *)ctmp + OD_WOCCRPOINT;    } else {      wnum = OD_WOCCRPOINT;    }    cbmapput(map, word, wsiz, (char *)&wnum, sizeof(int), TRUE);  }  mnum = cbmaprnum(map);  owords = cbmalloc(mnum * sizeof(ODWORD) + 1);  cbmapiterinit(map);  for(i = 0; (word = cbmapiternext(map, &wsiz)) != NULL; i++){    owords[i].word = word;    owords[i].num = *(int *)cbmapget(map, word, wsiz, NULL);  }  cbqsort(owords, mnum, sizeof(ODWORD), odwordcompare);  if(odeum){    if(mnum > OD_KEYCANDS) mnum = OD_KEYCANDS;    for(i = 0; i < mnum; i++){      if((hnum = odsearchdnum(odeum, owords[i].word)) < 0) hnum = 0;      ival = odlogarithm(hnum);      ival = (ival * ival * ival) / 8.0;      if(ival < 8.0) ival = 8.0;      owords[i].num = owords[i].num / ival;    }    cbqsort(owords, mnum, sizeof(ODWORD), odwordcompare);  }  if(mnum > max) mnum = max;  kwmap = cbmapopen();  for(i = 0; i < mnum; i++){    nbsiz = sprintf(numbuf, "%d", owords[i].num);    cbmapput(kwmap, owords[i].word, -1, numbuf, nbsiz, TRUE);  }  
free(owords);  cbmapclose(map);  return kwmap;}/* Break a text into words in appearance form. */CBLIST *odbreaktext(const char *text){  const char *word;  CBLIST *elems, *words;  int i, j, dif, wsiz, pv, delim;  assert(text);  words = cblistopen();  elems = cbsplit(text, -1, OD_SPACECHARS);  for(i = 0; i < cblistnum(elems); i++){    word = cblistval(elems, i, &wsiz);    delim = FALSE;    j = 0;    pv = 0;    while(TRUE){      dif = j - pv;      if(j >= wsiz){        if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv);        break;      }      if(delim){        if(!strchr(OD_DELIMCHARS, word[j])){          if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv);          pv = j;          delim = FALSE;        }      } else {        if(strchr(OD_DELIMCHARS, word[j])){          if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv);          pv = j;          delim = TRUE;        }      }      j++;    }  }  cblistclose(elems);  return words;}/* Make the normalized form of a word. */char *odnormalizeword(const char *asis){  char *nword;  int i;  assert(asis);  for(i = 0; asis[i] != '\0'; i++){    if(!strchr(OD_DELIMCHARS, asis[i])) break;  }  if(asis[i] == '\0') return cbmemdup("", 0);  nword = cbmemdup(asis, -1);  for(i = 0; nword[i] != '\0'; i++){    if(nword[i] >= 'A' && nword[i] <= 'Z') nword[i] += 'a' - 'A';  }  return nword;}/* Get the common elements of two sets of documents. 
*/ODPAIR *odpairsand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){  CBMAP *map;  ODPAIR *result;  const char *tmp;  int i, rnum;  assert(apairs && anum >= 0 && bpairs && bnum >= 0);  map = odpairsmap(bpairs, bnum);  result = cbmalloc(sizeof(ODPAIR) * anum + 1);  rnum = 0;  for(i = 0; i < anum; i++){    if(!(tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL))) continue;    result[rnum].id = apairs[i].id;    result[rnum].score = apairs[i].score + *(int *)tmp;    rnum++;  }  cbmapclose(map);  cbqsort(result, rnum, sizeof(ODPAIR), odsortcompare);  *np = rnum;  return result;}/* Get the sum of elements of two sets of documents. */ODPAIR *odpairsor(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){  CBMAP *map;  ODPAIR *result;  const char *tmp;  int i, score, rnum;  assert(apairs && anum >= 0 && bpairs && bnum >= 0);  map = odpairsmap(bpairs, bnum);  for(i = 0; i < anum; i++){    score = 0;    if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL)      score = *(int *)tmp;    score += apairs[i].score;    cbmapput(map, (char *)&(apairs[i].id), sizeof(int),             (char *)&score, sizeof(int), TRUE);  }  rnum = cbmaprnum(map);  result = cbmalloc(rnum * sizeof(ODPAIR) + 1);  cbmapiterinit(map);  for(i = 0; (tmp = cbmapiternext(map, NULL)) != NULL; i++){    result[i].id = *(int *)tmp;    result[i].score = *(int *)cbmapget(map, tmp, sizeof(int), NULL);  }  cbmapclose(map);  cbqsort(result, rnum, sizeof(ODPAIR), odsortcompare);  *np = rnum;  return result;}/* Get the difference set of documents. 
*/ODPAIR *odpairsnotand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){  CBMAP *map;  ODPAIR *result;  const char *tmp;  int i, rnum;  assert(apairs && anum >= 0 && bpairs && bnum >= 0);  map = odpairsmap(bpairs, bnum);  result = cbmalloc(sizeof(ODPAIR) * anum + 1);  rnum = 0;  for(i = 0; i < anum; i++){    if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) continue;    result[rnum].id = apairs[i].id;    result[rnum].score = apairs[i].score;    rnum++;  }  cbmapclose(map);  cbqsort(result, rnum, sizeof(ODPAIR), odsortcompare);  *np = rnum;  return result;}/* Sort a set of documents in descending order of scores. */void odpairssort(ODPAIR *pairs, int pnum){  assert(pairs && pnum >= 0);  cbqsort(pairs, pnum, sizeof(ODPAIR), odsortcompare);}/* Get the natural logarithm of a number. */double odlogarithm(double x){  int i;  if(x <= 1.0) return 0.0;  x = x * x * x * x * x * x * x * x * x * x;  for(i = 0; x > 1.0; i++){    x /= 2.718281828459;  }  return (double)i / 10.0;}/* Get the cosine of the angle of two vectors. */double odvectorcosine(const int *avec, const int *bvec, int vnum){  double rv;  assert(avec && bvec && vnum >= 0);  rv = odvecinnerproduct(avec, bvec, vnum) /    ((odvecabsolute(avec, vnum) * odvecabsolute(bvec, vnum)));  return rv > 0.0 ? rv : 0.0;}/************************************************************************************************* * Functions for Experts *************************************************************************************************//* Get the positive one of square roots of a number. */double odsquareroot(double x){  double c, rv;  if(x <= 0.0) return 0.0;  c = x > 1.0 ? x : 1;  do {    rv = c;    c = (x / c + c) * 0.5;  } while(c < rv);  return rv;}/* Get the absolute of a vector. 
*/
/* `vec' specifies the pointer to an array of numbers.
   `vnum' specifies the number of elements of the array.
   The return value is `odsquareroot' of the sum of the squares of the elements. */
double odvecabsolute(const int *vec, int vnum){
  const int *cur, *end;
  double sum;
  assert(vec && vnum >= 0);
  sum = 0;
  end = vec + vnum;
  for(cur = vec; cur < end; cur++){
    /* both factors are widened first so the square is computed in `double' */
    sum += (double)*cur * (double)*cur;
  }
  return odsquareroot(sum);
}


/* Get the inner product of two vectors.
   `avec' specifies the pointer to one array of numbers.
   `bvec' specifies the pointer to the other array of numbers.
   `vnum' specifies the number of elements of each array.
   The return value is the sum of the products of the corresponding elements. */
double odvecinnerproduct(const int *avec, const int *bvec, int vnum){
  double sum;
  int pos;
  assert(avec && bvec && vnum >= 0);
  sum = 0;
  pos = 0;
  while(pos < vnum){
    sum += (double)avec[pos] * (double)bvec[pos];
    pos++;
  }
  return sum;
}



/*************************************************************************************************
 * private objects
 *************************************************************************************************/


/* Sort the records of inverted index.
   `odeum' specifies a database handle.
   If successful, the return value is true, else, it is false.
   Every word held as a key of `sortmap' has its record in `indexdb' sorted with
   `odsortcompare' and written back; words without a record are passed over.  On success
   `sortmap' is closed and replaced by a new empty map. */
static int odsortindex(ODEUM *odeum){
  const char *key;
  char *rec;
  int ksiz, rsiz, stored;
  assert(odeum);
  cbmapiterinit(odeum->sortmap);
  while((key = cbmapiternext(odeum->sortmap, &ksiz)) != NULL){
    rec = crget(odeum->indexdb, key, ksiz, 0, -1, &rsiz);
    if(!rec){
      /* a missing record is tolerated; any other error code aborts the sort */
      if(dpecode != DP_ENOITEM) return FALSE;
      continue;
    }
    cbqsort((ODPAIR *)rec, rsiz / sizeof(ODPAIR), sizeof(ODPAIR), odsortcompare);
    stored = crput(odeum->indexdb, key, ksiz, rec, rsiz, CR_DOVER);
    free(rec);
    if(!stored) return FALSE;
  }
  cbmapclose(odeum->sortmap);
  odeum->sortmap = cbmapopen();
  return TRUE;
}


/* Compare two pairs of structures of a search result.
   `a' specifies the pointer to the region of one pair.
   `b' specifies the pointer to the region of the other pair.
   The return value is positive if the former is big, negative if the latter is big, 0 if both
   are equivalent.
*/static int odsortcompare(const void *a, const void *b){  ODPAIR *ap, *bp;  int rv;  assert(a && b);  ap = (ODPAIR *)a;  bp = (ODPAIR *)b;  rv = bp->score - ap->score;  if(rv != 0) return rv;  return ap->id - bp->id;}/* Purge the elements of the deleted documents from the inverted index.   `odeum' specifies a database handle.   If successful, the return value is true, else, it is false. */static int odpurgeindex(ODEUM *odeum){  ODPAIR *pairs;  char *kbuf, *vbuf;  int i, ksiz, vsiz, pnum, wi;  assert(odeum);  if(!criterinit(odeum->indexdb)) return FALSE;  while(TRUE){    if(!(kbuf = criternext(odeum->indexdb, &ksiz))){      if(dpecode != DP_ENOITEM) return FALSE;      break;    }    if(!(vbuf = crget(odeum->indexdb, kbuf, ksiz, 0, -1, &vsiz))){      dpecode = DP_EBROKEN;      free(kbuf);      return FALSE;    }    pairs = (ODPAIR *)vbuf;    pnum = vsiz / sizeof(ODPAIR);    wi = 0;    for(i = 0; i < pnum; i++){      if(crvsiz(odeum->docsdb, (char *)&(pairs[i].id), sizeof(int)) != -1){        pairs[wi++] = pairs[i];      }    }    if(wi > 0){      if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, wi * sizeof(ODPAIR), CR_DOVER)){        free(vbuf);        free(kbuf);        return FALSE;      }    } else {      if(!crout(odeum->indexdb, kbuf, ksiz)){        free(vbuf);        free(kbuf);        return FALSE;      }    }    free(vbuf);    free(kbuf);  }  return TRUE;}/* Create a map of a document array.   `pairs' specifies the pointer to a document array.   `num' specifies the number of elements of the array.   The return value is a map of the document array. */static CBMAP *odpairsmap(const ODPAIR *pairs, int num){  CBMAP *map;  int i;  assert(pairs && num >= 0);  map = cbmapopen();  for(i = 0; i < num; i++){    cbmapput(map, (char *)&(pairs[i].id), sizeof(int),             (char *)&(pairs[i].score), sizeof(int), TRUE);  }  return map;}/* compare two pairs of structures of words in a document.   `a' specifies the pointer to the region of one word.   
`b' specifies the pointer to the region of the other word.   The return value is positive if the former is big, negative if the latter is big, 0 if both   are equivalent. */static int odwordcompare(const void *a, const void *b){  ODWORD *ap, *bp;  int rv;  assert(a && b);  ap = (ODWORD *)a;  bp = (ODWORD *)b;  if((rv = bp->num - ap->num) != 0) return rv;  if((rv = strlen(bp->word) - strlen(ap->word)) != 0) return rv;  return strcmp(ap->word, bp->word);}/* END OF FILE */
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -