⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zvrank.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
        }    /* else: tfs==0 && ds->terms[i].wt==0 */    return;}static void norm_max(void *rsi, void *dsi) {    DS ds=(DS)dsi;    int i, veclen;    double tfm=0.0;    /**/    veclen=ds->veclen;    for (i=0; i < veclen; i++) {        ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;        if (ds->terms[i].wt > tfm)            tfm=ds->terms[i].wt;    }    if (tfm > 0.0)        for (i=0; i < veclen; i++) {            ds->terms[i].wt=ds->terms[i].wt/tfm;        }    /* else: tfs==0 && ds->terms[i].wt==0 */    return;}/* add: norm_pivot, ... */static double sim_cosine(void *dsi1, void *dsi2) {    DS ds1=(DS)dsi1;    DS ds2=(DS)dsi2;    int i, veclen;    double smul=0.0, sdiv=0.0, sqr11=0.0, sqr22=0.0;    double v1, v2;    /**/    veclen=ds1->veclen; /* and ds2->veclen */    for (i=0; i < veclen; i++) {        v1=ds1->terms[i].wt;        v2=ds2->terms[i].wt;        smul +=(v1*v2);        sqr11+=(v1*v1);        sqr22+=(v2*v2);    }    sdiv=sqrt(sqr11*sqr22);    if (sdiv==0.0)        return 0.0;    return (smul/sdiv);}/* add: norm_jaccard, norm_dice, ... 
*//* end weighting functions *//* *** */static void zv_init_scheme(RS rs, const char *sname) {    int slen;    char c0, c1, c2, c3, c4, c5, c6;    const char *def_rscheme="ntc-atn"; /* a good default */    /**/    yaz_log(LOG_DEBUG, "zv_init_scheme");    slen=strlen(sname);    if (slen < 7)         yaz_log(LOG_LOG, "zvrank: invalid weighting-scheme \"%s\"", sname);    if (slen > 0) c0=sname[0]; else c0=def_rscheme[0];    if (slen > 1) c1=sname[1]; else c1=def_rscheme[1];    if (slen > 2) c2=sname[2]; else c2=def_rscheme[2];    c3='-';    if (slen > 4) c4=sname[4]; else c4=def_rscheme[4];    if (slen > 5) c5=sname[5]; else c5=def_rscheme[5];    if (slen > 6) c6=sname[6]; else c6=def_rscheme[6];    /**/                      /* assign doc functions */                      switch (c0) {                      case 'b':                          rs->d_tf_fct=tf_binary;                          rs->rscheme[0]='b';                          break;                      case 'm':                          rs->d_tf_fct=tf_max_norm;                          rs->rscheme[0]='m';                          yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");                          break;                      case 'a':                          rs->d_tf_fct=tf_aug_norm;                          rs->rscheme[0]='a';                          yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");                          break;                      case 's':                          rs->d_tf_fct=tf_square;                          rs->rscheme[0]='s';                          break;                      case 'l':                          rs->d_tf_fct=tf_log;                          rs->rscheme[0]='l';                          break;                      default: /* 'n' */                          rs->d_tf_fct=tf_none;                          rs->rscheme[0]='n';                      }                      switch (c1) {                      case 't':                          
rs->d_idf_fct=idf_tfidf;                          rs->rscheme[1]='t';                          yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");                          break;                      case 'p':                          rs->d_idf_fct=idf_prob;                          rs->rscheme[1]='p';                          yaz_log(LOG_DEBUG, "idf_prob: db_docs required");                          break;                      case 'f':                          rs->d_idf_fct=idf_freq;                          rs->rscheme[1]='f';                          yaz_log(LOG_DEBUG, "idf_freq: db_docs required");                          break;                      case 's':                          rs->d_idf_fct=idf_squared;                          rs->rscheme[1]='s';                          yaz_log(LOG_DEBUG, "idf_squared: db_docs required");                          break;                      default: /* 'n' */                          rs->d_idf_fct=idf_none;                          rs->rscheme[1]='n';                      }                      switch (c2) {                      case 's':                          rs->d_norm_fct=norm_sum;                          rs->rscheme[2]='s';                          break;                      case 'c':                          rs->d_norm_fct=norm_cosine;                          rs->rscheme[2]='c';                          break;                      case 'f':                          rs->d_norm_fct=norm_fourth;                          rs->rscheme[2]='t';                          break;                      case 'm':                          rs->d_norm_fct=norm_max;                          rs->rscheme[2]='m';                          break;                      default: /* 'n' */                          rs->d_norm_fct=norm_none;                          rs->rscheme[2]='n';                      }                      /**/                      rs->rscheme[3]='-';                      /* assign query functions */               
       switch (c4) {                      case 'b':                          rs->q_tf_fct=tf_binary;                          rs->rscheme[4]='b';                          break;                      case 'm':                          rs->q_tf_fct=tf_max_norm;                          yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");                          rs->rscheme[4]='m';                          break;                      case 'a':                          rs->q_tf_fct=tf_aug_norm;                          rs->rscheme[4]='a';                          yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");                          break;                      case 's':                          rs->q_tf_fct=tf_square;                          rs->rscheme[4]='s';                          break;                      case 'l':                          rs->q_tf_fct=tf_log;                          rs->rscheme[4]='l';                          break;                      default: /* 'n' */                          rs->q_tf_fct=tf_none;                          rs->rscheme[4]='n';                      }                      switch (c5) {                      case 't':                          rs->q_idf_fct=idf_tfidf;                          rs->rscheme[5]='t';                          yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");                          break;                      case 'p':                          rs->q_idf_fct=idf_prob;                          rs->rscheme[5]='p';                          yaz_log(LOG_DEBUG, "idf_prob: db_docs required");                          break;                      case 'f':                          rs->q_idf_fct=idf_freq;                          rs->rscheme[5]='f';                          yaz_log(LOG_DEBUG, "idf_freq: db_docs required");                          break;                      case 's':                          rs->q_idf_fct=idf_squared;                          rs->rscheme[5]='s';                      
    yaz_log(LOG_DEBUG, "idf_squared: db_docs required");                          break;                      default: /* 'n' */                          rs->q_idf_fct=idf_none;                          rs->rscheme[5]='n';                      }                      switch (c6) {                      case 's':                          rs->q_norm_fct=norm_sum;                          rs->rscheme[6]='s';                          break;                      case 'c':                          rs->q_norm_fct=norm_cosine;                          rs->rscheme[6]='c';                          break;                      case 'f':                          rs->q_norm_fct=norm_fourth;                          rs->rscheme[6]='f';                          break;                      case 'm':                          rs->q_norm_fct=norm_max;                          rs->rscheme[6]='m';                          break;                      default: /* 'n' */                          rs->q_norm_fct=norm_none;                          rs->rscheme[6]='n';                      }                      rs->rscheme[7]='\0';                      /**/                      rs->sim_fct=sim_cosine;                      yaz_log(LOG_DEBUG, "zv_scheme %s", rs->rscheme);                      return;}static void zv_init(RS rs, const char *rscheme) {    yaz_log(LOG_DEBUG, "zv_init");    /**/    rs->db_docs=100000;   /* assign correct value here */    rs->db_terms=500000;  /* assign correct value here (for debugging) */    rs->db_f_max=50;      /* assign correct value here */    rs->db_f_max_str="a"; /* assign correct value here (for debugging) */    zv_init_scheme(rs, rscheme);    return;}/******//* * zv_create: Creates/Initialises this rank handler. This routine is  *  called exactly once. The routine returns the class_handle. 
*/static void *zv_create (ZebraHandle zh) {    int i;    Res res = zh->res;    const char *wscheme;    struct rank_class_info *ci = (struct rank_class_info *)        xmalloc (sizeof(*ci));    yaz_log(LOG_DEBUG, "zv_create");    wscheme=res_get(res, "zvrank.weighting-scheme");    for (i=0; (i < strlen(wscheme)) && (i < 8); i++)         ci->rscheme[i]=wscheme[i];    return ci;}/* * zv_destroy: Destroys this rank handler. This routine is called *  when the handler is no longer needed - i.e. when the server *  dies. The class_handle was previously returned by create. */static void zv_destroy (struct zebra_register *reg, void *class_handle) {    struct rank_class_info *ci = (struct rank_class_info *) class_handle;    yaz_log(LOG_DEBUG, "zv_destroy");    xfree (ci);}/* * zv_begin: Prepares beginning of "real" ranking. Called once for *  each result set. The returned handle is a "set handle" and *  will be used in each of the handlers below. */static void *zv_begin(struct zebra_register *reg, void *class_handle, RSET rset){    struct rs_info *rs=(struct rs_info *)xmalloc(sizeof(*rs));    struct rank_class_info *ci=(struct rank_class_info *)class_handle;    int i;    int veclen, gocc;    /**/    yaz_log(LOG_DEBUG, "zv_begin");    veclen=rset->no_rset_terms; /* smaller vector here */    zv_init(rs, ci->rscheme);    rs->veclen=veclen;    prn_rs(rs);      rs->qdoc=(struct ds_info *)xmalloc(sizeof(*rs->qdoc));    rs->qdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->qdoc->terms)*rs->veclen);    rs->qdoc->veclen=veclen;    rs->qdoc->d_f_max=1; /* no duplicates */     rs->qdoc->d_f_max_str="";     rs->rdoc=(struct ds_info *)xmalloc(sizeof(*rs->rdoc));    rs->rdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->rdoc->terms)*rs->veclen);    rs->rdoc->veclen=veclen;    rs->rdoc->d_f_max=10; /* just a guess */    rs->rdoc->d_f_max_str="";     /* yaz_log(LOG_DEBUG, "zv_begin_init"); */    for (i = 0; i < rs->veclen; i++)    {        gocc=rset->rset_terms[i]->nn;        /* 
yaz_log(LOG_DEBUG, "zv_begin_init i=%d gocc=%d", i, gocc); */        rs->qdoc->terms[i].gocc=gocc;        rs->qdoc->terms[i].locc=1;  /* assume query has no duplicate terms */        rs->rdoc->terms[i].gocc=gocc;        rs->rdoc->terms[i].locc=0;    }    (*rs->q_tf_fct)(rs, rs->qdoc); /* we do this once only */    (*rs->q_idf_fct)(rs, rs->qdoc);    (*rs->q_norm_fct)(rs, rs->qdoc);    return rs;}/* * zv_end: Terminates ranking process. Called after a result set *  has been ranked. */static void zv_end (struct zebra_register *reg, void *rsi){    RS rs=(RS)rsi;    yaz_log(LOG_DEBUG, "zv_end");    xfree(rs->qdoc->terms);    xfree(rs->rdoc->terms);    xfree(rs->qdoc);    xfree(rs->rdoc);    xfree(rs);    return;}/* * zv_add: Called for each word occurence in a result set. This routine *  should be as fast as possible. This routine should "incrementally" *  update the score. */static void zv_add (void *rsi, int seqno, int i) {    RS rs=(RS)rsi;    /* yaz_log(LOG_DEBUG, "zvrank zv_add seqno=%d term_index=%d", seqno, term_index);*/    rs->rdoc->terms[i].locc++;}/* * zv_calc: Called for each document in a result. This handler should  *  produce a score based on previous call(s) to the add handler. The *  score should be between 0 and 1000. If score cannot be obtained *  -1 should be returned. 
*/static int zv_calc (void *rsi, int sysno){    int i, veclen;     int score=0;    double dscore=0.0;    RS rs=(RS)rsi;    /* yaz_log(LOG_DEBUG, "zv_calc"); */    /**/    veclen=rs->veclen;    if (veclen==0)        return -1;    for (i = 0; i < veclen; i++) {        /* qdoc weight has already been calculated */        (*rs->d_tf_fct)(rs, rs->rdoc);        (*rs->d_idf_fct)(rs, rs->rdoc);        (*rs->d_norm_fct)(rs, rs->rdoc);        dscore=rs->sim_fct(rs->qdoc, rs->rdoc);    }    score = dscore * 1000;    yaz_log (LOG_LOG, "sysno=%d score=%d", sysno, score);    if (score > 1000) /* should not happen */        score = 1000;    return score;}/* * Pseudo-meta code with sequence of calls as they occur in a * server. Handlers are prefixed by --: * *     server init *     -- create *     foreach search *        rank result set *        -- begin *        foreach record *           foreach word *              -- add *           -- calc *        -- end *     -- destroy *     server close */static struct rank_control rank_control_vsm = {    "zvrank",    zv_create,    zv_destroy,    zv_begin,    zv_end,    zv_calc,    zv_add,}; struct rank_control *rankzv_class = &rank_control_vsm;/* EOF */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -