⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zvrank.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
/* $Id: zvrank.c,v 1.5 2003/05/20 09:43:46 adam Exp $   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003   Index Data Aps   This file is part of the Zebra server.   Zebra is free software; you can redistribute it and/or modify it under   the terms of the GNU General Public License as published by the Free   Software Foundation; either version 2, or (at your option) any later   version.   Zebra is distributed in the hope that it will be useful, but WITHOUT ANY   WARRANTY; without even the implied warranty of MERCHANTABILITY or   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License   for more details.   You should have received a copy of the GNU General Public License   along with Zebra; see the file LICENSE.zebra.  If not, write to the   Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA   02111-1307, USA.*//* Zebra Vector Space Model RANKing **** six (seven) letter identifier for weighting scheme** best document weighting:**  tfc nfc (tpc npc) [original naming]**  ntc atc  npc apc  [SMART naming, used here]** best query weighting:**  nfx tfx bfx (npx tpx bpx) [original naming]**  atn ntn btn  apn npn bpn  [SMART naming]** -> should set zvrank.weighting-scheme to one of** "ntc-atn", "atc-atn", etc.*/#include <math.h>  /* for log */#include <stdio.h>#include <assert.h>#ifdef WIN32#include <io.h>#else#include <unistd.h>#endif#include "index.h"static double blog(double x) {     /* log_2, log_e or log_10 is used, best to change it here if necessary */    if (x <= 0)        return 0.0;    return log(x); /* / log(base) */}/* structures */struct rank_class_info { /* now we need this */    int dummy;    char rscheme[8];    /* name of weighting scheme */};struct rs_info {      /* for result set */    int db_docs;        /* number of documents in database (collection) */    int db_terms;       /* number of distinct terms in database (debugging?) */    int db_f_max;       /* maximum of f_t in database (debugging?) 
*/    char *db_f_max_str; /* string (most frequent term) - for debugging */    /**/    char rscheme[8];    /* name of weighting scheme */    /**/    int veclen;    void (*d_tf_fct)(void *, void *);   /* doc term frequency function */    void (*d_idf_fct)(void *, void *);  /* doc idf function */    void (*d_norm_fct)(void *, void *); /* doc normalization function */    /**/    void (*q_tf_fct)(void *, void *);   /* query term frequency function */    void (*q_idf_fct)(void *, void *);  /* query idf function */    void (*q_norm_fct)(void *, void *); /* query normalization function */        double (*sim_fct)(void *, void *);  /* similarity function (scoring function) */    struct ds_info *qdoc;    struct ds_info *rdoc;};typedef struct rs_info *RS;static void prn_rs(RS rs) { /* for debugging */    yaz_log(LOG_DEBUG, "* RS:");    yaz_log(LOG_DEBUG, " db_docs:   %d", rs->db_docs);    yaz_log(LOG_DEBUG, " db_terms:  %d", rs->db_terms);    yaz_log(LOG_DEBUG, " f_max:     %d", rs->db_f_max);    yaz_log(LOG_DEBUG, " f_max_str: %s", rs->db_f_max_str);    yaz_log(LOG_DEBUG, " veclen:    %d", rs->veclen);    /* rscheme implies functions */    yaz_log(LOG_DEBUG, " rscheme:   %s", rs->rscheme);    return;}struct ds_info {       /* document info */    char *docid;         /* unique doc identifier */    int  docno;          /* doc number */    int doclen;          /* document length */    int d_f_max;         /* maximum number of any term in doc (needed) */    char *d_f_max_str;   /* most frequent term in d - for debugging */    int veclen;          /* vector length */    struct ts_info *terms;    double docsim;       /* similarity in [0, ..., 1] (= score/1000) */};typedef struct ds_info* DS;#if 0static void prn_ds(DS ds) { /* for debugging */    yaz_log(LOG_DEBUG, " * DS:");    yaz_log(LOG_DEBUG, " docid:      %s", ds->docid);    yaz_log(LOG_DEBUG, " docno:      %d", ds->docno);    yaz_log(LOG_DEBUG, " doclen:     %d", ds->doclen);    yaz_log(LOG_DEBUG, " d_f_max:    %d", 
ds->d_f_max);    yaz_log(LOG_DEBUG, " d_f_max_str:%s", ds->d_f_max_str);    yaz_log(LOG_DEBUG, " veclen:     %d", ds->veclen);    return;}#endifstruct ts_info {       /* term info */    char *name;    int *id;    /**/    int gocc;    int locc;    double tf;    double idf;    double wt;};typedef struct ts_info *TS;#if 0static void prn_ts(TS ts) { /* for debugging */    yaz_log(LOG_DEBUG, " * TERM:%s gocc:%d locc:%d  tf:%f idf:%f wt:%f",            ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt);    return;}#endif/* end structures *//* *** *//* ** weighting functions ** check: RS is not needed anymore*//* calculate and store new term frequency vector */static void tf_none(void *rsi, void *dsi) {    DS ds=(DS)dsi;    int i, veclen, freq;    /* no conversion. 1 <= tf */    veclen=ds->veclen;    for (i=0; i < veclen; i++) {        freq=ds->terms[i].locc;        ds->terms[i].tf=freq;    }    return;}static void tf_binary(void *rsi, void *dsi) {    DS ds=(DS)dsi;    int i, veclen, freq;    /* tf in {0, 1} */    veclen=ds->veclen;    for (i=0; i < veclen; i++) {        freq=ds->terms[i].locc;        if (freq > 0)            ds->terms[i].tf=1.0;        else            ds->terms[i].tf=0.0;    }    return;}static void tf_max_norm(void *rsi, void *dsi) {    DS ds=(DS)dsi;    double tf_max;    int i, veclen, freq;    /* divide each term by max, so 0 <= tf <= 1 */    tf_max=ds->d_f_max; /* largest frequency of t in document */    veclen=ds->veclen;    for (i=0; i < veclen; i++) {        freq=ds->terms[i].locc;        if ((freq > 0) &&            (tf_max > 0.0))             ds->terms[i].tf=freq/tf_max;        else            ds->terms[i].tf=0.0;    }    return;}static void tf_aug_norm(void *rsi, void *dsi) {    DS ds=(DS)dsi;    double K;     double tf_max;    int i, veclen, freq;    /* augmented normalized tf. 
0.5 <= tf <= 1  for K = 0.5 */    tf_max=ds->d_f_max; /* largest frequency of t in document */    veclen=ds->veclen;    K=0.5; /* zvrank.const-K */    for (i=0; i < veclen; i++) {        freq=ds->terms[i].locc;        if ((freq > 0) &&            (tf_max > 0.0))             ds->terms[i].tf=K+(1.0-K)*(freq/tf_max);        else            ds->terms[i].tf=0.0;    }    return;}static void tf_square(void *rsi, void *dsi) {    DS ds=(DS)dsi;    int i, veclen, freq;    /* tf ^ 2 */    veclen=ds->veclen;    for (i=0; i < veclen; i++) {        freq=ds->terms[i].locc;        if (freq > 0)             ds->terms[i].tf=freq*freq;        else            ds->terms[i].tf=0.0;    }    return;}static void tf_log(void *rsi, void *dsi) {    DS ds=(DS)dsi;    int i, veclen, freq;    /* logarithmic tf */        veclen=ds->veclen;    for (i=0; i < veclen; i++) {        freq=ds->terms[i].locc;        if (freq > 0)             ds->terms[i].tf=1.0+blog(freq);        else            ds->terms[i].tf=0.0;    }    return;}/* calculate and store inverse document frequency vector */static void idf_none(void *rsi, void *dsi) {    DS ds=(DS)dsi;    int i, veclen;    /* no conversion */    veclen=ds->veclen;    for (i=0; i < veclen; i++) {        ds->terms[i].idf=1.0;    }    return;}static void idf_tfidf(void *rsi, void *dsi) {    RS rs=(RS)rsi;    DS ds=(DS)dsi;    int num_docs, gocc;    int i, veclen;    double idf;    /* normal tfidf weight */    veclen=ds->veclen;    num_docs=rs->db_docs;    for (i=0; i < veclen; i++) {        gocc=ds->terms[i].gocc;        if (gocc==0)             idf=0.0;         else            idf=blog(num_docs/gocc);        ds->terms[i].idf=idf;    }    return;}static void idf_prob(void *rsi, void *dsi) {    RS rs=(RS)rsi;    DS ds=(DS)dsi;    int num_docs, gocc;    int i, veclen;    double idf;    /* probabilistic formulation */    veclen=ds->veclen;    num_docs=rs->db_docs;    for (i=0; i < veclen; i++) {        gocc=ds->terms[i].gocc;        if (gocc==0)            
idf=0.0;         else            idf=blog((num_docs-gocc)/gocc);        ds->terms[i].idf=idf;    }    return;}static void idf_freq(void *rsi, void *dsi) {    RS rs=(RS)rsi;    DS ds=(DS)dsi;    int num_docs;    int i, veclen;    double idf;    /* frequency formulation */    veclen=ds->veclen;    num_docs=rs->db_docs;    if (num_docs==0)        idf=0.0;    else        idf=1.0/num_docs;    for (i=0; i < veclen; i++) {        ds->terms[i].idf=idf;    }    return;}static void idf_squared(void *rsi, void *dsi) {    RS rs=(RS)rsi;    DS ds=(DS)dsi;    int num_docs, gocc;    int i, veclen;    double idf;    /* idf ^ 2 */    veclen=ds->veclen;    num_docs=rs->db_docs;    yaz_log(LOG_DEBUG, "idf_squared: db_docs required");    for (i=0; i < veclen; i++) {        gocc=ds->terms[i].gocc;        if (gocc==0)            idf=0.0;        else             idf=blog(num_docs/gocc);        idf=idf*idf;        ds->terms[i].idf=idf;    }    return;}/* calculate and store normalized weight (tf-idf) vector */static void norm_none(void *rsi, void *dsi) {    DS ds=(DS)dsi;    int i, veclen;    /* no normalization */    veclen=ds->veclen;    for (i=0; i < veclen; i++) {        ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;    }    return;}static void norm_sum(void *rsi, void *dsi) {    DS ds=(DS)dsi;    int i, veclen;    double tfs=0.0;    /**/    veclen=ds->veclen;    for (i=0; i < veclen; i++) {        ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;        tfs+=ds->terms[i].wt;    }     if (tfs > 0.0)        for (i=0; i < veclen; i++) {            ds->terms[i].wt=ds->terms[i].wt/tfs;        }    /* else: tfs==0 && ds->terms[i].wt==0 */    return;}static void norm_cosine(void *rsi, void *dsi) {    DS ds=(DS)dsi;    int i, veclen;    double tfs=0.0;    /**/    veclen=ds->veclen;    for (i=0; i < veclen; i++) {        ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;        tfs+=(ds->terms[i].wt*ds->terms[i].wt);    }     tfs=sqrt(tfs);     if (tfs > 0.0)        for (i=0; i < veclen; 
i++) {            ds->terms[i].wt=ds->terms[i].wt/tfs;        }    /* else: tfs==0 && ds->terms[i].wt==0 */    return;}static void norm_fourth(void *rsi, void *dsi) {    DS ds=(DS)dsi;    int i, veclen;    double tfs=0.0, fr;    /**/    veclen=ds->veclen;    for (i=0; i < veclen; i++) {        ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;        fr=(ds->terms[i].wt*ds->terms[i].wt);        fr=fr*fr; /* ^ 4 */        tfs+=fr;     }    if (tfs > 0.0)        for (i=0; i < veclen; i++) {            ds->terms[i].wt=ds->terms[i].wt/tfs;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -