📄 zvrank.c
字号:
/* $Id: zvrank.c,v 1.5 2003/05/20 09:43:46 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003 Index Data Aps This file is part of the Zebra server. Zebra is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. Zebra is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Zebra; see the file LICENSE.zebra. If not, write to the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*//* Zebra Vector Space Model RANKing **** six (seven) letter identifier for weighting scheme** best document weighting:** tfc nfc (tpc npc) [original naming]** ntc atc npc apc [SMART naming, used here]** best query weighting:** nfx tfx bfx (npx tpx bpx) [original naming]** atn ntn btn apn npn bpn [SMART naming]** -> should set zvrank.weighting-scheme to one of** "ntc-atn", "atc-atn", etc.*/#include <math.h> /* for log */#include <stdio.h>#include <assert.h>#ifdef WIN32#include <io.h>#else#include <unistd.h>#endif#include "index.h"static double blog(double x) { /* log_2, log_e or log_10 is used, best to change it here if necessary */ if (x <= 0) return 0.0; return log(x); /* / log(base) */}/* structures */struct rank_class_info { /* now we need this */ int dummy; char rscheme[8]; /* name of weighting scheme */};struct rs_info { /* for result set */ int db_docs; /* number of documents in database (collection) */ int db_terms; /* number of distinct terms in database (debugging?) */ int db_f_max; /* maximum of f_t in database (debugging?) */ char *db_f_max_str; /* string (most frequent term) - for debugging */ /**/ char rscheme[8]; /* name of weighting scheme */ /**/ int veclen; void (*d_tf_fct)(void *, void *); /* doc term frequency function */ void (*d_idf_fct)(void *, void *); /* doc idf function */ void (*d_norm_fct)(void *, void *); /* doc normalization function */ /**/ void (*q_tf_fct)(void *, void *); /* query term frequency function */ void (*q_idf_fct)(void *, void *); /* query idf function */ void (*q_norm_fct)(void *, void *); /* query normalization function */ double (*sim_fct)(void *, void *); /* similarity function (scoring function) */ struct ds_info *qdoc; struct ds_info *rdoc;};typedef struct rs_info *RS;static void prn_rs(RS rs) { /* for debugging */ yaz_log(LOG_DEBUG, "* RS:"); yaz_log(LOG_DEBUG, " db_docs: %d", rs->db_docs); yaz_log(LOG_DEBUG, " db_terms: %d", rs->db_terms); yaz_log(LOG_DEBUG, " f_max: %d", rs->db_f_max); yaz_log(LOG_DEBUG, " f_max_str: %s", rs->db_f_max_str); yaz_log(LOG_DEBUG, " veclen: %d", rs->veclen); /* rscheme implies functions */ yaz_log(LOG_DEBUG, " rscheme: %s", rs->rscheme); return;}struct ds_info { /* document info */ char *docid; /* unique doc identifier */ int docno; /* doc number */ int doclen; /* document length */ int d_f_max; /* maximum number of any term in doc (needed) */ char *d_f_max_str; /* most frequent term in d - for debugging */ int veclen; /* vector length */ struct ts_info *terms; double docsim; /* similarity in [0, ..., 1] (= score/1000) */};typedef struct ds_info* DS;#if 0static void prn_ds(DS ds) { /* for debugging */ yaz_log(LOG_DEBUG, " * DS:"); yaz_log(LOG_DEBUG, " docid: %s", ds->docid); yaz_log(LOG_DEBUG, " docno: %d", ds->docno); yaz_log(LOG_DEBUG, " doclen: %d", ds->doclen); yaz_log(LOG_DEBUG, " d_f_max: %d", ds->d_f_max); yaz_log(LOG_DEBUG, " d_f_max_str:%s", ds->d_f_max_str); yaz_log(LOG_DEBUG, " veclen: %d", ds->veclen); return;}#endifstruct ts_info { /* term info */ char *name; int *id; /**/ int gocc; int locc; double tf; double idf; double wt;};typedef struct ts_info *TS;#if 0static void prn_ts(TS ts) { /* for debugging */ yaz_log(LOG_DEBUG, " * TERM:%s gocc:%d locc:%d tf:%f idf:%f wt:%f", ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt); return;}#endif/* end structures *//* *** *//* ** weighting functions ** check: RS is not needed anymore*//* calculate and store new term frequency vector */static void tf_none(void *rsi, void *dsi) { DS ds=(DS)dsi; int i, veclen, freq; /* no conversion. 1 <= tf */ veclen=ds->veclen; for (i=0; i < veclen; i++) { freq=ds->terms[i].locc; ds->terms[i].tf=freq; } return;}static void tf_binary(void *rsi, void *dsi) { DS ds=(DS)dsi; int i, veclen, freq; /* tf in {0, 1} */ veclen=ds->veclen; for (i=0; i < veclen; i++) { freq=ds->terms[i].locc; if (freq > 0) ds->terms[i].tf=1.0; else ds->terms[i].tf=0.0; } return;}static void tf_max_norm(void *rsi, void *dsi) { DS ds=(DS)dsi; double tf_max; int i, veclen, freq; /* divide each term by max, so 0 <= tf <= 1 */ tf_max=ds->d_f_max; /* largest frequency of t in document */ veclen=ds->veclen; for (i=0; i < veclen; i++) { freq=ds->terms[i].locc; if ((freq > 0) && (tf_max > 0.0)) ds->terms[i].tf=freq/tf_max; else ds->terms[i].tf=0.0; } return;}static void tf_aug_norm(void *rsi, void *dsi) { DS ds=(DS)dsi; double K; double tf_max; int i, veclen, freq; /* augmented normalized tf. 0.5 <= tf <= 1 for K = 0.5 */ tf_max=ds->d_f_max; /* largest frequency of t in document */ veclen=ds->veclen; K=0.5; /* zvrank.const-K */ for (i=0; i < veclen; i++) { freq=ds->terms[i].locc; if ((freq > 0) && (tf_max > 0.0)) ds->terms[i].tf=K+(1.0-K)*(freq/tf_max); else ds->terms[i].tf=0.0; } return;}static void tf_square(void *rsi, void *dsi) { DS ds=(DS)dsi; int i, veclen, freq; /* tf ^ 2 */ veclen=ds->veclen; for (i=0; i < veclen; i++) { freq=ds->terms[i].locc; if (freq > 0) ds->terms[i].tf=freq*freq; else ds->terms[i].tf=0.0; } return;}static void tf_log(void *rsi, void *dsi) { DS ds=(DS)dsi; int i, veclen, freq; /* logarithmic tf */ veclen=ds->veclen; for (i=0; i < veclen; i++) { freq=ds->terms[i].locc; if (freq > 0) ds->terms[i].tf=1.0+blog(freq); else ds->terms[i].tf=0.0; } return;}/* calculate and store inverse document frequency vector */static void idf_none(void *rsi, void *dsi) { DS ds=(DS)dsi; int i, veclen; /* no conversion */ veclen=ds->veclen; for (i=0; i < veclen; i++) { ds->terms[i].idf=1.0; } return;}static void idf_tfidf(void *rsi, void *dsi) { RS rs=(RS)rsi; DS ds=(DS)dsi; int num_docs, gocc; int i, veclen; double idf; /* normal tfidf weight */ veclen=ds->veclen; num_docs=rs->db_docs; for (i=0; i < veclen; i++) { gocc=ds->terms[i].gocc; if (gocc==0) idf=0.0; else idf=blog(num_docs/gocc); ds->terms[i].idf=idf; } return;}static void idf_prob(void *rsi, void *dsi) { RS rs=(RS)rsi; DS ds=(DS)dsi; int num_docs, gocc; int i, veclen; double idf; /* probabilistic formulation */ veclen=ds->veclen; num_docs=rs->db_docs; for (i=0; i < veclen; i++) { gocc=ds->terms[i].gocc; if (gocc==0) idf=0.0; else idf=blog((num_docs-gocc)/gocc); ds->terms[i].idf=idf; } return;}static void idf_freq(void *rsi, void *dsi) { RS rs=(RS)rsi; DS ds=(DS)dsi; int num_docs; int i, veclen; double idf; /* frequency formulation */ veclen=ds->veclen; num_docs=rs->db_docs; if (num_docs==0) idf=0.0; else idf=1.0/num_docs; for (i=0; i < veclen; i++) { ds->terms[i].idf=idf; } return;}static void idf_squared(void *rsi, void *dsi) { RS rs=(RS)rsi; DS ds=(DS)dsi; int num_docs, gocc; int i, veclen; double idf; /* idf ^ 2 */ veclen=ds->veclen; num_docs=rs->db_docs; yaz_log(LOG_DEBUG, "idf_squared: db_docs required"); for (i=0; i < veclen; i++) { gocc=ds->terms[i].gocc; if (gocc==0) idf=0.0; else idf=blog(num_docs/gocc); idf=idf*idf; ds->terms[i].idf=idf; } return;}/* calculate and store normalized weight (tf-idf) vector */static void norm_none(void *rsi, void *dsi) { DS ds=(DS)dsi; int i, veclen; /* no normalization */ veclen=ds->veclen; for (i=0; i < veclen; i++) { ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf; } return;}static void norm_sum(void *rsi, void *dsi) { DS ds=(DS)dsi; int i, veclen; double tfs=0.0; /**/ veclen=ds->veclen; for (i=0; i < veclen; i++) { ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf; tfs+=ds->terms[i].wt; } if (tfs > 0.0) for (i=0; i < veclen; i++) { ds->terms[i].wt=ds->terms[i].wt/tfs; } /* else: tfs==0 && ds->terms[i].wt==0 */ return;}static void norm_cosine(void *rsi, void *dsi) { DS ds=(DS)dsi; int i, veclen; double tfs=0.0; /**/ veclen=ds->veclen; for (i=0; i < veclen; i++) { ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf; tfs+=(ds->terms[i].wt*ds->terms[i].wt); } tfs=sqrt(tfs); if (tfs > 0.0) for (i=0; i < veclen; i++) { ds->terms[i].wt=ds->terms[i].wt/tfs; } /* else: tfs==0 && ds->terms[i].wt==0 */ return;}static void norm_fourth(void *rsi, void *dsi) { DS ds=(DS)dsi; int i, veclen; double tfs=0.0, fr; /**/ veclen=ds->veclen; for (i=0; i < veclen; i++) { ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf; fr=(ds->terms[i].wt*ds->terms[i].wt); fr=fr*fr; /* ^ 4 */ tfs+=fr; } if (tfs > 0.0) for (i=0; i < veclen; i++) { ds->terms[i].wt=ds->terms[i].wt/tfs;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -