tsrank.c

来自「postgresql8.3.4源码,开源数据库」· C语言 代码 · 共 857 行 · 第 1/2 页

C
857
字号
/*------------------------------------------------------------------------- * * tsrank.c *		rank tsvector by tsquery * * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * * * IDENTIFICATION *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.12 2008/01/01 19:45:53 momjian Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include <math.h>#include "tsearch/ts_type.h"#include "tsearch/ts_utils.h"#include "utils/array.h"#include "miscadmin.h"static float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};#define wpos(wep)	( w[ WEP_GETWEIGHT(wep) ] )#define RANK_NO_NORM			0x00#define RANK_NORM_LOGLENGTH		0x01#define RANK_NORM_LENGTH		0x02#define RANK_NORM_EXTDIST		0x04#define RANK_NORM_UNIQ			0x08#define RANK_NORM_LOGUNIQ		0x10#define RANK_NORM_RDIVRPLUS1	0x20#define DEF_NORM_METHOD			RANK_NO_NORMstatic float calc_rank_or(float *w, TSVector t, TSQuery q);static float calc_rank_and(float *w, TSVector t, TSQuery q);/* * Returns a weight of a word collocation */static float4word_distance(int4 w){	if (w > 100)		return 1e-30f;	return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));}static intcnt_length(TSVector t){	WordEntry  *ptr = ARRPTR(t),			   *end = (WordEntry *) STRPTR(t);	int			len = 0;	while (ptr < end)	{		int			clen = POSDATALEN(t, ptr);		if (clen == 0)			len += 1;		else			len += clen;		ptr++;	}	return len;}static intWordECompareQueryItem(char *eval, char *qval, WordEntry *ptr, QueryOperand *item){	if (ptr->len == item->length)		return strncmp(					   eval + ptr->pos,					   qval + item->distance,					   item->length);	return (ptr->len > item->length) ? 1 : -1;}/* * Returns a pointer to a WordEntry corresponding 'item' from tsvector 't'. 'q' * is the TSQuery containing 'item'. Returns NULL if not found. */static WordEntry *find_wordentry(TSVector t, TSQuery q, QueryOperand *item){	WordEntry  *StopLow = ARRPTR(t);	WordEntry  *StopHigh = (WordEntry *) STRPTR(t);	WordEntry  *StopMiddle;	int			difference;	/* Loop invariant: StopLow <= item < StopHigh */	while (StopLow < StopHigh)	{		StopMiddle = StopLow + (StopHigh - StopLow) / 2;		difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item);		if (difference == 0)			return StopMiddle;		else if (difference < 0)			StopLow = StopMiddle + 1;		else			StopHigh = StopMiddle;	}	return NULL;}/* * sort QueryOperands by (length, word) */static intcompareQueryOperand(const void *a, const void *b, void *arg){	char	   *operand = (char *) arg;	QueryOperand *qa = (*(QueryOperand **) a);	QueryOperand *qb = (*(QueryOperand **) b);	if (qa->length == qb->length)		return strncmp(operand + qa->distance,					   operand + qb->distance,					   qb->length);	return (qa->length > qb->length) ? 1 : -1;}/* * Returns a sorted, de-duplicated array of QueryOperands in a query. * The returned QueryOperands are pointers to the original QueryOperands * in the query. * * Length of the returned array is stored in *size */static QueryOperand **SortAndUniqItems(TSQuery q, int *size){	char	   *operand = GETOPERAND(q);	QueryItem  *item = GETQUERY(q);	QueryOperand **res,			  **ptr,			  **prevptr;	ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size);	/* Collect all operands from the tree to res */	while ((*size)--)	{		if (item->type == QI_VAL)		{			*ptr = (QueryOperand *) item;			ptr++;		}		item++;	}	*size = ptr - res;	if (*size < 2)		return res;	qsort_arg(res, *size, sizeof(QueryOperand **), compareQueryOperand, (void *) operand);	ptr = res + 1;	prevptr = res;	/* remove duplicates */	while (ptr - res < *size)	{		if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0)		{			prevptr++;			*prevptr = *ptr;		}		ptr++;	}	*size = prevptr + 1 - res;	return res;}/* A dummy WordEntryPos array to use when haspos is false */static WordEntryPosVector POSNULL = {	1,							/* Number of elements that follow */	{0}};static floatcalc_rank_and(float *w, TSVector t, TSQuery q){	WordEntryPosVector **pos;	int			i,				k,				l,				p;	WordEntry  *entry;	WordEntryPos *post,			   *ct;	int4		dimt,				lenct,				dist;	float		res = -1.0;	QueryOperand **item;	int			size = q->size;	item = SortAndUniqItems(q, &size);	if (size < 2)	{		pfree(item);		return calc_rank_or(w, t, q);	}	pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);	WEP_SETPOS(POSNULL.pos[0], MAXENTRYPOS - 1);	for (i = 0; i < size; i++)	{		entry = find_wordentry(t, q, item[i]);		if (!entry)			continue;		if (entry->haspos)			pos[i] = _POSVECPTR(t, entry);		else			pos[i] = &POSNULL;		dimt = pos[i]->npos;		post = pos[i]->pos;		for (k = 0; k < i; k++)		{			if (!pos[k])				continue;			lenct = pos[k]->npos;			ct = pos[k]->pos;			for (l = 0; l < dimt; l++)			{				for (p = 0; p < lenct; p++)				{					dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));					if (dist || (dist == 0 && (pos[i] == &POSNULL || pos[k] == &POSNULL)))					{						float		curw;						if (!dist)							dist = MAXENTRYPOS;						curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));						res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);					}				}			}		}	}	pfree(pos);	pfree(item);	return res;}static floatcalc_rank_or(float *w, TSVector t, TSQuery q){	WordEntry  *entry;	WordEntryPos *post;	int4		dimt,				j,				i;	float		res = 0.0;	QueryOperand **item;	int			size = q->size;	item = SortAndUniqItems(q, &size);	for (i = 0; i < size; i++)	{		float		resj,					wjm;		int4		jm;		entry = find_wordentry(t, q, item[i]);		if (!entry)			continue;		if (entry->haspos)		{			dimt = POSDATALEN(t, entry);			post = POSDATAPTR(t, entry);		}		else		{			dimt = POSNULL.npos;			post = POSNULL.pos;		}		resj = 0.0;		wjm = -1.0;		jm = 0;		for (j = 0; j < dimt; j++)		{			resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));			if (wpos(post[j]) > wjm)			{				wjm = wpos(post[j]);				jm = j;			}		}/*		limit (sum(i/i^2),i->inf) = pi^2/6		resj = sum(wi/i^2),i=1,noccurence,		wi - should be sorted desc,		don't sort for now, just choose maximum weight. This should be corrected		Oleg Bartunov*/		res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;	}	if (size > 0)		res = res / size;	pfree(item);	return res;}static floatcalc_rank(float *w, TSVector t, TSQuery q, int4 method){	QueryItem  *item = GETQUERY(q);	float		res = 0.0;	int			len;	if (!t->size || !q->size)		return 0.0;	/* XXX: What about NOT? */	res = (item->type == QI_OPR && item->operator.oper == OP_AND) ?		calc_rank_and(w, t, q) : calc_rank_or(w, t, q);	if (res < 0)		res = 1e-20f;	if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)		res /= log((double) (cnt_length(t) + 1)) / log(2.0);	if (method & RANK_NORM_LENGTH)	{		len = cnt_length(t);		if (len > 0)			res /= (float) len;	}	/* RANK_NORM_EXTDIST not applicable */	if ((method & RANK_NORM_UNIQ) && t->size > 0)		res /= (float) (t->size);	if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)		res /= log((double) (t->size + 1)) / log(2.0);	if (method & RANK_NORM_RDIVRPLUS1)		res /= (res + 1);	return res;}static float *getWeights(ArrayType *win){	static float ws[lengthof(weights)];	int			i;	float4	   *arrdata;	if (win == 0)		return weights;	if (ARR_NDIM(win) != 1)		ereport(ERROR,				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),				 errmsg("array of weight must be one-dimensional")));	if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights))		ereport(ERROR,				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),				 errmsg("array of weight is too short")));	if (ARR_HASNULL(win))		ereport(ERROR,				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),				 errmsg("array of weight must not contain nulls")));	arrdata = (float4 *) ARR_DATA_PTR(win);	for (i = 0; i < lengthof(weights); i++)	{		ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];		if (ws[i] > 1.0)			ereport(ERROR,					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),					 errmsg("weight out of range")));	}	return ws;}Datumts_rank_wttf(PG_FUNCTION_ARGS){	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));	TSVector	txt = PG_GETARG_TSVECTOR(1);	TSQuery		query = PG_GETARG_TSQUERY(2);	int			method = PG_GETARG_INT32(3);	float		res;	res = calc_rank(getWeights(win), txt, query, method);	PG_FREE_IF_COPY(win, 0);	PG_FREE_IF_COPY(txt, 1);	PG_FREE_IF_COPY(query, 2);	PG_RETURN_FLOAT4(res);}Datumts_rank_wtt(PG_FUNCTION_ARGS){	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));	TSVector	txt = PG_GETARG_TSVECTOR(1);	TSQuery		query = PG_GETARG_TSQUERY(2);	float		res;	res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?