tsvector.c

来自「postgresql8.3.4源码,开源数据库」· C语言 代码 · 共 551 行

C
551
字号
/*------------------------------------------------------------------------- * * tsvector.c *	  I/O functions for tsvector * * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * * * IDENTIFICATION *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.11 2008/01/01 19:45:53 momjian Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include "libpq/pqformat.h"#include "tsearch/ts_type.h"#include "tsearch/ts_locale.h"#include "tsearch/ts_utils.h"#include "utils/memutils.h"typedef struct{	WordEntry	entry;			/* must be first! */	WordEntryPos *pos;	int			poslen;			/* number of elements in pos */} WordEntryIN;/* Compare two WordEntryPos values for qsort */static intcomparePos(const void *a, const void *b){	int			apos = WEP_GETPOS(*(const WordEntryPos *) a);	int			bpos = WEP_GETPOS(*(const WordEntryPos *) b);	if (apos == bpos)		return 0;	return (apos > bpos) ? 1 : -1;}/* * Removes duplicate pos entries. If there's two entries with same pos * but different weight, the higher weight is retained. * * Returns new length. */static intuniquePos(WordEntryPos *a, int l){	WordEntryPos *ptr,			   *res;	if (l <= 1)		return l;	qsort((void *) a, l, sizeof(WordEntryPos), comparePos);	res = a;	ptr = a + 1;	while (ptr - a < l)	{		if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))		{			res++;			*res = *ptr;			if (res - a >= MAXNUMPOS - 1 ||				WEP_GETPOS(*res) == MAXENTRYPOS - 1)				break;		}		else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))			WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));		ptr++;	}	return res + 1 - a;}/* Compare two WordEntryIN values for qsort */static intcompareentry(const void *va, const void *vb, void *arg){	const WordEntryIN *a = (const WordEntryIN *) va;	const WordEntryIN *b = (const WordEntryIN *) vb;	char	   *BufferStr = (char *) arg;	if (a->entry.len == b->entry.len)	{		return strncmp(&BufferStr[a->entry.pos],					   &BufferStr[b->entry.pos],					   a->entry.len);	}	return (a->entry.len > b->entry.len) ? 1 : -1;}/* * Sort an array of WordEntryIN, remove duplicates. * *outbuflen receives the amount of space needed for strings and positions. */static intuniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen){	int			buflen;	WordEntryIN *ptr,			   *res;	Assert(l >= 1);	if (l > 1)		qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,				  (void *) buf);	buflen = 0;	res = a;	ptr = a + 1;	while (ptr - a < l)	{		if (!(ptr->entry.len == res->entry.len &&			  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],					  res->entry.len) == 0))		{			/* done accumulating data into *res, count space needed */			buflen += res->entry.len;			if (res->entry.haspos)			{				res->poslen = uniquePos(res->pos, res->poslen);				buflen = SHORTALIGN(buflen);				buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);			}			res++;			memcpy(res, ptr, sizeof(WordEntryIN));		}		else if (ptr->entry.haspos)		{			if (res->entry.haspos)			{				/* append ptr's positions to res's positions */				int			newlen = ptr->poslen + res->poslen;				res->pos = (WordEntryPos *)					repalloc(res->pos, newlen * sizeof(WordEntryPos));				memcpy(&res->pos[res->poslen], ptr->pos,					   ptr->poslen * sizeof(WordEntryPos));				res->poslen = newlen;				pfree(ptr->pos);			}			else			{				/* just give ptr's positions to pos */				res->entry.haspos = 1;				res->pos = ptr->pos;				res->poslen = ptr->poslen;			}		}		ptr++;	}	/* count space needed for last item */	buflen += res->entry.len;	if (res->entry.haspos)	{		res->poslen = uniquePos(res->pos, res->poslen);		buflen = SHORTALIGN(buflen);		buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);	}	*outbuflen = buflen;	return res + 1 - a;}static intWordEntryCMP(WordEntry *a, WordEntry *b, char *buf){	return compareentry(a, b, buf);}Datumtsvectorin(PG_FUNCTION_ARGS){	char	   *buf = PG_GETARG_CSTRING(0);	TSVectorParseState state;	WordEntryIN *arr;	int			totallen;	int			arrlen;			/* allocated size of arr */	WordEntry  *inarr;	int			len = 0;	TSVector	in;	int			i;	char	   *token;	int			toklen;	WordEntryPos *pos;	int			poslen;	char	   *strbuf;	int			stroff;	/*	 * Tokens are appended to tmpbuf, cur is a pointer to the end of used	 * space in tmpbuf.	 */	char	   *tmpbuf;	char	   *cur;	int			buflen = 256;	/* allocated size of tmpbuf */	pg_verifymbstr(buf, strlen(buf), false);	state = init_tsvector_parser(buf, false, false);	arrlen = 64;	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);	cur = tmpbuf = (char *) palloc(buflen);	while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))	{		if (toklen >= MAXSTRLEN)			ereport(ERROR,					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),					 errmsg("word is too long (%ld bytes, max %ld bytes)",							(long) toklen,							(long) (MAXSTRLEN - 1))));		if (cur - tmpbuf > MAXSTRPOS)			ereport(ERROR,					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),					 errmsg("string is too long for tsvector")));		/*		 * Enlarge buffers if needed		 */		if (len >= arrlen)		{			arrlen *= 2;			arr = (WordEntryIN *)				repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);		}		while ((cur - tmpbuf) + toklen >= buflen)		{			int			dist = cur - tmpbuf;			buflen *= 2;			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);			cur = tmpbuf + dist;		}		arr[len].entry.len = toklen;		arr[len].entry.pos = cur - tmpbuf;		memcpy((void *) cur, (void *) token, toklen);		cur += toklen;		if (poslen != 0)		{			arr[len].entry.haspos = 1;			arr[len].pos = pos;			arr[len].poslen = poslen;		}		else		{			arr[len].entry.haspos = 0;			arr[len].pos = NULL;			arr[len].poslen = 0;		}		len++;	}	close_tsvector_parser(state);	if (len > 0)		len = uniqueentry(arr, len, tmpbuf, &buflen);	else		buflen = 0;	if (buflen > MAXSTRPOS)		ereport(ERROR,				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),				 errmsg("string is too long for tsvector")));	totallen = CALCDATASIZE(len, buflen);	in = (TSVector) palloc0(totallen);	SET_VARSIZE(in, totallen);	in->size = len;	inarr = ARRPTR(in);	strbuf = STRPTR(in);	stroff = 0;	for (i = 0; i < len; i++)	{		memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);		arr[i].entry.pos = stroff;		stroff += arr[i].entry.len;		if (arr[i].entry.haspos)		{			if (arr[i].poslen > 0xFFFF)				elog(ERROR, "positions array too long");			/* Copy number of positions */			stroff = SHORTALIGN(stroff);			*(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;			stroff += sizeof(uint16);			/* Copy positions */			memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));			stroff += arr[i].poslen * sizeof(WordEntryPos);			pfree(arr[i].pos);		}		inarr[i] = arr[i].entry;	}	Assert((strbuf + stroff - (char *) in) == totallen);	PG_RETURN_TSVECTOR(in);}Datumtsvectorout(PG_FUNCTION_ARGS){	TSVector	out = PG_GETARG_TSVECTOR(0);	char	   *outbuf;	int4		i,				lenbuf = 0,				pp;	WordEntry  *ptr = ARRPTR(out);	char	   *curbegin,			   *curin,			   *curout;	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;	for (i = 0; i < out->size; i++)	{		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;		if (ptr[i].haspos)			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));	}	curout = outbuf = (char *) palloc(lenbuf);	for (i = 0; i < out->size; i++)	{		curbegin = curin = STRPTR(out) + ptr->pos;		if (i != 0)			*curout++ = ' ';		*curout++ = '\'';		while (curin - curbegin < ptr->len)		{			int			len = pg_mblen(curin);			if (t_iseq(curin, '\''))				*curout++ = '\'';			else if (t_iseq(curin, '\\'))				*curout++ = '\\';			while (len--)				*curout++ = *curin++;		}		*curout++ = '\'';		if ((pp = POSDATALEN(out, ptr)) != 0)		{			WordEntryPos *wptr;			*curout++ = ':';			wptr = POSDATAPTR(out, ptr);			while (pp)			{				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));				switch (WEP_GETWEIGHT(*wptr))				{					case 3:						*curout++ = 'A';						break;					case 2:						*curout++ = 'B';						break;					case 1:						*curout++ = 'C';						break;					case 0:					default:						break;				}				if (pp > 1)					*curout++ = ',';				pp--;				wptr++;			}		}		ptr++;	}	*curout = '\0';	PG_FREE_IF_COPY(out, 0);	PG_RETURN_CSTRING(outbuf);}/* * Binary Input / Output functions. The binary format is as follows: * * uint32	number of lexemes * * for each lexeme: *		lexeme text in client encoding, null-terminated *		uint16	number of positions *		for each position: *			uint16 WordEntryPos */Datumtsvectorsend(PG_FUNCTION_ARGS){	TSVector	vec = PG_GETARG_TSVECTOR(0);	StringInfoData buf;	int			i,				j;	WordEntry  *weptr = ARRPTR(vec);	pq_begintypsend(&buf);	pq_sendint(&buf, vec->size, sizeof(int32));	for (i = 0; i < vec->size; i++)	{		uint16		npos;		/*		 * the strings in the TSVector array are not null-terminated, so we		 * have to send the null-terminator separately		 */		pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);		pq_sendbyte(&buf, '\0');		npos = POSDATALEN(vec, weptr);		pq_sendint(&buf, npos, sizeof(uint16));		if (npos > 0)		{			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);			for (j = 0; j < npos; j++)				pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));		}		weptr++;	}	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));}Datumtsvectorrecv(PG_FUNCTION_ARGS){	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);	TSVector	vec;	int			i;	int32		nentries;	int			datalen;		/* number of bytes used in the variable size								 * area after fixed size TSVector header and								 * WordEntries */	Size		hdrlen;	Size		len;			/* allocated size of vec */	nentries = pq_getmsgint(buf, sizeof(int32));	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))		elog(ERROR, "invalid size of tsvector");	hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;	len = hdrlen * 2;			/* times two to make room for lexemes */	vec = (TSVector) palloc0(len);	vec->size = nentries;	datalen = 0;	for (i = 0; i < nentries; i++)	{		const char *lexeme;		uint16		npos;		size_t		lex_len;		lexeme = pq_getmsgstring(buf);		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));		/* sanity checks */		lex_len = strlen(lexeme);		if (lex_len < 0 || lex_len > MAXSTRLEN)			elog(ERROR, "invalid tsvector: lexeme too long");		if (datalen > MAXSTRPOS)			elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");		if (npos > MAXNUMPOS)			elog(ERROR, "unexpected number of tsvector positions");		/*		 * Looks valid. Fill the WordEntry struct, and copy lexeme.		 *		 * But make sure the buffer is large enough first.		 */		while (hdrlen + SHORTALIGN(datalen + lex_len) +			   (npos + 1) * sizeof(WordEntryPos) >= len)		{			len *= 2;			vec = (TSVector) repalloc(vec, len);		}		vec->entries[i].haspos = (npos > 0) ? 1 : 0;		vec->entries[i].len = lex_len;		vec->entries[i].pos = datalen;		memcpy(STRPTR(vec) + datalen, lexeme, lex_len);		datalen += lex_len;		if (i > 0 && WordEntryCMP(&vec->entries[i],								  &vec->entries[i - 1],								  STRPTR(vec)) <= 0)			elog(ERROR, "lexemes are misordered");		/* Receive positions */		if (npos > 0)		{			uint16		j;			WordEntryPos *wepptr;			/*			 * Pad to 2-byte alignment if necessary. Though we used palloc0			 * for the initial allocation, subsequent repalloc'd memory areas			 * are not initialized to zero.			 */			if (datalen != SHORTALIGN(datalen))			{				*(STRPTR(vec) + datalen) = '\0';				datalen = SHORTALIGN(datalen);			}			memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));			wepptr = POSDATAPTR(vec, &vec->entries[i]);			for (j = 0; j < npos; j++)			{				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))					elog(ERROR, "position information is misordered");			}			datalen += (npos + 1) * sizeof(WordEntry);		}	}	SET_VARSIZE(vec, hdrlen + datalen);	PG_RETURN_TSVECTOR(vec);}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?