regexp.c

来自「postgresql8.3.4源码,开源数据库」· C语言 代码 · 共 1,177 行 · 第 1/3 页

C
1,177
字号
/*-------------------------------------------------------------------------
 *
 * regexp.c
 *	  Postgres' interface to the regular expression package.
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.78.2.1 2008/03/19 02:40:43 tgl Exp $
 *
 *		Alistair Crooks added the code for the regex caching
 *		agc - cached the regular expressions used - there's a good chance
 *		that we'll get a hit, so this saves a compile step for every
 *		attempted match. I haven't actually measured the speed improvement,
 *		but it `looks' a lot quicker visually when watching regression
 *		test output.
 *
 *		agc - incorporated Keith Bostic's Berkeley regex code into
 *		the tree for all ports. To distinguish this regex code from any that
 *		is existent on a platform, I've prepended the string "pg_" to
 *		the functions regcomp, regerror, regexec and regfree.
 *		Fixed a bug that was originally a typo by me, where `i' was used
 *		instead of `oldest' when compiling regular expressions - benign
 *		results mostly, although occasionally it bit you...
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "catalog/pg_type.h"
#include "funcapi.h"
#include "regex/regex.h"
#include "utils/builtins.h"
#include "utils/guc.h"

/* Fetch text argument _n if the call supplied that many arguments, else NULL */
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
	(PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)


/* Regex flavor; settable through a GUC parameter */
static int	regex_flavor = REG_ADVANCED;


/* Option set shared by the regex functions */
typedef struct pg_re_flags
{
	int			cflags;			/* compile flags handed to Spencer's regex code */
	bool		glob;			/* apply to every occurrence, not just the first */
} pg_re_flags;

/* State kept across calls of regexp_matches(); regexp_split() uses it too */
typedef struct regexp_matches_ctx
{
	text	   *orig_str;		/* the data string, still in TEXT form */
	int			nmatches;		/* how many places the pattern matched */
	int			npatterns;		/* how many capturing subpatterns there are */

	/*
	 * For each match we record the start character index and the end+1
	 * character index, so match_locs holds nmatches * npatterns * 2 entries.
	 */
	int		   *match_locs;		/* character indexes, 0-based */
	int			next_match;		/* next match to process, 0-based */
	/* scratch space used by build_regexp_matches_result() */
	Datum	   *elems;			/* npatterns elements */
	bool	   *nulls;			/* npatterns elements */
} regexp_matches_ctx;

/*
 * Precompiled regular expressions are cached in a "self organizing list":
 * every time an entry is used it is moved to the front, so recently-used
 * items cluster near the front and, over time, an item's average position
 * reflects how often it is used.
 *
 * A newly created entry is also inserted at the front of the array, and
 * the entry at the end of the array is dropped if room is needed.  (That
 * may look like giving the new entry too much weight, but inserting further
 * back would leave us unable to adapt to a sudden shift in the query mix in
 * which MAX_CACHED_RES never-before-seen items are used circularly.  We
 * ought to cope with that case, hence insertion at the front.)
 *
 * Knuth describes a variant in which a used item moves up by only one
 * place.  He says it needs fewer comparisons on average, but it seems to
 * adapt poorly when there are both some reusable patterns and a steady
 * stream of non-reusable ones: a reusable pattern that is not used at least
 * as often as non-reusable patterns arrive will "fail to keep up" and fall
 * off the end of the cache.  With move-to-front, a reusable pattern is
 * guaranteed to stay cached as long as it is used at least once in every
 * MAX_CACHED_RES uses.
 */

/* upper bound on the number of cached regular expressions */
#ifndef MAX_CACHED_RES
#define MAX_CACHED_RES	32
#endif

/* one cache slot */
typedef struct cached_re_str
{
	char	   *cre_pat;		/* source text of the RE (not null terminated!) */
	int			cre_pat_len;	/* byte length of that source text */
	int			cre_flags;		/* compile flags: extended,icase etc */
	regex_t		cre_re;			/* the compiled form */
} cached_re_str;

static int	num_res = 0;		/* number of slots currently in use */
static cached_re_str re_array[MAX_CACHED_RES];	/* the cache itself */


/* Local functions */
static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
					 text *flags,
					 bool force_glob,
					 bool use_subpatterns,
					 bool ignore_degenerate);
static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);


/*
 * RE_compile_and_cache - return a compiled RE, using the cache when possible
 *
 * Returns regex_t *, always pointing at the front slot of the cache
 *
 *	text_re --- the pattern, as a TEXT object
 *	cflags --- compile options for the pattern
 *
 * The pattern arrives in the database encoding.  It is converted to an
 * array of pg_wchar before compiling, since that is what Spencer's regex
 * package wants.
 */
static regex_t *
RE_compile_and_cache(text *text_re, int cflags)
{
	int			pat_nbytes = VARSIZE_ANY_EXHDR(text_re);
	char	   *pat_bytes = VARDATA_ANY(text_re);
	cached_re_str fresh;
	pg_wchar   *wide_pat;
	int			wide_len;
	int			rc;
	int			slot;
	char		errbuf[100];

	/*
	 * First see whether this pattern/flags pair was compiled before.  The
	 * list keeps its most-used entries at the front, so a plain front-to-back
	 * scan is the search strategy.
	 */
	for (slot = 0; slot < num_res; slot++)
	{
		cached_re_str *cand = &re_array[slot];

		if (cand->cre_pat_len != pat_nbytes)
			continue;
		if (cand->cre_flags != cflags)
			continue;
		if (memcmp(cand->cre_pat, pat_bytes, pat_nbytes) != 0)
			continue;

		/* Cache hit: promote the entry to slot 0 unless it is already there */
		if (slot != 0)
		{
			fresh = *cand;
			memmove(&re_array[1], &re_array[0], slot * sizeof(cached_re_str));
			re_array[0] = fresh;
		}

		return &re_array[0].cre_re;
	}

	/*
	 * Cache miss, so the RE has to be compiled.  Everything is assembled in
	 * the local "fresh" so that a failure does not leak resources into the
	 * cache array.
	 */

	/* The regex package wants wide characters */
	wide_pat = (pg_wchar *) palloc((pat_nbytes + 1) * sizeof(pg_wchar));
	wide_len = pg_mb2wchar_with_len(pat_bytes, wide_pat, pat_nbytes);

	rc = pg_regcomp(&fresh.cre_re, wide_pat, wide_len, cflags);

	pfree(wide_pat);

	if (rc != REG_OKAY)
	{
		/* compilation failed; report the package's own message */
		pg_regerror(rc, &fresh.cre_re, errbuf, sizeof(errbuf));
		/* XXX should we pg_regfree here? */
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
				 errmsg("invalid regular expression: %s", errbuf)));
	}

	/*
	 * cre_pat is obtained with malloc/free rather than palloc: the storage
	 * must survive across transactions, and we want control back when memory
	 * runs out.  Max() guards against malloc implementations that return
	 * NULL for malloc(0).
	 */
	fresh.cre_pat = malloc(Max(pat_nbytes, 1));
	if (fresh.cre_pat == NULL)
	{
		pg_regfree(&fresh.cre_re);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	}
	memcpy(fresh.cre_pat, pat_bytes, pat_nbytes);
	fresh.cre_pat_len = pat_nbytes;
	fresh.cre_flags = cflags;

	/*
	 * "fresh" is now a complete entry.  If the cache is full, release the
	 * entry in the last slot to make room.
	 */
	if (num_res >= MAX_CACHED_RES)
	{
		cached_re_str *victim;

		num_res--;
		Assert(num_res < MAX_CACHED_RES);
		victim = &re_array[num_res];
		pg_regfree(&victim->cre_re);
		free(victim->cre_pat);
	}

	/* Shift the surviving entries down one slot and install at the front */
	if (num_res > 0)
		memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));

	re_array[0] = fresh;
	num_res++;

	return &re_array[0].cre_re;
}

/*
 * RE_wchar_execute - run a compiled RE against pg_wchar data
 *
 * Returns TRUE on match, FALSE on no match
 *
 *	re --- the compiled pattern as returned by RE_compile_and_cache
 *	data --- the data to match against (need not be null-terminated)
 *	data_len --- the length of the data string
 *	start_search -- the offset in the data to start searching
 *	nmatch, pmatch	--- optional return area for match details
 *
 * The data is supplied as an array of pg_wchar, which is what Spencer's
 * regex package wants.
 */
static bool
RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
				 int start_search, int nmatch, regmatch_t *pmatch)
{
	int			status;
	char		errbuf[100];

	status = pg_regexec(re,
						data,
						data_len,
						start_search,
						NULL,	/* no details */
						nmatch,
						pmatch,
						0);

	if (status == REG_OKAY)
		return true;
	if (status == REG_NOMATCH)
		return false;

	/* Any other status means the matcher itself failed */
	pg_regerror(status, re, errbuf, sizeof(errbuf));
	ereport(ERROR,
			(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
			 errmsg("regular expression failed: %s", errbuf)));

	/* status was not REG_OKAY on this path */
	return false;
}

/*
 * RE_execute - run a compiled RE against data in the database encoding
 *
 * Returns TRUE on match, FALSE on no match
 *
 *	re --- the compiled pattern as returned by RE_compile_and_cache
 *	dat --- the data to match against (need not be null-terminated)
 *	dat_len --- the length of the data string
 *	nmatch, pmatch	--- optional return area for match details
 *
 * The data arrives in the database encoding and is converted here to an
 * array of pg_wchar, which is what Spencer's regex package wants.  The
 * search always starts at offset 0.
 */
static bool
RE_execute(regex_t *re, char *dat, int dat_len,
		   int nmatch, regmatch_t *pmatch)
{
	pg_wchar   *wide;
	int			wide_len;
	bool		matched;

	/* Widen the input into a temporary buffer */
	wide = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
	wide_len = pg_mb2wchar_with_len(dat, wide, dat_len);

	matched = RE_wchar_execute(re, wide, wide_len, 0, nmatch, pmatch);

	pfree(wide);

	return matched;
}

/*
 * RE_compile_and_execute - compile (or fetch from cache) a RE, then run it
 *
 * Returns TRUE on match, FALSE on no match
 *
 *	text_re --- the pattern, as a TEXT object
 *	dat --- the data to match against (need not be null-terminated)
 *	dat_len --- the length of the data string
 *	cflags --- compile options for the pattern
 *	nmatch, pmatch	--- optional return area for match details
 *
 * Pattern and data are both given in the database encoding; the functions
 * called here convert them to arrays of pg_wchar, which is what Spencer's
 * regex package wants.
 */
static bool
RE_compile_and_execute(text *text_re, char *dat, int dat_len,
					   int cflags, int nmatch, regmatch_t *pmatch)
{
	return RE_execute(RE_compile_and_cache(text_re, cflags),
					  dat, dat_len, nmatch, pmatch);
}

/*
 * parse_re_flags - parse the options argument of regexp_matches and friends
 *
 *	flags --- output argument, filled with desired options
 *	opts --- TEXT object, or NULL for defaults
 *
 * This accepts all the options allowed by any of the callers; callers that
 * don't want some have to reject them after the fact.
 */
static void
parse_re_flags(pg_re_flags *flags, text *opts)
{
	/* regex_flavor is always folded into the compile flags */
	flags->cflags = regex_flavor;
	flags->glob = false;

	if (opts)
	{
		char	   *opt_p = VARDATA_ANY(opts);
		int			opt_len = VARSIZE_ANY_EXHDR(opts);
		int			i;

		for (i = 0; i < opt_len; i++)
		{
			switch (opt_p[i])
			{
				case 'g':
					flags->glob = true;
					break;
				case 'b':		/* BREs (but why???) */
					flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
					break;
				case 'c':		/* case sensitive */
					flags->cflags &= ~REG_ICASE;
					break;
				case 'e':		/* plain EREs */
					flags->cflags |= REG_EXTENDED;
					flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
					break;
				case 'i':		/* case insensitive */
					flags->cflags |= REG_ICASE;
					break;
				case 'm':		/* Perloid synonym for n */
				case 'n':		/* \n affects ^ $ . [^ */
					flags->cflags |= REG_NEWLINE;
					break;
				case 'p':		/* ~Perl, \n affects . [^ */
					flags->cflags |= REG_NLSTOP;
					flags->cflags &= ~REG_NLANCH;
					break;
				case 'q':		/* literal string */
					flags->cflags |= REG_QUOTE;
					flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
					break;
				case 's':		/* single line, \n ordinary */
					flags->cflags &= ~REG_NEWLINE;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?