analyze.c

来自「PostgreSQL 8.1.4的源码 适用于Linux下的开源数据库系统」· C语言 代码 · 共 2,242 行 · 第 1/5 页

C
2,242
字号
		eqfunc = oprfuncid(func_operator);
		ReleaseSysCache(func_operator);
	}
	if (!OidIsValid(eqfunc))
		return false;

	/* Is there a "<" operator with suitable semantics? */
	func_operator = ordering_oper(attr->atttypid, true);
	if (func_operator != NULL)
	{
		ltopr = oprid(func_operator);
		ReleaseSysCache(func_operator);
	}

	/* Save the operator info for compute_stats routines */
	mystats = (StdAnalyzeData *) palloc(sizeof(StdAnalyzeData));
	mystats->eqopr = eqopr;
	mystats->eqfunc = eqfunc;
	mystats->ltopr = ltopr;
	stats->extra_data = mystats;

	/*
	 * Determine which standard statistics algorithm to use: with a "<"
	 * operator we can sort the sample and use the full scalar machinery,
	 * otherwise we fall back to equality-only minimal stats.
	 */
	if (OidIsValid(ltopr))
	{
		/* Seems to be a scalar datatype */
		stats->compute_stats = compute_scalar_stats;
		/*--------------------
		 * The following choice of minrows is based on the paper
		 * "Random sampling for histogram construction: how much is enough?"
		 * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
		 * Proceedings of ACM SIGMOD International Conference on Management
		 * of Data, 1998, Pages 436-447.  Their Corollary 1 to Theorem 5
		 * says that for table size n, histogram size k, maximum relative
		 * error in bin size f, and error probability gamma, the minimum
		 * random sample size is
		 *		r = 4 * k * ln(2*n/gamma) / f^2
		 * Taking f = 0.5, gamma = 0.01, n = 1 million rows, we obtain
		 *		r = 305.82 * k
		 * Note that because of the log function, the dependence on n is
		 * quite weak; even at n = 1 billion, a 300*k sample gives <= 0.59
		 * bin size error with probability 0.99.  So there's no real need to
		 * scale for n, which is a good thing because we don't necessarily
		 * know it at this point.
		 *--------------------
		 */
		stats->minrows = 300 * attr->attstattarget;
	}
	else
	{
		/* Can't do much but the minimal stuff */
		stats->compute_stats = compute_minimal_stats;
		/* Might as well use the same minrows as above */
		stats->minrows = 300 * attr->attstattarget;
	}

	return true;
}

/*
 *	compute_minimal_stats() -- compute minimal column statistics
 *
 *	We use this when we can find only an "=" operator for the datatype.
 *
 *	We determine the fraction of non-null rows, the average width, the
 *	most common values, and the (estimated) number of distinct values.
 *
 *	The most common values are determined by brute force: we keep a list
 *	of previously seen values, ordered by number of times seen, as we scan
 *	the samples.  A newly seen value is inserted just after the last
 *	multiply-seen value, causing the bottommost (oldest) singly-seen value
 *	to drop off the list.  The accuracy of this method, and also its cost,
 *	depend mainly on the length of the list we are willing to keep.
 *
 *	On exit, results are stored into *stats (stats_valid, stanullfrac,
 *	stawidth, stadistinct, and possibly an MCV slot in slot 0).
 */
static void
compute_minimal_stats(VacAttrStatsP stats,
					  AnalyzeAttrFetchFunc fetchfunc,
					  int samplerows,
					  double totalrows)
{
	int			i;
	int			null_cnt = 0;
	int			nonnull_cnt = 0;
	int			toowide_cnt = 0;
	double		total_width = 0;
	/* attlen == -1 means a toastable varlena type */
	bool		is_varlena = (!stats->attr->attbyval &&
							  stats->attr->attlen == -1);
	/* any negative attlen means variable width (varlena or cstring) */
	bool		is_varwidth = (!stats->attr->attbyval &&
							   stats->attr->attlen < 0);
	FmgrInfo	f_cmpeq;
	typedef struct
	{
		Datum		value;
		int			count;		/* number of times seen so far */
	} TrackItem;
	TrackItem  *track;			/* kept sorted by descending count */
	int			track_cnt,
				track_max;
	int			num_mcv = stats->attr->attstattarget;
	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;

	/*
	 * We track up to 2*n values for an n-element MCV list; but at least 10
	 */
	track_max = 2 * num_mcv;
	if (track_max < 10)
		track_max = 10;
	track = (TrackItem *) palloc(track_max * sizeof(TrackItem));
	track_cnt = 0;

	fmgr_info(mystats->eqfunc, &f_cmpeq);

	for (i = 0; i < samplerows; i++)
	{
		Datum		value;
		bool		isnull;
		bool		match;
		int			firstcount1,
					j;

		/* allow cancellation / cost-based vacuum delay between rows */
		vacuum_delay_point();

		value = fetchfunc(stats, i, &isnull);

		/* Check for null/nonnull */
		if (isnull)
		{
			null_cnt++;
			continue;
		}
		nonnull_cnt++;

		/*
		 * If it's a variable-width field, add up widths for average width
		 * calculation.  Note that if the value is toasted, we use the toasted
		 * width.  We don't bother with this calculation if it's a fixed-width
		 * type.
		 */
		if (is_varlena)
		{
			total_width += VARSIZE(DatumGetPointer(value));

			/*
			 * If the value is toasted, we want to detoast it just once to
			 * avoid repeated detoastings and resultant excess memory usage
			 * during the comparisons.	Also, check to see if the value is
			 * excessively wide, and if so don't detoast at all --- just
			 * ignore the value.
			 */
			if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
			{
				toowide_cnt++;
				continue;
			}
			value = PointerGetDatum(PG_DETOAST_DATUM(value));
		}
		else if (is_varwidth)
		{
			/* must be cstring */
			total_width += strlen(DatumGetCString(value)) + 1;
		}

		/*
		 * See if the value matches anything we're already tracking.
		 * While scanning, remember the position of the first count-1
		 * entry (firstcount1), which is where a brand-new value will be
		 * inserted below.
		 */
		match = false;
		firstcount1 = track_cnt;
		for (j = 0; j < track_cnt; j++)
		{
			if (DatumGetBool(FunctionCall2(&f_cmpeq, value, track[j].value)))
			{
				match = true;
				break;
			}
			if (j < firstcount1 && track[j].count == 1)
				firstcount1 = j;
		}

		if (match)
		{
			/* Found a match */
			track[j].count++;
			/* This value may now need to "bubble up" in the track list */
			while (j > 0 && track[j].count > track[j - 1].count)
			{
				/* swapDatum/swapInt: entry-exchange macros defined earlier in this file */
				swapDatum(track[j].value, track[j - 1].value);
				swapInt(track[j].count, track[j - 1].count);
				j--;
			}
		}
		else
		{
			/* No match.  Insert at head of count-1 list */
			if (track_cnt < track_max)
				track_cnt++;
			/* shift the count-1 entries down one slot to make room */
			for (j = track_cnt - 1; j > firstcount1; j--)
			{
				track[j].value = track[j - 1].value;
				track[j].count = track[j - 1].count;
			}
			/* if the list is full of multiply-seen values, the new value is dropped */
			if (firstcount1 < track_cnt)
			{
				track[firstcount1].value = value;
				track[firstcount1].count = 1;
			}
		}
	}

	/* We can only compute real stats if we found some non-null values. */
	if (nonnull_cnt > 0)
	{
		int			nmultiple,
					summultiple;

		stats->stats_valid = true;
		/* Do the simple null-frac and width stats */
		stats->stanullfrac = (double) null_cnt / (double) samplerows;
		if (is_varwidth)
			stats->stawidth = total_width / (double) nonnull_cnt;
		else
			stats->stawidth = stats->attrtype->typlen;

		/* Count the number of values we found multiple times */
		summultiple = 0;
		for (nmultiple = 0; nmultiple < track_cnt; nmultiple++)
		{
			/* track[] is ordered by count, so the first count-1 entry ends the run */
			if (track[nmultiple].count == 1)
				break;
			summultiple += track[nmultiple].count;
		}

		if (nmultiple == 0)
		{
			/* If we found no repeated values, assume it's a unique column */
			stats->stadistinct = -1.0;
		}
		else if (track_cnt < track_max && toowide_cnt == 0 &&
				 nmultiple == track_cnt)
		{
			/*
			 * Our track list includes every value in the sample, and every
			 * value appeared more than once.  Assume the column has just
			 * these values.
			 */
			stats->stadistinct = track_cnt;
		}
		else
		{
			/*----------
			 * Estimate the number of distinct values using the estimator
			 * proposed by Haas and Stokes in IBM Research Report RJ 10025:
			 *		n*d / (n - f1 + f1*n/N)
			 * where f1 is the number of distinct values that occurred
			 * exactly once in our sample of n rows (from a total of N),
			 * and d is the total number of distinct values in the sample.
			 * This is their Duj1 estimator; the other estimators they
			 * recommend are considerably more complex, and are numerically
			 * very unstable when n is much smaller than N.
			 *
			 * We assume (not very reliably!) that all the multiply-occurring
			 * values are reflected in the final track[] list, and the other
			 * nonnull values all appeared but once.  (XXX this usually
			 * results in a drastic overestimate of ndistinct.	Can we do
			 * any better?)
			 *----------
			 */
			int			f1 = nonnull_cnt - summultiple;
			int			d = f1 + nmultiple;
			double		numer,
						denom,
						stadistinct;

			numer = (double) samplerows *(double) d;

			denom = (double) (samplerows - f1) +
				(double) f1 *(double) samplerows / totalrows;

			stadistinct = numer / denom;
			/* Clamp to sane range in case of roundoff error */
			if (stadistinct < (double) d)
				stadistinct = (double) d;
			if (stadistinct > totalrows)
				stadistinct = totalrows;
			stats->stadistinct = floor(stadistinct + 0.5);
		}

		/*
		 * If we estimated the number of distinct values at more than 10% of
		 * the total row count (a very arbitrary limit), then assume that
		 * stadistinct should scale with the row count rather than be a fixed
		 * value.  (Negative stadistinct encodes a fraction of the row count.)
		 */
		if (stats->stadistinct > 0.1 * totalrows)
			stats->stadistinct = -(stats->stadistinct / totalrows);

		/*
		 * Decide how many values are worth storing as most-common values. If
		 * we are able to generate a complete MCV list (all the values in the
		 * sample will fit, and we think these are all the ones in the table),
		 * then do so.	Otherwise, store only those values that are
		 * significantly more common than the (estimated) average. We set the
		 * threshold rather arbitrarily at 25% more than average, with at
		 * least 2 instances in the sample.
		 */
		if (track_cnt < track_max && toowide_cnt == 0 &&
			stats->stadistinct > 0 &&
			track_cnt <= num_mcv)
		{
			/* Track list includes all values seen, and all will fit */
			num_mcv = track_cnt;
		}
		else
		{
			double		ndistinct = stats->stadistinct;
			double		avgcount,
						mincount;

			if (ndistinct < 0)
				ndistinct = -ndistinct * totalrows;
			/* estimate # of occurrences in sample of a typical value */
			avgcount = (double) samplerows / ndistinct;
			/* set minimum threshold count to store a value */
			mincount = avgcount * 1.25;
			if (mincount < 2)
				mincount = 2;
			if (num_mcv > track_cnt)
				num_mcv = track_cnt;
			/* truncate the MCV list at the first insufficiently-common value */
			for (i = 0; i < num_mcv; i++)
			{
				if (track[i].count < mincount)
				{
					num_mcv = i;
					break;
				}
			}
		}

		/* Generate MCV slot entry */
		if (num_mcv > 0)
		{
			MemoryContext old_context;
			Datum	   *mcv_values;
			float4	   *mcv_freqs;

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);
			mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum));
			mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4));
			for (i = 0; i < num_mcv; i++)
			{
				mcv_values[i] = datumCopy(track[i].value,
										  stats->attr->attbyval,
										  stats->attr->attlen);
				mcv_freqs[i] = (double) track[i].count / (double) samplerows;
			}
			MemoryContextSwitchTo(old_context);

			stats->stakind[0] = STATISTIC_KIND_MCV;
			stats->staop[0] = mystats->eqopr;
			stats->stanumbers[0] = mcv_freqs;
			stats->numnumbers[0] = num_mcv;
			stats->stavalues[0] = mcv_values;
			stats->numvalues[0] = num_mcv;
		}
	}
	else if (null_cnt > 0)
	{
		/* We found only nulls; assume the column is entirely null */
		stats->stats_valid = true;
		stats->stanullfrac = 1.0;
		if (is_varwidth)
			stats->stawidth = 0;	/* "unknown" */
		else
			stats->stawidth = stats->attrtype->typlen;
		stats->stadistinct = 0.0;		/* "unknown" */
	}

	/* We don't need to bother cleaning up any of our temporary palloc's */
}

/*
 *	compute_scalar_stats() -- compute column statistics
 *
 *	We use this when we can find "=" and "<" operators for the datatype.
 *
 *	We determine the fraction of non-null rows, the average width, the
 *	most common values, the (estimated) number of distinct values, the
 *	distribution histogram, and the correlation of physical to logical order.
 *
 *	The desired stats can be determined fairly easily after sorting the
 *	data values into order.
 */
static void
compute_scalar_stats(VacAttrStatsP stats,
					 AnalyzeAttrFetchFunc fetchfunc,
					 int samplerows,
					 double totalrows)
{
	int			i;
	int			null_cnt = 0;
	int			nonnull_cnt = 0;
	int			toowide_cnt = 0;
	double		total_width = 0;
	/* attlen == -1 means a toastable varlena type */
	bool		is_varlena = (!stats->attr->attbyval &&
							  stats->attr->attlen == -1);
	/* any negative attlen means variable width (varlena or cstring) */
	bool		is_varwidth = (!stats->attr->attbyval &&
							   stats->attr->attlen < 0);
	double		corr_xysum;
	RegProcedure cmpFn;
	SortFunctionKind cmpFnKind;
	FmgrInfo	f_cmpfn;
	ScalarItem *values;
	int			values_cnt = 0;
	int		   *tupnoLink;
	ScalarMCVItem *track;
	int			track_cnt = 0;
	int			num_mcv = stats->attr->attstattarget;
	int			num_bins = stats->attr->attstattarget;
	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;

	values = (ScalarItem *) palloc(samplerows * sizeof(ScalarItem));
	tupnoLink = (int *) palloc(samplerows * sizeof(int));
	track = (ScalarMCVItem *) palloc(num_mcv * sizeof(ScalarMCVItem));

	/* look up the comparison function behind the "<" operator */
	SelectSortFunction(mystats->ltopr, &cmpFn, &cmpFnKind);
	fmgr_info(cmpFn, &f_cmpfn);

	/* Initial scan to find sortable values */
	for (i = 0; i < samplerows; i++)
	{
		Datum		value;
		bool		isnull;

		/* allow cancellation / cost-based vacuum delay between rows */
		vacuum_delay_point();

		value = fetchfunc(stats, i, &isnull);

		/* Check for null/nonnull */
		if (isnull)
		{
			null_cnt++;
			continue;
		}
		nonnull_cnt++;

		/*
		 * If it's a variable-width field, add up widths for average width

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?