analyze.c

来自「PostgreSQL7.4.6 for Linux」· C语言 代码 · 共 1,819 行 · 第 1/4 页

C
1,819
字号
		int			firstcount1,					j;		CHECK_FOR_INTERRUPTS();		value = heap_getattr(tuple, stats->attnum, tupDesc, &isnull);		/* Check for null/nonnull */		if (isnull)		{			null_cnt++;			continue;		}		nonnull_cnt++;		/*		 * If it's a variable-width field, add up widths for average width		 * calculation.  Note that if the value is toasted, we use the		 * toasted width.  We don't bother with this calculation if it's a		 * fixed-width type.		 */		if (is_varlena)		{			total_width += VARSIZE(DatumGetPointer(value));			/*			 * If the value is toasted, we want to detoast it just once to			 * avoid repeated detoastings and resultant excess memory			 * usage during the comparisons.  Also, check to see if the			 * value is excessively wide, and if so don't detoast at all			 * --- just ignore the value.			 */			if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)			{				toowide_cnt++;				continue;			}			value = PointerGetDatum(PG_DETOAST_DATUM(value));		}		else if (is_varwidth)		{			/* must be cstring */			total_width += strlen(DatumGetCString(value)) + 1;		}		/*		 * See if the value matches anything we're already tracking.		 */		match = false;		firstcount1 = track_cnt;		for (j = 0; j < track_cnt; j++)		{			if (DatumGetBool(FunctionCall2(&f_cmpeq, value, track[j].value)))			{				match = true;				break;			}			if (j < firstcount1 && track[j].count == 1)				firstcount1 = j;		}		if (match)		{			/* Found a match */			track[j].count++;			/* This value may now need to "bubble up" in the track list */			while (j > 0 && track[j].count > track[j - 1].count)			{				swapDatum(track[j].value, track[j - 1].value);				swapInt(track[j].count, track[j - 1].count);				j--;			}		}		else		{			/* No match.  Insert at head of count-1 list */			if (track_cnt < track_max)				track_cnt++;			for (j = track_cnt - 1; j > firstcount1; j--)			{				track[j].value = track[j - 1].value;				track[j].count = track[j - 1].count;			}			if (firstcount1 < track_cnt)			{				track[firstcount1].value = value;				track[firstcount1].count = 1;			}		}	}	/* We can only compute valid stats if we found some non-null values. */	if (nonnull_cnt > 0)	{		int			nmultiple,					summultiple;		stats->stats_valid = true;		/* Do the simple null-frac and width stats */		stats->stanullfrac = (double) null_cnt / (double) numrows;		if (is_varwidth)			stats->stawidth = total_width / (double) nonnull_cnt;		else			stats->stawidth = stats->attrtype->typlen;		/* Count the number of values we found multiple times */		summultiple = 0;		for (nmultiple = 0; nmultiple < track_cnt; nmultiple++)		{			if (track[nmultiple].count == 1)				break;			summultiple += track[nmultiple].count;		}		if (nmultiple == 0)		{			/* If we found no repeated values, assume it's a unique column */			stats->stadistinct = -1.0;		}		else if (track_cnt < track_max && toowide_cnt == 0 &&				 nmultiple == track_cnt)		{			/*			 * Our track list includes every value in the sample, and			 * every value appeared more than once.  Assume the column has			 * just these values.			 */			stats->stadistinct = track_cnt;		}		else		{			/*----------			 * Estimate the number of distinct values using the estimator			 * proposed by Haas and Stokes in IBM Research Report RJ 10025:			 *		n*d / (n - f1 + f1*n/N)			 * where f1 is the number of distinct values that occurred			 * exactly once in our sample of n rows (from a total of N),			 * and d is the total number of distinct values in the sample.			 * This is their Duj1 estimator; the other estimators they			 * recommend are considerably more complex, and are numerically			 * very unstable when n is much smaller than N.			 *			 * We assume (not very reliably!) that all the multiply-occurring			 * values are reflected in the final track[] list, and the other			 * nonnull values all appeared but once.  (XXX this usually			 * results in a drastic overestimate of ndistinct.	Can we do			 * any better?)			 *----------			 */			int			f1 = nonnull_cnt - summultiple;			int			d = f1 + nmultiple;			double		numer,						denom,						stadistinct;			numer = (double) numrows *(double) d;			denom = (double) (numrows - f1) +				(double) f1 *(double) numrows / totalrows;			stadistinct = numer / denom;			/* Clamp to sane range in case of roundoff error */			if (stadistinct < (double) d)				stadistinct = (double) d;			if (stadistinct > totalrows)				stadistinct = totalrows;			stats->stadistinct = floor(stadistinct + 0.5);		}		/*		 * If we estimated the number of distinct values at more than 10%		 * of the total row count (a very arbitrary limit), then assume		 * that stadistinct should scale with the row count rather than be		 * a fixed value.		 */		if (stats->stadistinct > 0.1 * totalrows)			stats->stadistinct = -(stats->stadistinct / totalrows);		/*		 * Decide how many values are worth storing as most-common values.		 * If we are able to generate a complete MCV list (all the values		 * in the sample will fit, and we think these are all the ones in		 * the table), then do so.	Otherwise, store only those values		 * that are significantly more common than the (estimated)		 * average. We set the threshold rather arbitrarily at 25% more		 * than average, with at least 2 instances in the sample.		 */		if (track_cnt < track_max && toowide_cnt == 0 &&			stats->stadistinct > 0 &&			track_cnt <= num_mcv)		{			/* Track list includes all values seen, and all will fit */			num_mcv = track_cnt;		}		else		{			double		ndistinct = stats->stadistinct;			double		avgcount,						mincount;			if (ndistinct < 0)				ndistinct = -ndistinct * totalrows;			/* estimate # of occurrences in sample of a typical value */			avgcount = (double) numrows / ndistinct;			/* set minimum threshold count to store a value */			mincount = avgcount * 1.25;			if (mincount < 2)				mincount = 2;			if (num_mcv > track_cnt)				num_mcv = track_cnt;			for (i = 0; i < num_mcv; i++)			{				if (track[i].count < mincount)				{					num_mcv = i;					break;				}			}		}		/* Generate MCV slot entry */		if (num_mcv > 0)		{			MemoryContext old_context;			Datum	   *mcv_values;			float4	   *mcv_freqs;			/* Must copy the target values into anl_context */			old_context = MemoryContextSwitchTo(anl_context);			mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum));			mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4));			for (i = 0; i < num_mcv; i++)			{				mcv_values[i] = datumCopy(track[i].value,										  stats->attr->attbyval,										  stats->attr->attlen);				mcv_freqs[i] = (double) track[i].count / (double) numrows;			}			MemoryContextSwitchTo(old_context);			stats->stakind[0] = STATISTIC_KIND_MCV;			stats->staop[0] = stats->eqopr;			stats->stanumbers[0] = mcv_freqs;			stats->numnumbers[0] = num_mcv;			stats->stavalues[0] = mcv_values;			stats->numvalues[0] = num_mcv;		}	}	/* We don't need to bother cleaning up any of our temporary palloc's */}/* *	compute_scalar_stats() -- compute column statistics * *	We use this when we can find "=" and "<" operators for the datatype. * *	We determine the fraction of non-null rows, the average width, the *	most common values, the (estimated) number of distinct values, the *	distribution histogram, and the correlation of physical to logical order. * *	The desired stats can be determined fairly easily after sorting the *	data values into order. */static voidcompute_scalar_stats(VacAttrStats *stats,					 TupleDesc tupDesc, double totalrows,					 HeapTuple *rows, int numrows){	int			i;	int			null_cnt = 0;	int			nonnull_cnt = 0;	int			toowide_cnt = 0;	double		total_width = 0;	bool		is_varlena = (!stats->attr->attbyval &&							  stats->attr->attlen == -1);	bool		is_varwidth = (!stats->attr->attbyval &&							   stats->attr->attlen < 0);	double		corr_xysum;	RegProcedure cmpFn;	SortFunctionKind cmpFnKind;	FmgrInfo	f_cmpfn;	ScalarItem *values;	int			values_cnt = 0;	int		   *tupnoLink;	ScalarMCVItem *track;	int			track_cnt = 0;	int			num_mcv = stats->attr->attstattarget;	int			num_bins = stats->attr->attstattarget;	values = (ScalarItem *) palloc(numrows * sizeof(ScalarItem));	tupnoLink = (int *) palloc(numrows * sizeof(int));	track = (ScalarMCVItem *) palloc(num_mcv * sizeof(ScalarMCVItem));	SelectSortFunction(stats->ltopr, &cmpFn, &cmpFnKind);	fmgr_info(cmpFn, &f_cmpfn);	/* Initial scan to find sortable values */	for (i = 0; i < numrows; i++)	{		HeapTuple	tuple = rows[i];		Datum		value;		bool		isnull;		CHECK_FOR_INTERRUPTS();		value = heap_getattr(tuple, stats->attnum, tupDesc, &isnull);		/* Check for null/nonnull */		if (isnull)		{			null_cnt++;			continue;		}		nonnull_cnt++;		/*		 * If it's a variable-width field, add up widths for average width		 * calculation.  Note that if the value is toasted, we use the		 * toasted width.  We don't bother with this calculation if it's a		 * fixed-width type.		 */		if (is_varlena)		{			total_width += VARSIZE(DatumGetPointer(value));			/*			 * If the value is toasted, we want to detoast it just once to			 * avoid repeated detoastings and resultant excess memory			 * usage during the comparisons.  Also, check to see if the			 * value is excessively wide, and if so don't detoast at all			 * --- just ignore the value.			 */			if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)			{				toowide_cnt++;				continue;			}			value = PointerGetDatum(PG_DETOAST_DATUM(value));		}		else if (is_varwidth)		{			/* must be cstring */			total_width += strlen(DatumGetCString(value)) + 1;		}		/* Add it to the list to be sorted */		values[values_cnt].value = value;		values[values_cnt].tupno = values_cnt;		tupnoLink[values_cnt] = values_cnt;		values_cnt++;	}	/* We can only compute valid stats if we found some sortable values. */	if (values_cnt > 0)	{		int			ndistinct,	/* # distinct values in sample */					nmultiple,	/* # that appear multiple times */					num_hist,					dups_cnt;		int			slot_idx = 0;		/* Sort the collected values */		datumCmpFn = &f_cmpfn;		datumCmpFnKind = cmpFnKind;		datumCmpTupnoLink = tupnoLink;		qsort((void *) values, values_cnt,			  sizeof(ScalarItem), compare_scalars);		/*		 * Now scan the values in order, find the most common ones, and		 * also accumulate ordering-correlation statistics.		 *		 * To determine which are most common, we first have to count the		 * number of duplicates of each value.	The duplicates are		 * adjacent in the sorted list, so a brute-force approach is to		 * compare successive datum values until we find two that are not		 * equal. However, that requires N-1 invocations of the datum		 * comparison routine, which are completely redundant with work		 * that was done during the sort.  (The sort algorithm must at		 * some point have compared each pair of items that are adjacent		 * in the sorted order; otherwise it could not know that it's		 * ordered the pair correctly.) We exploit this by having		 * compare_scalars remember the highest tupno index that each		 * ScalarItem has been found equal to.	At the end of the sort, a		 * ScalarItem's tupnoLink will still point to itself if and only		 * if it is the last item of its group of duplicates (since the		 * group will be ordered by tupno).		 */		corr_xysum = 0;		ndistinct = 0;		nmultiple = 0;		dups_cnt = 0;		for (i = 0; i < values_cnt; i++)		{			int			tupno = values[i].tupno;			corr_xysum += ((double) i) * ((double) tupno);			dups_cnt++;			if (tupnoLink[tupno] == tupno)			{				/* Reached end of duplicates of this value */				ndistinct++;				if (dups_cnt > 1)				{					nmultiple++;					if (track_cnt < num_mcv ||						dups_cnt > track[track_cnt - 1].count)					{						/*						 * Found a new item for the mcv list; find its						 * position, bubbling down old items if needed.						 * Loop invariant is that j points at an empty/						 * replaceable slot.						 */						int			j;						if (track_cnt < num_mcv)							track_cnt++;						for (j = track_cnt - 1; j > 0; j--)						{							if (dups_cnt <= track[j - 1].count)								break;							track[j].count = track[j - 1].count;							track[j].first = track[j - 1].first;						}						track[j].count = dups_cnt;						track[j].first = i + 1 - dups_cnt;					}				}				dups_cnt = 0;			}		}		stats->stats_valid = true;		/* Do the simple null-frac and width stats */		stats->stanullfrac = (double) null_cnt / (double) numrows;		if (is_varwidth)			stats->stawidth = total_width / (double) nonnull_cnt;		else			stats->stawidth = stats->attrtype->typlen;		if (nmultiple == 0)		{			/* If we found no repeated values, assume it's a unique column */			stats->stadistinct = -1.0;		}		else if (toowide_cnt == 0 && nmultiple == ndistinct)		{			/*			 * Every value in the sample appeared more than once.  Assume			 * the column has just these values.			 */			stats->stadistinct = ndistinct;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?