analyze.c

来自「PostgreSQL 8.1.4的源码 适用于Linux下的开源数据库系统」· C语言 代码 · 共 2,242 行 · 第 1/5 页

C
2,242
字号
		 * calculation.  Note that if the value is toasted, we use the toasted		 * width.  We don't bother with this calculation if it's a fixed-width		 * type.		 */		if (is_varlena)		{			total_width += VARSIZE(DatumGetPointer(value));			/*			 * If the value is toasted, we want to detoast it just once to			 * avoid repeated detoastings and resultant excess memory usage			 * during the comparisons.	Also, check to see if the value is			 * excessively wide, and if so don't detoast at all --- just			 * ignore the value.			 */			if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)			{				toowide_cnt++;				continue;			}			value = PointerGetDatum(PG_DETOAST_DATUM(value));		}		else if (is_varwidth)		{			/* must be cstring */			total_width += strlen(DatumGetCString(value)) + 1;		}		/* Add it to the list to be sorted */		values[values_cnt].value = value;		values[values_cnt].tupno = values_cnt;		tupnoLink[values_cnt] = values_cnt;		values_cnt++;	}	/* We can only compute real stats if we found some sortable values. */	if (values_cnt > 0)	{		int			ndistinct,	/* # distinct values in sample */					nmultiple,	/* # that appear multiple times */					num_hist,					dups_cnt;		int			slot_idx = 0;		/* Sort the collected values */		datumCmpFn = &f_cmpfn;		datumCmpFnKind = cmpFnKind;		datumCmpTupnoLink = tupnoLink;		qsort((void *) values, values_cnt,			  sizeof(ScalarItem), compare_scalars);		/*		 * Now scan the values in order, find the most common ones, and also		 * accumulate ordering-correlation statistics.		 *		 * To determine which are most common, we first have to count the		 * number of duplicates of each value.	The duplicates are adjacent in		 * the sorted list, so a brute-force approach is to compare successive		 * datum values until we find two that are not equal. However, that		 * requires N-1 invocations of the datum comparison routine, which are		 * completely redundant with work that was done during the sort.  (The		 * sort algorithm must at some point have compared each pair of items		 * that are adjacent in the sorted order; otherwise it could not know		 * that it's ordered the pair correctly.) We exploit this by having		 * compare_scalars remember the highest tupno index that each		 * ScalarItem has been found equal to.	At the end of the sort, a		 * ScalarItem's tupnoLink will still point to itself if and only if it		 * is the last item of its group of duplicates (since the group will		 * be ordered by tupno).		 */		corr_xysum = 0;		ndistinct = 0;		nmultiple = 0;		dups_cnt = 0;		for (i = 0; i < values_cnt; i++)		{			int			tupno = values[i].tupno;			corr_xysum += ((double) i) * ((double) tupno);			dups_cnt++;			if (tupnoLink[tupno] == tupno)			{				/* Reached end of duplicates of this value */				ndistinct++;				if (dups_cnt > 1)				{					nmultiple++;					if (track_cnt < num_mcv ||						dups_cnt > track[track_cnt - 1].count)					{						/*						 * Found a new item for the mcv list; find its						 * position, bubbling down old items if needed. Loop						 * invariant is that j points at an empty/ replaceable						 * slot.						 */						int			j;						if (track_cnt < num_mcv)							track_cnt++;						for (j = track_cnt - 1; j > 0; j--)						{							if (dups_cnt <= track[j - 1].count)								break;							track[j].count = track[j - 1].count;							track[j].first = track[j - 1].first;						}						track[j].count = dups_cnt;						track[j].first = i + 1 - dups_cnt;					}				}				dups_cnt = 0;			}		}		stats->stats_valid = true;		/* Do the simple null-frac and width stats */		stats->stanullfrac = (double) null_cnt / (double) samplerows;		if (is_varwidth)			stats->stawidth = total_width / (double) nonnull_cnt;		else			stats->stawidth = stats->attrtype->typlen;		if (nmultiple == 0)		{			/* If we found no repeated values, assume it's a unique column */			stats->stadistinct = -1.0;		}		else if (toowide_cnt == 0 && nmultiple == ndistinct)		{			/*			 * Every value in the sample appeared more than once.  Assume the			 * column has just these values.			 */			stats->stadistinct = ndistinct;		}		else		{			/*----------			 * Estimate the number of distinct values using the estimator			 * proposed by Haas and Stokes in IBM Research Report RJ 10025:			 *		n*d / (n - f1 + f1*n/N)			 * where f1 is the number of distinct values that occurred			 * exactly once in our sample of n rows (from a total of N),			 * and d is the total number of distinct values in the sample.			 * This is their Duj1 estimator; the other estimators they			 * recommend are considerably more complex, and are numerically			 * very unstable when n is much smaller than N.			 *			 * Overwidth values are assumed to have been distinct.			 *----------			 */			int			f1 = ndistinct - nmultiple + toowide_cnt;			int			d = f1 + nmultiple;			double		numer,						denom,						stadistinct;			numer = (double) samplerows *(double) d;			denom = (double) (samplerows - f1) +				(double) f1 *(double) samplerows / totalrows;			stadistinct = numer / denom;			/* Clamp to sane range in case of roundoff error */			if (stadistinct < (double) d)				stadistinct = (double) d;			if (stadistinct > totalrows)				stadistinct = totalrows;			stats->stadistinct = floor(stadistinct + 0.5);		}		/*		 * If we estimated the number of distinct values at more than 10% of		 * the total row count (a very arbitrary limit), then assume that		 * stadistinct should scale with the row count rather than be a fixed		 * value.		 */		if (stats->stadistinct > 0.1 * totalrows)			stats->stadistinct = -(stats->stadistinct / totalrows);		/*		 * Decide how many values are worth storing as most-common values. If		 * we are able to generate a complete MCV list (all the values in the		 * sample will fit, and we think these are all the ones in the table),		 * then do so.	Otherwise, store only those values that are		 * significantly more common than the (estimated) average. We set the		 * threshold rather arbitrarily at 25% more than average, with at		 * least 2 instances in the sample.  Also, we won't suppress values		 * that have a frequency of at least 1/K where K is the intended		 * number of histogram bins; such values might otherwise cause us to		 * emit duplicate histogram bin boundaries.		 */		if (track_cnt == ndistinct && toowide_cnt == 0 &&			stats->stadistinct > 0 &&			track_cnt <= num_mcv)		{			/* Track list includes all values seen, and all will fit */			num_mcv = track_cnt;		}		else		{			double		ndistinct = stats->stadistinct;			double		avgcount,						mincount,						maxmincount;			if (ndistinct < 0)				ndistinct = -ndistinct * totalrows;			/* estimate # of occurrences in sample of a typical value */			avgcount = (double) samplerows / ndistinct;			/* set minimum threshold count to store a value */			mincount = avgcount * 1.25;			if (mincount < 2)				mincount = 2;			/* don't let threshold exceed 1/K, however */			maxmincount = (double) samplerows / (double) num_bins;			if (mincount > maxmincount)				mincount = maxmincount;			if (num_mcv > track_cnt)				num_mcv = track_cnt;			for (i = 0; i < num_mcv; i++)			{				if (track[i].count < mincount)				{					num_mcv = i;					break;				}			}		}		/* Generate MCV slot entry */		if (num_mcv > 0)		{			MemoryContext old_context;			Datum	   *mcv_values;			float4	   *mcv_freqs;			/* Must copy the target values into anl_context */			old_context = MemoryContextSwitchTo(stats->anl_context);			mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum));			mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4));			for (i = 0; i < num_mcv; i++)			{				mcv_values[i] = datumCopy(values[track[i].first].value,										  stats->attr->attbyval,										  stats->attr->attlen);				mcv_freqs[i] = (double) track[i].count / (double) samplerows;			}			MemoryContextSwitchTo(old_context);			stats->stakind[slot_idx] = STATISTIC_KIND_MCV;			stats->staop[slot_idx] = mystats->eqopr;			stats->stanumbers[slot_idx] = mcv_freqs;			stats->numnumbers[slot_idx] = num_mcv;			stats->stavalues[slot_idx] = mcv_values;			stats->numvalues[slot_idx] = num_mcv;			slot_idx++;		}		/*		 * Generate a histogram slot entry if there are at least two distinct		 * values not accounted for in the MCV list.  (This ensures the		 * histogram won't collapse to empty or a singleton.)		 */		num_hist = ndistinct - num_mcv;		if (num_hist > num_bins)			num_hist = num_bins + 1;		if (num_hist >= 2)		{			MemoryContext old_context;			Datum	   *hist_values;			int			nvals;			/* Sort the MCV items into position order to speed next loop */			qsort((void *) track, num_mcv,				  sizeof(ScalarMCVItem), compare_mcvs);			/*			 * Collapse out the MCV items from the values[] array.			 *			 * Note we destroy the values[] array here... but we don't need it			 * for anything more.  We do, however, still need values_cnt.			 * nvals will be the number of remaining entries in values[].			 */			if (num_mcv > 0)			{				int			src,							dest;				int			j;				src = dest = 0;				j = 0;			/* index of next interesting MCV item */				while (src < values_cnt)				{					int			ncopy;					if (j < num_mcv)					{						int			first = track[j].first;						if (src >= first)						{							/* advance past this MCV item */							src = first + track[j].count;							j++;							continue;						}						ncopy = first - src;					}					else						ncopy = values_cnt - src;					memmove(&values[dest], &values[src],							ncopy * sizeof(ScalarItem));					src += ncopy;					dest += ncopy;				}				nvals = dest;			}			else				nvals = values_cnt;			Assert(nvals >= num_hist);			/* Must copy the target values into anl_context */			old_context = MemoryContextSwitchTo(stats->anl_context);			hist_values = (Datum *) palloc(num_hist * sizeof(Datum));			for (i = 0; i < num_hist; i++)			{				int			pos;				pos = (i * (nvals - 1)) / (num_hist - 1);				hist_values[i] = datumCopy(values[pos].value,										   stats->attr->attbyval,										   stats->attr->attlen);			}			MemoryContextSwitchTo(old_context);			stats->stakind[slot_idx] = STATISTIC_KIND_HISTOGRAM;			stats->staop[slot_idx] = mystats->ltopr;			stats->stavalues[slot_idx] = hist_values;			stats->numvalues[slot_idx] = num_hist;			slot_idx++;		}		/* Generate a correlation entry if there are multiple values */		if (values_cnt > 1)		{			MemoryContext old_context;			float4	   *corrs;			double		corr_xsum,						corr_x2sum;			/* Must copy the target values into anl_context */			old_context = MemoryContextSwitchTo(stats->anl_context);			corrs = (float4 *) palloc(sizeof(float4));			MemoryContextSwitchTo(old_context);			/*----------			 * Since we know the x and y value sets are both			 *		0, 1, ..., values_cnt-1			 * we have sum(x) = sum(y) =			 *		(values_cnt-1)*values_cnt / 2			 * and sum(x^2) = sum(y^2) =			 *		(values_cnt-1)*values_cnt*(2*values_cnt-1) / 6.			 *----------			 */			corr_xsum = ((double) (values_cnt - 1)) *				((double) values_cnt) / 2.0;			corr_x2sum = ((double) (values_cnt - 1)) *				((double) values_cnt) * (double) (2 * values_cnt - 1) / 6.0;			/* And the correlation coefficient reduces to */			corrs[0] = (values_cnt * corr_xysum - corr_xsum * corr_xsum) /				(values_cnt * corr_x2sum - corr_xsum * corr_xsum);			stats->stakind[slot_idx] = STATISTIC_KIND_CORRELATION;			stats->staop[slot_idx] = mystats->ltopr;			stats->stanumbers[slot_idx] = corrs;			stats->numnumbers[slot_idx] = 1;			slot_idx++;		}	}	else if (nonnull_cnt == 0 && null_cnt > 0)	{		/* We found only nulls; assume the column is entirely null */		stats->stats_valid = true;		stats->stanullfrac = 1.0;		if (is_varwidth)			stats->stawidth = 0;	/* "unknown" */		else			stats->stawidth = stats->attrtype->typlen;		stats->stadistinct = 0.0;		/* "unknown" */	}	/* We don't need to bother cleaning up any of our temporary palloc's */}/* * qsort comparator for sorting ScalarItems * * Aside from sorting the items, we update the datumCmpTupnoLink[] array * whenever two ScalarItems are found to contain equal datums.	The array * is indexed by tupno; for each ScalarItem, it contains the highest * tupno that that item's datum has been found to be equal to.  This allows * us to avoid additional comparisons in compute_scalar_stats(). */static intcompare_scalars(const void *a, const void *b){	Datum		da = ((ScalarItem *) a)->value;	int			ta = ((ScalarItem *) a)->tupno;	Datum		db = ((ScalarItem *) b)->value;	int			tb = ((ScalarItem *) b)->tupno;	int32		compare;	compare = ApplySortFunction(datumCmpFn, datumCmpFnKind,								da, false, db, false);	if (compare != 0)		return compare;	/*	 * The two datums are equal, so update datumCmpTupnoLink[].	 */	if (datumCmpTupnoLink[ta] < tb)		datumCmpTupnoLink[ta] = tb;	if (datumCmpTupnoLink[tb] < ta)		datumCmpTupnoLink[tb] = ta;	/*	 * For equal datums, sort by tupno	 */	return ta - tb;}/* * qsort comparator for sorting ScalarMCVItems by position */static intcompare_mcvs(const void *a, const void *b){	int			da = ((ScalarMCVItem *) a)->first;	int			db = ((ScalarMCVItem *) b)->first;	return da - db;}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?