analyze.c

来自「PostgreSQL7.4.6 for Linux」· C语言 代码 · 共 1,819 行 · 第 1/4 页

C
1,819
字号
		}		else		{			/*----------			 * Estimate the number of distinct values using the estimator			 * proposed by Haas and Stokes in IBM Research Report RJ 10025:			 *		n*d / (n - f1 + f1*n/N)			 * where f1 is the number of distinct values that occurred			 * exactly once in our sample of n rows (from a total of N),			 * and d is the total number of distinct values in the sample.			 * This is their Duj1 estimator; the other estimators they			 * recommend are considerably more complex, and are numerically			 * very unstable when n is much smaller than N.			 *			 * Overwidth values are assumed to have been distinct.			 *----------			 */			int			f1 = ndistinct - nmultiple + toowide_cnt;			int			d = f1 + nmultiple;			double		numer,						denom,						stadistinct;			numer = (double) numrows *(double) d;			denom = (double) (numrows - f1) +				(double) f1 *(double) numrows / totalrows;			stadistinct = numer / denom;			/* Clamp to sane range in case of roundoff error */			if (stadistinct < (double) d)				stadistinct = (double) d;			if (stadistinct > totalrows)				stadistinct = totalrows;			stats->stadistinct = floor(stadistinct + 0.5);		}		/*		 * If we estimated the number of distinct values at more than 10%		 * of the total row count (a very arbitrary limit), then assume		 * that stadistinct should scale with the row count rather than be		 * a fixed value.		 */		if (stats->stadistinct > 0.1 * totalrows)			stats->stadistinct = -(stats->stadistinct / totalrows);		/*		 * Decide how many values are worth storing as most-common values.		 * If we are able to generate a complete MCV list (all the values		 * in the sample will fit, and we think these are all the ones in		 * the table), then do so.	Otherwise, store only those values		 * that are significantly more common than the (estimated)		 * average. We set the threshold rather arbitrarily at 25% more		 * than average, with at least 2 instances in the sample.  
Also,		 * we won't suppress values that have a frequency of at least 1/K		 * where K is the intended number of histogram bins; such values		 * might otherwise cause us to emit duplicate histogram bin		 * boundaries.		 */		if (track_cnt == ndistinct && toowide_cnt == 0 &&			stats->stadistinct > 0 &&			track_cnt <= num_mcv)		{			/* Track list includes all values seen, and all will fit */			num_mcv = track_cnt;		}		else		{			double		ndistinct = stats->stadistinct;			double		avgcount,						mincount,						maxmincount;			if (ndistinct < 0)				ndistinct = -ndistinct * totalrows;			/* estimate # of occurrences in sample of a typical value */			avgcount = (double) numrows / ndistinct;			/* set minimum threshold count to store a value */			mincount = avgcount * 1.25;			if (mincount < 2)				mincount = 2;			/* don't let threshold exceed 1/K, however */			maxmincount = (double) numrows / (double) num_bins;			if (mincount > maxmincount)				mincount = maxmincount;			if (num_mcv > track_cnt)				num_mcv = track_cnt;			for (i = 0; i < num_mcv; i++)			{				if (track[i].count < mincount)				{					num_mcv = i;					break;				}			}		}		/* Generate MCV slot entry */		if (num_mcv > 0)		{			MemoryContext old_context;			Datum	   *mcv_values;			float4	   *mcv_freqs;			/* Must copy the target values into anl_context */			old_context = MemoryContextSwitchTo(anl_context);			mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum));			mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4));			for (i = 0; i < num_mcv; i++)			{				mcv_values[i] = datumCopy(values[track[i].first].value,										  stats->attr->attbyval,										  stats->attr->attlen);				mcv_freqs[i] = (double) track[i].count / (double) numrows;			}			MemoryContextSwitchTo(old_context);			stats->stakind[slot_idx] = STATISTIC_KIND_MCV;			stats->staop[slot_idx] = stats->eqopr;			stats->stanumbers[slot_idx] = mcv_freqs;			stats->numnumbers[slot_idx] = num_mcv;			stats->stavalues[slot_idx] = mcv_values;			stats->numvalues[slot_idx] = 
num_mcv;			slot_idx++;		}		/*		 * Generate a histogram slot entry if there are at least two		 * distinct values not accounted for in the MCV list.  (This		 * ensures the histogram won't collapse to empty or a singleton.)		 */		num_hist = ndistinct - num_mcv;		if (num_hist > num_bins)			num_hist = num_bins + 1;		if (num_hist >= 2)		{			MemoryContext old_context;			Datum	   *hist_values;			int			nvals;			/* Sort the MCV items into position order to speed next loop */			qsort((void *) track, num_mcv,				  sizeof(ScalarMCVItem), compare_mcvs);			/*			 * Collapse out the MCV items from the values[] array.			 *			 * Note we destroy the values[] array here... but we don't need			 * it for anything more.  We do, however, still need			 * values_cnt. nvals will be the number of remaining entries			 * in values[].			 */			if (num_mcv > 0)			{				int			src,							dest;				int			j;				src = dest = 0;				j = 0;			/* index of next interesting MCV item */				while (src < values_cnt)				{					int			ncopy;					if (j < num_mcv)					{						int			first = track[j].first;						if (src >= first)						{							/* advance past this MCV item */							src = first + track[j].count;							j++;							continue;						}						ncopy = first - src;					}					else						ncopy = values_cnt - src;					memmove(&values[dest], &values[src],							ncopy * sizeof(ScalarItem));					src += ncopy;					dest += ncopy;				}				nvals = dest;			}			else				nvals = values_cnt;			Assert(nvals >= num_hist);			/* Must copy the target values into anl_context */			old_context = MemoryContextSwitchTo(anl_context);			hist_values = (Datum *) palloc(num_hist * sizeof(Datum));			for (i = 0; i < num_hist; i++)			{				int			pos;				pos = (i * (nvals - 1)) / (num_hist - 1);				hist_values[i] = datumCopy(values[pos].value,										   stats->attr->attbyval,										   stats->attr->attlen);			}			MemoryContextSwitchTo(old_context);			stats->stakind[slot_idx] = STATISTIC_KIND_HISTOGRAM;			stats->staop[slot_idx] = stats->ltopr;			
stats->stavalues[slot_idx] = hist_values;			stats->numvalues[slot_idx] = num_hist;			slot_idx++;		}		/* Generate a correlation entry if there are multiple values */		if (values_cnt > 1)		{			MemoryContext old_context;			float4	   *corrs;			double		corr_xsum,						corr_x2sum;			/* Must copy the target values into anl_context */			old_context = MemoryContextSwitchTo(anl_context);			corrs = (float4 *) palloc(sizeof(float4));			MemoryContextSwitchTo(old_context);			/*----------			 * Since we know the x and y value sets are both			 *		0, 1, ..., values_cnt-1			 * we have sum(x) = sum(y) =			 *		(values_cnt-1)*values_cnt / 2			 * and sum(x^2) = sum(y^2) =			 *		(values_cnt-1)*values_cnt*(2*values_cnt-1) / 6.			 *----------			 */			corr_xsum = ((double) (values_cnt - 1)) *				((double) values_cnt) / 2.0;			corr_x2sum = ((double) (values_cnt - 1)) *				((double) values_cnt) * (double) (2 * values_cnt - 1) / 6.0;			/* And the correlation coefficient reduces to */			corrs[0] = (values_cnt * corr_xysum - corr_xsum * corr_xsum) /				(values_cnt * corr_x2sum - corr_xsum * corr_xsum);			stats->stakind[slot_idx] = STATISTIC_KIND_CORRELATION;			stats->staop[slot_idx] = stats->ltopr;			stats->stanumbers[slot_idx] = corrs;			stats->numnumbers[slot_idx] = 1;			slot_idx++;		}	}	/* We don't need to bother cleaning up any of our temporary palloc's */}/* * qsort comparator for sorting ScalarItems * * Aside from sorting the items, we update the datumCmpTupnoLink[] array * whenever two ScalarItems are found to contain equal datums.	The array * is indexed by tupno; for each ScalarItem, it contains the highest * tupno that that item's datum has been found to be equal to.  This allows * us to avoid additional comparisons in compute_scalar_stats(). 
*/static intcompare_scalars(const void *a, const void *b){	Datum		da = ((ScalarItem *) a)->value;	int			ta = ((ScalarItem *) a)->tupno;	Datum		db = ((ScalarItem *) b)->value;	int			tb = ((ScalarItem *) b)->tupno;	int32		compare;	compare = ApplySortFunction(datumCmpFn, datumCmpFnKind,								da, false, db, false);	if (compare != 0)		return compare;	/*	 * The two datums are equal, so update datumCmpTupnoLink[].	 */	if (datumCmpTupnoLink[ta] < tb)		datumCmpTupnoLink[ta] = tb;	if (datumCmpTupnoLink[tb] < ta)		datumCmpTupnoLink[tb] = ta;	/*	 * For equal datums, sort by tupno	 */	return ta - tb;}/* * qsort comparator for sorting ScalarMCVItems by position */static intcompare_mcvs(const void *a, const void *b){	int			da = ((ScalarMCVItem *) a)->first;	int			db = ((ScalarMCVItem *) b)->first;	return da - db;}/* *	update_attstats() -- update attribute statistics for one relation * *		Statistics are stored in several places: the pg_class row for the *		relation has stats about the whole relation, and there is a *		pg_statistic row for each (non-system) attribute that has ever *		been analyzed.	The pg_class values are updated by VACUUM, not here. * *		pg_statistic rows are just added or updated normally.  This means *		that pg_statistic will probably contain some deleted rows at the *		completion of a vacuum cycle, unless it happens to get vacuumed last. * *		To keep things simple, we punt for pg_statistic, and don't try *		to compute or store rows for pg_statistic itself in pg_statistic. *		This could possibly be made to work, but it's not worth the trouble. *		Note analyze_rel() has seen to it that we won't come here when *		vacuuming pg_statistic itself. * *		Note: if two backends concurrently try to analyze the same relation, *		the second one is likely to fail here with a "tuple concurrently *		updated" error.  This is slightly annoying, but no real harm is done. 
 *		We could prevent the problem by using a stronger lock on the
 *		relation for ANALYZE (ie, ShareUpdateExclusiveLock instead
 *		of AccessShareLock); but that cure seems worse than the disease,
 *		especially now that ANALYZE doesn't start a new transaction
 *		for each relation.	The lock could be held for a long time...
 */
static void
update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats)
{
	Relation	sd;
	int			attno;

	sd = heap_openr(StatisticRelationName, RowExclusiveLock);

	/* One pg_statistic row per analyzed attribute of the relation */
	for (attno = 0; attno < natts; attno++)
	{
		VacAttrStats *stats = vacattrstats[attno];
		HeapTuple	stup,
					oldtup;
		int			i,
					k,
					n;
		Datum		values[Natts_pg_statistic];
		char		nulls[Natts_pg_statistic];
		char		replaces[Natts_pg_statistic];

		/* Ignore attr if we weren't able to collect stats */
		if (!stats->stats_valid)
			continue;

		/*
		 * Construct a new pg_statistic tuple
		 */
		/* Default every column to not-null (' ') and to-be-replaced ('r') */
		for (i = 0; i < Natts_pg_statistic; ++i)
		{
			nulls[i] = ' ';
			replaces[i] = 'r';
		}

		/*
		 * Fill values[] in pg_statistic column order; i advances in
		 * lockstep with the catalog's attribute numbering, so the order
		 * of the assignments below must not be changed.
		 */
		i = 0;
		values[i++] = ObjectIdGetDatum(relid);	/* starelid */
		values[i++] = Int16GetDatum(stats->attnum);		/* staattnum */
		values[i++] = Float4GetDatum(stats->stanullfrac);		/* stanullfrac */
		values[i++] = Int32GetDatum(stats->stawidth);	/* stawidth */
		values[i++] = Float4GetDatum(stats->stadistinct);		/* stadistinct */
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
			values[i++] = Int16GetDatum(stats->stakind[k]);		/* stakindN */
		}
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
			values[i++] = ObjectIdGetDatum(stats->staop[k]);	/* staopN */
		}
		/* stanumbersN: each slot's float4 array, or NULL if slot unused */
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
			int			nnum = stats->numnumbers[k];

			if (nnum > 0)
			{
				Datum	   *numdatums = (Datum *) palloc(nnum * sizeof(Datum));
				ArrayType  *arry;

				for (n = 0; n < nnum; n++)
					numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]);
				/* XXX knows more than it should about type float4: */
				arry = construct_array(numdatums, nnum,
									   FLOAT4OID,
									   sizeof(float4), false, 'i');
				values[i++] = PointerGetDatum(arry);	/* stanumbersN */
			}
			else
			{
				/* 'n' marks the column null */
				nulls[i] = 'n';
				values[i++] = (Datum) 0;
			}
		}
		/* stavaluesN: arrays of the column's own type, or NULL */
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
			if (stats->numvalues[k] > 0)
			{
				ArrayType  *arry;

				arry = construct_array(stats->stavalues[k],
									   stats->numvalues[k],
									   stats->attr->atttypid,
									   stats->attrtype->typlen,
									   stats->attrtype->typbyval,
									   stats->attrtype->typalign);
				values[i++] = PointerGetDatum(arry);	/* stavaluesN */
			}
			else
			{
				nulls[i] = 'n';
				values[i++] = (Datum) 0;
			}
		}

		/* Is there already a pg_statistic tuple for this attribute? */
		oldtup = SearchSysCache(STATRELATT,
								ObjectIdGetDatum(relid),
								Int16GetDatum(stats->attnum),
								0, 0);

		if (HeapTupleIsValid(oldtup))
		{
			/* Yes, replace it */
			stup = heap_modifytuple(oldtup,
									sd,
									values,
									nulls,
									replaces);
			/* done with the cache copy before performing the update */
			ReleaseSysCache(oldtup);
			simple_heap_update(sd, &stup->t_self, stup);
		}
		else
		{
			/* No, insert new tuple */
			stup = heap_formtuple(sd->rd_att, values, nulls);
			simple_heap_insert(sd, stup);
		}

		/* update indexes too */
		CatalogUpdateIndexes(sd, stup);

		heap_freetuple(stup);
	}

	/* keep the RowExclusiveLock until transaction end, per usual practice */
	heap_close(sd, RowExclusiveLock);
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?