selfuncs.c

来自「PostgreSQL7.4.6 for Linux」· C语言 代码 · 共 2,384 行 · 第 1/5 页

C
2,384
字号
	if (!op_mergejoinable(opno,						  &lsortop,						  &rsortop))		return;					/* shouldn't happen */	/* Try to get maximum values of both vars */	if (!get_var_maximum(root, left, lsortop, &leftmax))		return;					/* no max available from stats */	if (!get_var_maximum(root, right, rsortop, &rightmax))		return;					/* no max available from stats */	/* Look up the "left < right" and "left > right" operators */	op_mergejoin_crossops(opno, &ltop, &gtop, NULL, NULL);	/* Look up the "left <= right" operator */	leop = get_negator(gtop);	if (!OidIsValid(leop))		return;					/* insufficient info in catalogs */	/* Look up the "right > left" operator */	revgtop = get_commutator(ltop);	if (!OidIsValid(revgtop))		return;					/* insufficient info in catalogs */	/* Look up the "right <= left" operator */	revleop = get_negator(revgtop);	if (!OidIsValid(revleop))		return;					/* insufficient info in catalogs */	/*	 * Now, the fraction of the left variable that will be scanned is the	 * fraction that's <= the right-side maximum value.  But only believe	 * non-default estimates, else stick with our 1.0.	 */	selec = scalarineqsel(root, leop, false, left,						  rightmax, righttype);	if (selec != DEFAULT_INEQ_SEL)		*leftscan = selec;	/* And similarly for the right variable. */	selec = scalarineqsel(root, revleop, false, right,						  leftmax, lefttype);	if (selec != DEFAULT_INEQ_SEL)		*rightscan = selec;	/*	 * Only one of the two fractions can really be less than 1.0; believe	 * the smaller estimate and reset the other one to exactly 1.0.  If we	 * get exactly equal estimates (as can easily happen with self-joins),	 * believe neither.	 
*/	if (*leftscan > *rightscan)		*leftscan = 1.0;	else if (*leftscan < *rightscan)		*rightscan = 1.0;	else		*leftscan = *rightscan = 1.0;}/* * estimate_num_groups		- Estimate number of groups in a grouped query * * Given a query having a GROUP BY clause, estimate how many groups there * will be --- ie, the number of distinct combinations of the GROUP BY * expressions. * * This routine is also used to estimate the number of rows emitted by * a DISTINCT filtering step; that is an isomorphic problem.  (Note: * actually, we only use it for DISTINCT when there's no grouping or * aggregation ahead of the DISTINCT.) * * Inputs: *	root - the query *	groupExprs - list of expressions being grouped by *	input_rows - number of rows estimated to arrive at the group/unique *		filter step * * Given the lack of any cross-correlation statistics in the system, it's * impossible to do anything really trustworthy with GROUP BY conditions * involving multiple Vars.  We should however avoid assuming the worst * case (all possible cross-product terms actually appear as groups) since * very often the grouped-by Vars are highly correlated.  Our current approach * is as follows: *	1.	Reduce the given expressions to a list of unique Vars used.  For *		example, GROUP BY a, a + b is treated the same as GROUP BY a, b. *		It is clearly correct not to count the same Var more than once. *		It is also reasonable to treat f(x) the same as x: f() cannot *		increase the number of distinct values (unless it is volatile, *		which we consider unlikely for grouping), but it probably won't *		reduce the number of distinct values much either. *	2.	If the list contains Vars of different relations that are known equal *		due to equijoin clauses, then drop all but one of the Vars from each *		known-equal set, keeping the one with smallest estimated # of values *		(since the extra values of the others can't appear in joined rows). 
*		Note the reason we only consider Vars of different relations is that *		if we considered ones of the same rel, we'd be double-counting the *		restriction selectivity of the equality in the next step. *	3.	For Vars within a single source rel, we multiply together the numbers *		of values, clamp to the number of rows in the rel, and then multiply *		by the selectivity of the restriction clauses for that rel.  The *		initial product is probably too high (it's the worst case) but since *		we can clamp to the rel's rows it won't be hugely bad.	Multiplying *		by the restriction selectivity is effectively assuming that the *		restriction clauses are independent of the grouping, which is a crummy *		assumption, but it's hard to do better. *	4.	If there are Vars from multiple rels, we repeat step 3 for each such *		rel, and multiply the results together. * Note that rels not containing grouped Vars are ignored completely, as are * join clauses other than the equijoin clauses used in step 2.  Such rels * cannot increase the number of groups, and we assume such clauses do not * reduce the number either (somewhat bogus, but we don't have the info to * do better). */doubleestimate_num_groups(Query *root, List *groupExprs, double input_rows){	List	   *allvars = NIL;	List	   *varinfos = NIL;	double		numdistinct;	List	   *l;	typedef struct	{							/* varinfos is a List of these */		Var		   *var;		double		ndistinct;	} MyVarInfo;	/* We should not be called unless query has GROUP BY (or DISTINCT) */	Assert(groupExprs != NIL);	/* Step 1: get the unique Vars used */	foreach(l, groupExprs)	{		Node	   *groupexpr = (Node *) lfirst(l);		List	   *varshere;		varshere = pull_var_clause(groupexpr, false);		/*		 * If we find any variable-free GROUP BY item, then either it is a		 * constant (and we can ignore it) or it contains a volatile		 * function; in the latter case we punt and assume that each input		 * row will yield a distinct group.		 
*/		if (varshere == NIL)		{			if (contain_volatile_functions(groupexpr))				return input_rows;			continue;		}		allvars = nconc(allvars, varshere);	}	/* If now no Vars, we must have an all-constant GROUP BY list. */	if (allvars == NIL)		return 1.0;	/* Use set_union() to discard duplicates */	allvars = set_union(NIL, allvars);	/*	 * Step 2: acquire statistical estimate of number of distinct values	 * of each Var (total in its table, without regard for filtering).	 * Also, detect known-equal Vars and discard the ones we don't want.	 */	foreach(l, allvars)	{		Var		   *var = (Var *) lfirst(l);		Oid			relid = getrelid(var->varno, root->rtable);		HeapTuple	statsTuple = NULL;		Form_pg_statistic stats = NULL;		double		ndistinct;		bool		keep = true;		List	   *l2;		if (OidIsValid(relid))		{			statsTuple = SearchSysCache(STATRELATT,										ObjectIdGetDatum(relid),										Int16GetDatum(var->varattno),										0, 0);			if (HeapTupleIsValid(statsTuple))				stats = (Form_pg_statistic) GETSTRUCT(statsTuple);		}		ndistinct = get_att_numdistinct(root, var, stats);		if (HeapTupleIsValid(statsTuple))			ReleaseSysCache(statsTuple);		/* cannot use foreach here because of possible lremove */		l2 = varinfos;		while (l2)		{			MyVarInfo  *varinfo = (MyVarInfo *) lfirst(l2);			/* must advance l2 before lremove possibly pfree's it */			l2 = lnext(l2);			if (var->varno != varinfo->var->varno &&			exprs_known_equal(root, (Node *) var, (Node *) varinfo->var))			{				/* Found a match */				if (varinfo->ndistinct <= ndistinct)				{					/* Keep older item, forget new one */					keep = false;					break;				}				else				{					/* Delete the older item */					varinfos = lremove(varinfo, varinfos);				}			}		}		if (keep)		{			MyVarInfo  *varinfo = (MyVarInfo *) palloc(sizeof(MyVarInfo));			varinfo->var = var;			varinfo->ndistinct = ndistinct;			varinfos = lcons(varinfo, varinfos);		}	}	/*	 * Steps 3/4: group Vars by relation and estimate total numdistinct.	 
*	 * For each iteration of the outer loop, we process the frontmost Var in	 * varinfos, plus all other Vars in the same relation.	We remove	 * these Vars from the newvarinfos list for the next iteration. This	 * is the easiest way to group Vars of same rel together.	 */	Assert(varinfos != NIL);	numdistinct = 1.0;	do	{		MyVarInfo  *varinfo1 = (MyVarInfo *) lfirst(varinfos);		RelOptInfo *rel = find_base_rel(root, varinfo1->var->varno);		double		reldistinct = varinfo1->ndistinct;		List	   *newvarinfos = NIL;		/*		 * Get the largest numdistinct estimate of the Vars for this rel.		 * Also, construct new varinfos list of remaining Vars.		 */		foreach(l, lnext(varinfos))		{			MyVarInfo  *varinfo2 = (MyVarInfo *) lfirst(l);			if (varinfo2->var->varno == varinfo1->var->varno)				reldistinct *= varinfo2->ndistinct;			else			{				/* not time to process varinfo2 yet */				newvarinfos = lcons(varinfo2, newvarinfos);			}		}		/*		 * Sanity check --- don't divide by zero if empty relation.		 */		Assert(rel->reloptkind == RELOPT_BASEREL);		if (rel->tuples > 0)		{			/*			 * Clamp to size of rel, multiply by restriction selectivity.			 */			if (reldistinct > rel->tuples)				reldistinct = rel->tuples;			reldistinct *= rel->rows / rel->tuples;			/*			 * Update estimate of total distinct groups.			 */			numdistinct *= reldistinct;		}		varinfos = newvarinfos;	} while (varinfos != NIL);	numdistinct = ceil(numdistinct);	/* Guard against out-of-range answers */	if (numdistinct > input_rows)		numdistinct = input_rows;	if (numdistinct < 1.0)		numdistinct = 1.0;	return numdistinct;}/*------------------------------------------------------------------------- * * Support routines * *------------------------------------------------------------------------- *//* * get_var_maximum *		Estimate the maximum value of the specified variable. *		If successful, store value in *max and return TRUE. *		If no data available, return FALSE. * * sortop is the "<" comparison operator to use.  
* (To extract the minimum instead of the maximum, just pass the ">"
 * operator instead.)
 */
static bool
get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max)
{
	Datum		tmax = 0;
	bool		have_max = false;
	Oid			relid;
	HeapTuple	statsTuple;
	Form_pg_statistic stats;
	int16		typLen;
	bool		typByVal;
	Datum	   *values;
	int			nvalues;
	int			i;

	relid = getrelid(var->varno, root->rtable);
	if (relid == InvalidOid)
		return false;

	/* get stats for the attribute */
	statsTuple = SearchSysCache(STATRELATT,
								ObjectIdGetDatum(relid),
								Int16GetDatum(var->varattno),
								0, 0);
	if (!HeapTupleIsValid(statsTuple))
	{
		/* no stats available, so default result */
		return false;
	}
	stats = (Form_pg_statistic) GETSTRUCT(statsTuple);

	/* need the type's len/byval info to make durable copies of Datums */
	get_typlenbyval(var->vartype, &typLen, &typByVal);

	/*
	 * If there is a histogram, grab the last or first value as
	 * appropriate.
	 *
	 * If there is a histogram that is sorted with some other operator than
	 * the one we want, fail --- this suggests that there is data we can't
	 * use.
	 */
	if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
						 STATISTIC_KIND_HISTOGRAM, sortop,
						 &values, &nvalues,
						 NULL, NULL))
	{
		/* histogram is in sortop ("<") order: last entry is the max */
		if (nvalues > 0)
		{
			/* copy before free_attstatsslot releases the slot's memory */
			tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
			have_max = true;
		}
		free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
	}
	else
	{
		Oid			rsortop = get_commutator(sortop);

		if (OidIsValid(rsortop) &&
			get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
							 STATISTIC_KIND_HISTOGRAM, rsortop,
							 &values, &nvalues,
							 NULL, NULL))
		{
			/* histogram is in reverse (">") order: first entry is the max */
			if (nvalues > 0)
			{
				tmax = datumCopy(values[0], typByVal, typLen);
				have_max = true;
			}
			free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
		}
		else if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
								  STATISTIC_KIND_HISTOGRAM, InvalidOid,
								  &values, &nvalues,
								  NULL, NULL))
		{
			/*
			 * There is a histogram, but it's sorted by some operator we
			 * can't relate to sortop --- per the comment above, give up
			 * rather than risk using data sorted in an unknown order.
			 */
			free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
			ReleaseSysCache(statsTuple);
			return false;
		}
	}

	/*
	 * If we have most-common-values info, look for a large MCV.  This is
	 * needed even if we also have a histogram, since the histogram
	 * excludes the MCVs.  However, usually the MCVs will not be the
	 * extreme values, so avoid unnecessary data copying.
	 */
	if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
						 STATISTIC_KIND_MCV, InvalidOid,
						 &values, &nvalues,
						 NULL, NULL))
	{
		bool		large_mcv = false;
		FmgrInfo	opproc;

		/* opproc implements sortop, i.e. tests "tmax < values[i]" */
		fmgr_info(get_opcode(sortop), &opproc);

		for (i = 0; i < nvalues; i++)
		{
			if (!have_max)
			{
				tmax = values[i];
				large_mcv = have_max = true;
			}
			else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
			{
				tmax = values[i];
				large_mcv = true;
			}
		}
		/*
		 * Copy at most once, after the scan, and only if an MCV won:
		 * while scanning, tmax may point into the slot array we are about
		 * to free.
		 */
		if (large_mcv)
			tmax = datumCopy(tmax, typByVal, typLen);
		free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
	}

	ReleaseSysCache(statsTuple);

	*max = tmax;
	return have_max;
}

/*
 * convert_to_scalar
 *	  Convert non-NULL values of the indicated types to the comparison
 *	  scale needed by scalarltsel()/scalargtsel().
 *	  Returns "true" if successful.
 *
 * XXX this routine is a hack: ideally we should look up the conversion
 * subroutines in pg_type.
 *
 * All numeric datatypes are simply converted to their equivalent
 * "double" values.  (NUMERIC values that are outside the range of "double"
 * are clamped to +/- HUGE_VAL.)
 *
 * String datatypes are converted by convert_string_to_scalar(),
 * which is explained below.  The reason why this routine deals with
 * three values at a time, not just one, is that we need it for strings.
 *
 * The bytea datatype is just enough different from strings that it has
 * to be treated separately.
 *
 * The several datatypes representing absolute times are all converted
 * to Timestamp, which is actually a double, and then we just use that
 * double value.  Note this will give correct results even for the "special"
 * values of Timestamp, since those are chosen to compare correctly;
 * see timestamp_cmp.
 *
 * The several datatypes representing relative times (intervals) are all
 * converted to measurements expressed in seconds.
*/static boolconvert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,				  Datum lobound, Datum hibound, Oid boundstypid,				  double *scaledlobound, double *scaledhibound){	/*	 * In present usage, we can assume that the valuetypid exactly matches	 * the declared input type of the operator we are invoked for (because	 * constant-folding will ensure that any Const passed to the operator	 * has been reduced to the correct type).  However, the boundstypid is	 * the type of some variable that might be only binary-compatible with	 * the declared type; in particular it might be a domain type.	Must	 * fold the variable type down to base type so we can recognize it.	 * (But we can skip that lookup if the variable type matches the	 * const.)	 */	if (boun

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?