selfuncs.c
来自「PostgreSQL7.4.6 for Linux」· C语言 代码 · 共 2,384 行 · 第 1/5 页
C
2,384 行
if (!op_mergejoinable(opno, &lsortop, &rsortop)) return; /* shouldn't happen */ /* Try to get maximum values of both vars */ if (!get_var_maximum(root, left, lsortop, &leftmax)) return; /* no max available from stats */ if (!get_var_maximum(root, right, rsortop, &rightmax)) return; /* no max available from stats */ /* Look up the "left < right" and "left > right" operators */ op_mergejoin_crossops(opno, <op, >op, NULL, NULL); /* Look up the "left <= right" operator */ leop = get_negator(gtop); if (!OidIsValid(leop)) return; /* insufficient info in catalogs */ /* Look up the "right > left" operator */ revgtop = get_commutator(ltop); if (!OidIsValid(revgtop)) return; /* insufficient info in catalogs */ /* Look up the "right <= left" operator */ revleop = get_negator(revgtop); if (!OidIsValid(revleop)) return; /* insufficient info in catalogs */ /* * Now, the fraction of the left variable that will be scanned is the * fraction that's <= the right-side maximum value. But only believe * non-default estimates, else stick with our 1.0. */ selec = scalarineqsel(root, leop, false, left, rightmax, righttype); if (selec != DEFAULT_INEQ_SEL) *leftscan = selec; /* And similarly for the right variable. */ selec = scalarineqsel(root, revleop, false, right, leftmax, lefttype); if (selec != DEFAULT_INEQ_SEL) *rightscan = selec; /* * Only one of the two fractions can really be less than 1.0; believe * the smaller estimate and reset the other one to exactly 1.0. If we * get exactly equal estimates (as can easily happen with self-joins), * believe neither. */ if (*leftscan > *rightscan) *leftscan = 1.0; else if (*leftscan < *rightscan) *rightscan = 1.0; else *leftscan = *rightscan = 1.0;}/* * estimate_num_groups - Estimate number of groups in a grouped query * * Given a query having a GROUP BY clause, estimate how many groups there * will be --- ie, the number of distinct combinations of the GROUP BY * expressions. 
 *
 * This routine is also used to estimate the number of rows emitted by
 * a DISTINCT filtering step; that is an isomorphic problem.  (Note:
 * actually, we only use it for DISTINCT when there's no grouping or
 * aggregation ahead of the DISTINCT.)
 *
 * Inputs:
 *	root - the query
 *	groupExprs - list of expressions being grouped by
 *	input_rows - number of rows estimated to arrive at the group/unique
 *		filter step
 *
 * Given the lack of any cross-correlation statistics in the system, it's
 * impossible to do anything really trustworthy with GROUP BY conditions
 * involving multiple Vars.  We should however avoid assuming the worst
 * case (all possible cross-product terms actually appear as groups) since
 * very often the grouped-by Vars are highly correlated.  Our current approach
 * is as follows:
 *	1.	Reduce the given expressions to a list of unique Vars used.  For
 *		example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
 *		It is clearly correct not to count the same Var more than once.
 *		It is also reasonable to treat f(x) the same as x: f() cannot
 *		increase the number of distinct values (unless it is volatile,
 *		which we consider unlikely for grouping), but it probably won't
 *		reduce the number of distinct values much either.
 *	2.	If the list contains Vars of different relations that are known equal
 *		due to equijoin clauses, then drop all but one of the Vars from each
 *		known-equal set, keeping the one with smallest estimated # of values
 *		(since the extra values of the others can't appear in joined rows).
 *		Note the reason we only consider Vars of different relations is that
 *		if we considered ones of the same rel, we'd be double-counting the
 *		restriction selectivity of the equality in the next step.
 *	3.	For Vars within a single source rel, we multiply together the numbers
 *		of values, clamp to the number of rows in the rel, and then multiply
 *		by the selectivity of the restriction clauses for that rel.  The
 *		initial product is probably too high (it's the worst case) but since
 *		we can clamp to the rel's rows it won't be hugely bad.  Multiplying
 *		by the restriction selectivity is effectively assuming that the
 *		restriction clauses are independent of the grouping, which is a crummy
 *		assumption, but it's hard to do better.
 *	4.	If there are Vars from multiple rels, we repeat step 3 for each such
 *		rel, and multiply the results together.
 * Note that rels not containing grouped Vars are ignored completely, as are
 * join clauses other than the equijoin clauses used in step 2.  Such rels
 * cannot increase the number of groups, and we assume such clauses do not
 * reduce the number either (somewhat bogus, but we don't have the info to
 * do better).
 */
double
estimate_num_groups(Query *root, List *groupExprs, double input_rows)
{
	List	   *allvars = NIL;
	List	   *varinfos = NIL;
	double		numdistinct;
	List	   *l;

	typedef struct
	{							/* varinfos is a List of these */
		Var		   *var;
		double		ndistinct;
	} MyVarInfo;

	/* We should not be called unless query has GROUP BY (or DISTINCT) */
	Assert(groupExprs != NIL);

	/* Step 1: get the unique Vars used */
	foreach(l, groupExprs)
	{
		Node	   *groupexpr = (Node *) lfirst(l);
		List	   *varshere;

		varshere = pull_var_clause(groupexpr, false);

		/*
		 * If we find any variable-free GROUP BY item, then either it is a
		 * constant (and we can ignore it) or it contains a volatile
		 * function; in the latter case we punt and assume that each input
		 * row will yield a distinct group.
		 */
		if (varshere == NIL)
		{
			if (contain_volatile_functions(groupexpr))
				return input_rows;
			continue;
		}
		allvars = nconc(allvars, varshere);
	}

	/* If now no Vars, we must have an all-constant GROUP BY list. */
	if (allvars == NIL)
		return 1.0;

	/* Use set_union() to discard duplicates */
	allvars = set_union(NIL, allvars);

	/*
	 * Step 2: acquire statistical estimate of number of distinct values
	 * of each Var (total in its table, without regard for filtering).
	 * Also, detect known-equal Vars and discard the ones we don't want.
	 */
	foreach(l, allvars)
	{
		Var		   *var = (Var *) lfirst(l);
		Oid			relid = getrelid(var->varno, root->rtable);
		HeapTuple	statsTuple = NULL;
		Form_pg_statistic stats = NULL;
		double		ndistinct;
		bool		keep = true;
		List	   *l2;

		/* Fetch the pg_statistic row for this column, if any */
		if (OidIsValid(relid))
		{
			statsTuple = SearchSysCache(STATRELATT,
										ObjectIdGetDatum(relid),
										Int16GetDatum(var->varattno),
										0, 0);
			if (HeapTupleIsValid(statsTuple))
				stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
		}
		ndistinct = get_att_numdistinct(root, var, stats);
		if (HeapTupleIsValid(statsTuple))
			ReleaseSysCache(statsTuple);

		/* cannot use foreach here because of possible lremove */
		l2 = varinfos;
		while (l2)
		{
			MyVarInfo  *varinfo = (MyVarInfo *) lfirst(l2);

			/* must advance l2 before lremove possibly pfree's it */
			l2 = lnext(l2);

			if (var->varno != varinfo->var->varno &&
				exprs_known_equal(root, (Node *) var,
								  (Node *) varinfo->var))
			{
				/* Found a match */
				if (varinfo->ndistinct <= ndistinct)
				{
					/* Keep older item, forget new one */
					keep = false;
					break;
				}
				else
				{
					/* Delete the older item */
					varinfos = lremove(varinfo, varinfos);
				}
			}
		}

		if (keep)
		{
			MyVarInfo  *varinfo = (MyVarInfo *) palloc(sizeof(MyVarInfo));

			varinfo->var = var;
			varinfo->ndistinct = ndistinct;
			varinfos = lcons(varinfo, varinfos);
		}
	}

	/*
	 * Steps 3/4: group Vars by relation and estimate total numdistinct.
	 *
	 * For each iteration of the outer loop, we process the frontmost Var
	 * in varinfos, plus all other Vars in the same relation.  We remove
	 * these Vars from the newvarinfos list for the next iteration.  This
	 * is the easiest way to group Vars of same rel together.
	 */
	Assert(varinfos != NIL);
	numdistinct = 1.0;

	do
	{
		MyVarInfo  *varinfo1 = (MyVarInfo *) lfirst(varinfos);
		RelOptInfo *rel = find_base_rel(root, varinfo1->var->varno);
		double		reldistinct = varinfo1->ndistinct;
		List	   *newvarinfos = NIL;

		/*
		 * Get the product of numdistinct estimates of the Vars for this
		 * rel.  Also, construct new varinfos list of remaining Vars.
		 */
		foreach(l, lnext(varinfos))
		{
			MyVarInfo  *varinfo2 = (MyVarInfo *) lfirst(l);

			if (varinfo2->var->varno == varinfo1->var->varno)
				reldistinct *= varinfo2->ndistinct;
			else
			{
				/* not time to process varinfo2 yet */
				newvarinfos = lcons(varinfo2, newvarinfos);
			}
		}

		/*
		 * Sanity check --- don't divide by zero if empty relation.
		 */
		Assert(rel->reloptkind == RELOPT_BASEREL);
		if (rel->tuples > 0)
		{
			/*
			 * Clamp to size of rel, multiply by restriction selectivity.
			 */
			if (reldistinct > rel->tuples)
				reldistinct = rel->tuples;
			reldistinct *= rel->rows / rel->tuples;

			/*
			 * Update estimate of total distinct groups.
			 */
			numdistinct *= reldistinct;
		}

		varinfos = newvarinfos;
	} while (varinfos != NIL);

	numdistinct = ceil(numdistinct);

	/* Guard against out-of-range answers */
	if (numdistinct > input_rows)
		numdistinct = input_rows;
	if (numdistinct < 1.0)
		numdistinct = 1.0;

	return numdistinct;
}

/*-------------------------------------------------------------------------
 *
 * Support routines
 *
 *-------------------------------------------------------------------------
 */

/*
 * get_var_maximum
 *		Estimate the maximum value of the specified variable.
 *		If successful, store value in *max and return TRUE.
 *		If no data available, return FALSE.
 *
 * sortop is the "<" comparison operator to use.  (To extract the
 * minimum instead of the maximum, just pass the ">" operator instead.)
 */
static bool
get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max)
{
	Datum		tmax = 0;
	bool		have_max = false;
	Oid			relid;
	HeapTuple	statsTuple;
	Form_pg_statistic stats;
	int16		typLen;
	bool		typByVal;
	Datum	   *values;
	int			nvalues;
	int			i;

	relid = getrelid(var->varno, root->rtable);
	if (relid == InvalidOid)
		return false;

	/* get stats for the attribute */
	statsTuple = SearchSysCache(STATRELATT,
								ObjectIdGetDatum(relid),
								Int16GetDatum(var->varattno),
								0, 0);
	if (!HeapTupleIsValid(statsTuple))
	{
		/* no stats available, so default result */
		return false;
	}
	stats = (Form_pg_statistic) GETSTRUCT(statsTuple);

	get_typlenbyval(var->vartype, &typLen, &typByVal);

	/*
	 * If there is a histogram, grab the last or first value as
	 * appropriate.
	 *
	 * If there is a histogram that is sorted with some other operator
	 * than the one we want, fail --- this suggests that there is data we
	 * can't use.
	 */
	if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
						 STATISTIC_KIND_HISTOGRAM, sortop,
						 &values, &nvalues,
						 NULL, NULL))
	{
		/* histogram sorted by sortop: last entry is the largest */
		if (nvalues > 0)
		{
			tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
			have_max = true;
		}
		free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
	}
	else
	{
		Oid			rsortop = get_commutator(sortop);

		if (OidIsValid(rsortop) &&
			get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
							 STATISTIC_KIND_HISTOGRAM, rsortop,
							 &values, &nvalues,
							 NULL, NULL))
		{
			/* histogram sorted the other way: first entry is largest */
			if (nvalues > 0)
			{
				tmax = datumCopy(values[0], typByVal, typLen);
				have_max = true;
			}
			free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
		}
		else if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
								  STATISTIC_KIND_HISTOGRAM, InvalidOid,
								  &values, &nvalues,
								  NULL, NULL))
		{
			/* histogram exists but under some other sort order: give up */
			free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
			ReleaseSysCache(statsTuple);
			return false;
		}
	}

	/*
	 * If we have most-common-values info, look for a large MCV.  This is
	 * needed even if we also have a histogram, since the histogram
	 * excludes the MCVs.  However, usually the MCVs will not be the
	 * extreme values, so avoid unnecessary data copying.
	 */
	if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
						 STATISTIC_KIND_MCV, InvalidOid,
						 &values, &nvalues,
						 NULL, NULL))
	{
		bool		large_mcv = false;
		FmgrInfo	opproc;

		fmgr_info(get_opcode(sortop), &opproc);

		for (i = 0; i < nvalues; i++)
		{
			if (!have_max)
			{
				tmax = values[i];
				large_mcv = have_max = true;
			}
			/* sortop is "<", so this replaces tmax when values[i] is larger */
			else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
			{
				tmax = values[i];
				large_mcv = true;
			}
		}
		/* only copy out of the stats array if an MCV actually won */
		if (large_mcv)
			tmax = datumCopy(tmax, typByVal, typLen);
		free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
	}

	ReleaseSysCache(statsTuple);

	*max = tmax;
	return have_max;
}

/*
 * convert_to_scalar
 *	  Convert non-NULL values of the indicated types to the comparison
 *	  scale needed by scalarltsel()/scalargtsel().
 *	  Returns "true" if successful.
 *
 * XXX this routine is a hack: ideally we should look up the conversion
 * subroutines in pg_type.
 *
 * All numeric datatypes are simply converted to their equivalent
 * "double" values.  (NUMERIC values that are outside the range of "double"
 * are clamped to +/- HUGE_VAL.)
 *
 * String datatypes are converted by convert_string_to_scalar(),
 * which is explained below.  The reason why this routine deals with
 * three values at a time, not just one, is that we need it for strings.
 *
 * The bytea datatype is just enough different from strings that it has
 * to be treated separately.
 *
 * The several datatypes representing absolute times are all converted
 * to Timestamp, which is actually a double, and then we just use that
 * double value.  Note this will give correct results even for the "special"
 * values of Timestamp, since those are chosen to compare correctly;
 * see timestamp_cmp.
 *
 * The several datatypes representing relative times (intervals) are all
 * converted to measurements expressed in seconds.
*/static boolconvert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue, Datum lobound, Datum hibound, Oid boundstypid, double *scaledlobound, double *scaledhibound){ /* * In present usage, we can assume that the valuetypid exactly matches * the declared input type of the operator we are invoked for (because * constant-folding will ensure that any Const passed to the operator * has been reduced to the correct type). However, the boundstypid is * the type of some variable that might be only binary-compatible with * the declared type; in particular it might be a domain type. Must * fold the variable type down to base type so we can recognize it. * (But we can skip that lookup if the variable type matches the * const.) */ if (boun
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?