📄 selfuncs.c
        {
            if (varinfo->ndistinct <= ndistinct)
            {
                /* Keep older item, forget new one */
                return varinfos;
            }
            else
            {
                /* Delete the older item */
                varinfos = list_delete_ptr(varinfos, varinfo);
            }
        }
    }

    varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));

    varinfo->var = var;
    varinfo->rel = vardata->rel;
    varinfo->ndistinct = ndistinct;
    varinfos = lappend(varinfos, varinfo);
    return varinfos;
}
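/*
 * For illustration (numbers are assumed, not from the original source): if
 * varinfos already holds a.x with ndistinct 1000, and b.y with ndistinct 10
 * arrives while a.x = b.y is known from an equijoin clause, the block above
 * deletes the a.x entry and keeps only b.y, since joined rows cannot carry
 * more than 10 distinct values of either Var (see step 2 of the comment for
 * estimate_num_groups below).
 */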
/*
 * estimate_num_groups - Estimate number of groups in a grouped query
 *
 * Given a query having a GROUP BY clause, estimate how many groups there
 * will be --- ie, the number of distinct combinations of the GROUP BY
 * expressions.
 *
 * This routine is also used to estimate the number of rows emitted by
 * a DISTINCT filtering step; that is an isomorphic problem.  (Note:
 * actually, we only use it for DISTINCT when there's no grouping or
 * aggregation ahead of the DISTINCT.)
 *
 * Inputs:
 *    root - the query
 *    groupExprs - list of expressions being grouped by
 *    input_rows - number of rows estimated to arrive at the group/unique
 *        filter step
 *
 * Given the lack of any cross-correlation statistics in the system, it's
 * impossible to do anything really trustworthy with GROUP BY conditions
 * involving multiple Vars.  We should however avoid assuming the worst
 * case (all possible cross-product terms actually appear as groups) since
 * very often the grouped-by Vars are highly correlated.  Our current
 * approach is as follows:
 *    1.  Reduce the given expressions to a list of unique Vars used.  For
 *        example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
 *        It is clearly correct not to count the same Var more than once.
 *        It is also reasonable to treat f(x) the same as x: f() cannot
 *        increase the number of distinct values (unless it is volatile,
 *        which we consider unlikely for grouping), but it probably won't
 *        reduce the number of distinct values much either.
 *        As a special case, if a GROUP BY expression can be matched to an
 *        expressional index for which we have statistics, then we treat the
 *        whole expression as though it were just a Var.
 *    2.  If the list contains Vars of different relations that are known
 *        equal due to equijoin clauses, then drop all but one of the Vars
 *        from each known-equal set, keeping the one with smallest estimated
 *        # of values (since the extra values of the others can't appear in
 *        joined rows).  Note the reason we only consider Vars of different
 *        relations is that if we considered ones of the same rel, we'd be
 *        double-counting the restriction selectivity of the equality in the
 *        next step.
 *    3.  For Vars within a single source rel, we multiply together the
 *        numbers of values, clamp to the number of rows in the rel (divided
 *        by 10 if more than one Var), and then multiply by the selectivity
 *        of the restriction clauses for that rel.  When there's more than
 *        one Var, the initial product is probably too high (it's the worst
 *        case) but clamping to a fraction of the rel's rows seems to be a
 *        helpful heuristic for not letting the estimate get out of hand.
 *        (The factor of 10 is derived from pre-Postgres-7.4 practice.)
 *        Multiplying by the restriction selectivity is effectively assuming
 *        that the restriction clauses are independent of the grouping,
 *        which is a crummy assumption, but it's hard to do better.
 *    4.  If there are Vars from multiple rels, we repeat step 3 for each
 *        such rel, and multiply the results together.
 * Note that rels not containing grouped Vars are ignored completely, as are
 * join clauses other than the equijoin clauses used in step 2.  Such rels
 * cannot increase the number of groups, and we assume such clauses do not
 * reduce the number either (somewhat bogus, but we don't have the info to
 * do better).
 */
double
estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
{
    List       *varinfos = NIL;
    double      numdistinct;
    ListCell   *l;

    /* We should not be called unless query has GROUP BY (or DISTINCT) */
    Assert(groupExprs != NIL);

    /*
     * Steps 1/2: find the unique Vars used, treating an expression as a Var
     * if we can find stats for it.  For each one, record the statistical
     * estimate of number of distinct values (total in its table, without
     * regard for filtering).
     */
    foreach(l, groupExprs)
    {
        Node       *groupexpr = (Node *) lfirst(l);
        VariableStatData vardata;
        List       *varshere;
        ListCell   *l2;

        /*
         * If examine_variable is able to deduce anything about the GROUP BY
         * expression, treat it as a single variable even if it's really more
         * complicated.
         */
        examine_variable(root, groupexpr, 0, &vardata);
        if (vardata.statsTuple != NULL || vardata.isunique)
        {
            varinfos = add_unique_group_var(root, varinfos,
                                            groupexpr, &vardata);
            ReleaseVariableStats(vardata);
            continue;
        }
        ReleaseVariableStats(vardata);

        /*
         * Else pull out the component Vars
         */
        varshere = pull_var_clause(groupexpr, false);

        /*
         * If we find any variable-free GROUP BY item, then either it is a
         * constant (and we can ignore it) or it contains a volatile
         * function; in the latter case we punt and assume that each input
         * row will yield a distinct group.
         */
        if (varshere == NIL)
        {
            if (contain_volatile_functions(groupexpr))
                return input_rows;
            continue;
        }

        /*
         * Else add variables to varinfos list
         */
        foreach(l2, varshere)
        {
            Node       *var = (Node *) lfirst(l2);

            examine_variable(root, var, 0, &vardata);
            varinfos = add_unique_group_var(root, varinfos, var, &vardata);
            ReleaseVariableStats(vardata);
        }
    }

    /* If now no Vars, we must have an all-constant GROUP BY list. */
    if (varinfos == NIL)
        return 1.0;

    /*
     * Steps 3/4: group Vars by relation and estimate total numdistinct.
     *
     * For each iteration of the outer loop, we process the frontmost Var in
     * varinfos, plus all other Vars in the same relation.  We remove these
     * Vars from the newvarinfos list for the next iteration.  This is the
     * easiest way to group Vars of same rel together.
     */
    numdistinct = 1.0;

    do
    {
        GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
        RelOptInfo *rel = varinfo1->rel;
        double      reldistinct = varinfo1->ndistinct;
        double      relmaxndistinct = reldistinct;
        int         relvarcount = 1;
        List       *newvarinfos = NIL;

        /*
         * Get the product of numdistinct estimates of the Vars for this rel.
         * Also, construct new varinfos list of remaining Vars.
         */
        for_each_cell(l, lnext(list_head(varinfos)))
        {
            GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);

            if (varinfo2->rel == varinfo1->rel)
            {
                reldistinct *= varinfo2->ndistinct;
                if (relmaxndistinct < varinfo2->ndistinct)
                    relmaxndistinct = varinfo2->ndistinct;
                relvarcount++;
            }
            else
            {
                /* not time to process varinfo2 yet */
                newvarinfos = lcons(varinfo2, newvarinfos);
            }
        }

        /*
         * Sanity check --- don't divide by zero if empty relation.
         */
        Assert(rel->reloptkind == RELOPT_BASEREL);
        if (rel->tuples > 0)
        {
            /*
             * Clamp to size of rel, or size of rel / 10 if multiple Vars.
             * The fudge factor is because the Vars are probably correlated
             * but we don't know by how much.  We should never clamp to less
             * than the largest ndistinct value for any of the Vars, though,
             * since there will surely be at least that many groups.
             */
            double      clamp = rel->tuples;

            if (relvarcount > 1)
            {
                clamp *= 0.1;
                if (clamp < relmaxndistinct)
                {
                    clamp = relmaxndistinct;
                    /* for sanity in case some ndistinct is too large: */
                    if (clamp > rel->tuples)
                        clamp = rel->tuples;
                }
            }
            if (reldistinct > clamp)
                reldistinct = clamp;

            /*
             * Multiply by restriction selectivity.
             */
            reldistinct *= rel->rows / rel->tuples;

            /*
             * Update estimate of total distinct groups.
             */
            numdistinct *= reldistinct;
        }

        varinfos = newvarinfos;
    } while (varinfos != NIL);

    numdistinct = ceil(numdistinct);

    /* Guard against out-of-range answers */
    if (numdistinct > input_rows)
        numdistinct = input_rows;
    if (numdistinct < 1.0)
        numdistinct = 1.0;

    return numdistinct;
}
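/*
 * Worked example of steps 3/4 above, with assumed numbers: grouping by two
 * Vars of one rel whose ndistinct estimates are 200 and 500, where the rel
 * has 100000 tuples of which 1000 survive its restriction clauses.  The raw
 * product is 200 * 500 = 100000; since relvarcount > 1 the clamp is
 * 100000 * 0.1 = 10000 (already >= relmaxndistinct of 500), so reldistinct
 * becomes 10000; multiplying by the restriction selectivity 1000/100000
 * yields 100 estimated groups, finally clamped to the range [1, input_rows].
 */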
/*
 * Estimate hash bucketsize fraction (ie, number of entries in a bucket
 * divided by total tuples in relation) if the specified expression is used
 * as a hash key.
 *
 * XXX This is really pretty bogus since we're effectively assuming that the
 * distribution of hash keys will be the same after applying restriction
 * clauses as it was in the underlying relation.  However, we are not nearly
 * smart enough to figure out how the restrict clauses might change the
 * distribution, so this will have to do for now.
 *
 * We are passed the number of buckets the executor will use for the given
 * input relation.  If the data were perfectly distributed, with the same
 * number of tuples going into each available bucket, then the bucketsize
 * fraction would be 1/nbuckets.  But this happy state of affairs will occur
 * only if (a) there are at least nbuckets distinct data values, and (b)
 * we have a not-too-skewed data distribution.  Otherwise the buckets will
 * be nonuniformly occupied.  If the other relation in the join has a key
 * distribution similar to this one's, then the most-loaded buckets are
 * exactly those that will be probed most often.  Therefore, the "average"
 * bucket size for costing purposes should really be taken as something close
 * to the "worst case" bucket size.  We try to estimate this by adjusting the
 * fraction if there are too few distinct data values, and then scaling up
 * by the ratio of the most common value's frequency to the average frequency.
 *
 * If no statistics are available, use a default estimate of 0.1.  This will
 * discourage use of a hash rather strongly if the inner relation is large,
 * which is what we want.  We do not want to hash unless we know that the
 * inner rel is well-dispersed (or the alternatives seem much worse).
 */
Selectivity
estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
{
    VariableStatData vardata;
    double      estfract,
                ndistinct,
                stanullfrac,
                mcvfreq,
                avgfreq;
    float4     *numbers;
    int         nnumbers;

    examine_variable(root, hashkey, 0, &vardata);

    /* Get number of distinct values and fraction that are null */
    ndistinct = get_variable_numdistinct(&vardata);

    if (HeapTupleIsValid(vardata.statsTuple))
    {
        Form_pg_statistic stats;

        stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
        stanullfrac = stats->stanullfrac;
    }
    else
    {
        /*
         * Believe a default ndistinct only if it came from stats. Otherwise
         * punt and return 0.1, per comments above.
         */
        if (ndistinct == DEFAULT_NUM_DISTINCT)
        {
            ReleaseVariableStats(vardata);
            return (Selectivity) 0.1;
        }

        stanullfrac = 0.0;
    }

    /* Compute avg freq of all distinct data values in raw relation */
    avgfreq = (1.0 - stanullfrac) / ndistinct;

    /*
     * Adjust ndistinct to account for restriction clauses.  Observe we are
     * assuming that the data distribution is affected uniformly by the
     * restriction clauses!
     *
     * XXX Possibly better way, but much more expensive: multiply by
     * selectivity of rel's restriction clauses that mention the target Var.
     */
    if (vardata.rel)
        ndistinct *= vardata.rel->rows / vardata.rel->tuples;

    /*
     * Initial estimate of bucketsize fraction is 1/nbuckets as long as the
     * number of buckets is less than the expected number of distinct values;
     * otherwise it is 1/ndistinct.
     */
    if (ndistinct > nbuckets)
        estfract = 1.0 / nbuckets;
    else
        estfract = 1.0 / ndistinct;

    /*
     * Look up the frequency of the most common value, if available.
     */
    mcvfreq = 0.0;

    if (HeapTupleIsValid(vardata.statsTuple))
    {
        if (get_attstatsslot(vardata.statsTuple,
                             vardata.atttype, vardata.atttypmod,
                             STATISTIC_KIND_MCV, InvalidOid,
                             NULL, NULL,
                             &numbers, &nnumbers))
        {
            /*
             * The first MCV stat is for the most common value.
             */
            if (nnumbers > 0)
                mcvfreq = numbers[0];
            free_attstatsslot(vardata.atttype, NULL, 0,
                              numbers, nnumbers);
        }
    }

    /*
     * Adjust estimated bucketsize upward to account for skewed distribution.
     */
    if (avgfreq > 0.0 && mcvfreq > avgfreq)
        estfract *= mcvfreq / avgfreq;

    /*
     * Clamp bucketsize to sane range (the above adjustment could easily
     * produce an out-of-range result).  We set the lower bound a little
     * above zero, since zero isn't a very sane result.
     */
    if (estfract < 1.0e-6)
        estfract = 1.0e-6;
    else if (estfract > 1.0)
        estfract = 1.0;

    ReleaseVariableStats(vardata);

    return (Selectivity) estfract;
}
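/*
 * Worked example, with assumed numbers: ndistinct 2000 and stanullfrac 0
 * give avgfreq = 1/2000 = 0.0005.  With no restriction filtering and
 * nbuckets = 1024, ndistinct > nbuckets gives estfract = 1/1024 ~= 0.00098.
 * If the most common value has frequency 0.02, the skew adjustment
 * multiplies by 0.02/0.0005 = 40, yielding ~0.039: the "average" bucket is
 * costed as holding about 3.9% of the inner rows rather than the ideal
 * 1/1024.
 */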
/*-------------------------------------------------------------------------
 *
 * Support routines
 *
 *-------------------------------------------------------------------------
 */

/*
 * convert_to_scalar
 *    Convert non-NULL values of the indicated types to the comparison
 *    scale needed by scalarltsel()/scalargtsel().
 *    Returns "true" if successful.
 *
 * XXX this routine is a hack: ideally we should look up the conversion
 * subroutines in pg_type.
 *
 * All numeric datatypes are simply converted to their equivalent
 * "double" values.  (NUMERIC values that are outside the range of "double"
 * are clamped to +/- HUGE_VAL.)
 *
 * String datatypes are converted by convert_string_to_scalar(),
 * which is explained below.  The reason why this routine deals with
 * three values at a time, not just one, is that we need it for strings.
 *
 * The bytea datatype is just enough different from strings that it has
 * to be treated separately.
 *
 * The several datatypes representing absolute times are all converted
 * to Timestamp, which is actually a double, and then we just use that
 * double value.  Note this will give correct results even for the "special"
 * values of Timestamp, since those are chosen to compare correctly;
 * see timestamp_cmp.
 *
 * The several datatypes representing relative times (intervals) are all
 * converted to measurements expressed in seconds.
 */
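/*
 * For illustration (values assumed): given an int4 histogram bin bounded by
 * 10 and 20 and a probe value of 17, the three outputs are 10.0, 20.0 and
 * 17.0, letting the inequality-selectivity code treat
 * (17.0 - 10.0) / (20.0 - 10.0) = 0.7 of that bin as lying below the value.
 */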
static bool
convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
                  Datum lobound, Datum hibound, Oid boundstypid,
                  double *scaledlobound, double *scaledhibound)
{
    /*
     * Both the valuetypid and the boundstypid should exactly match the
     * declared input type(s) of the operator we are invoked for, so we just
     * error out if either is not recognized.
     *
     * XXX The histogram we are interpolating between points of could belong
     * to a column that's only binary-compatible with the declared type. In
     * essence we are assuming that the semantics of binary-compatible types
     * are enough alike that we can use a histogram generated with one type's
     * operators to estimate selectivity for the other's.  This is outright
     * wrong in some cases --- in particular signed versus unsigned
     * interpretation could trip us up.  But it's useful enough in the
     * majority of cases that we do it anyway.  Should think about more
     * rigorous ways to do it.
     */
    switch (valuetypid)
    {
            /*
             * Built-in numeric types
             */
        case BOOLOID:
        case INT2OID:
        case INT4OID:
        case INT8OID:
        case FLOAT4OID:
        case FLOAT8OID:
        case NUMERICOID:
        case OIDOID:
        case REGPROCOID:
        case REGPROCEDUREOID:
        case REGOPEROID:
        case REGOPERATOROID:
        case REGCLASSOID:
        case REGTYPEOID:
            *scaledvalue = convert_numeric_to_scalar(value, valuetypid);