analyze.c
来自「PostgreSQL7.4.6 for Linux」· C语言 代码 · 共 1,819 行 · 第 1/4 页
C
1,819 行
} else { /*---------- * Estimate the number of distinct values using the estimator * proposed by Haas and Stokes in IBM Research Report RJ 10025: * n*d / (n - f1 + f1*n/N) * where f1 is the number of distinct values that occurred * exactly once in our sample of n rows (from a total of N), * and d is the total number of distinct values in the sample. * This is their Duj1 estimator; the other estimators they * recommend are considerably more complex, and are numerically * very unstable when n is much smaller than N. * * Overwidth values are assumed to have been distinct. *---------- */ int f1 = ndistinct - nmultiple + toowide_cnt; int d = f1 + nmultiple; double numer, denom, stadistinct; numer = (double) numrows *(double) d; denom = (double) (numrows - f1) + (double) f1 *(double) numrows / totalrows; stadistinct = numer / denom; /* Clamp to sane range in case of roundoff error */ if (stadistinct < (double) d) stadistinct = (double) d; if (stadistinct > totalrows) stadistinct = totalrows; stats->stadistinct = floor(stadistinct + 0.5); } /* * If we estimated the number of distinct values at more than 10% * of the total row count (a very arbitrary limit), then assume * that stadistinct should scale with the row count rather than be * a fixed value. */ if (stats->stadistinct > 0.1 * totalrows) stats->stadistinct = -(stats->stadistinct / totalrows); /* * Decide how many values are worth storing as most-common values. * If we are able to generate a complete MCV list (all the values * in the sample will fit, and we think these are all the ones in * the table), then do so. Otherwise, store only those values * that are significantly more common than the (estimated) * average. We set the threshold rather arbitrarily at 25% more * than average, with at least 2 instances in the sample. 
Also, * we won't suppress values that have a frequency of at least 1/K * where K is the intended number of histogram bins; such values * might otherwise cause us to emit duplicate histogram bin * boundaries. */ if (track_cnt == ndistinct && toowide_cnt == 0 && stats->stadistinct > 0 && track_cnt <= num_mcv) { /* Track list includes all values seen, and all will fit */ num_mcv = track_cnt; } else { double ndistinct = stats->stadistinct; double avgcount, mincount, maxmincount; if (ndistinct < 0) ndistinct = -ndistinct * totalrows; /* estimate # of occurrences in sample of a typical value */ avgcount = (double) numrows / ndistinct; /* set minimum threshold count to store a value */ mincount = avgcount * 1.25; if (mincount < 2) mincount = 2; /* don't let threshold exceed 1/K, however */ maxmincount = (double) numrows / (double) num_bins; if (mincount > maxmincount) mincount = maxmincount; if (num_mcv > track_cnt) num_mcv = track_cnt; for (i = 0; i < num_mcv; i++) { if (track[i].count < mincount) { num_mcv = i; break; } } } /* Generate MCV slot entry */ if (num_mcv > 0) { MemoryContext old_context; Datum *mcv_values; float4 *mcv_freqs; /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(anl_context); mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum)); mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4)); for (i = 0; i < num_mcv; i++) { mcv_values[i] = datumCopy(values[track[i].first].value, stats->attr->attbyval, stats->attr->attlen); mcv_freqs[i] = (double) track[i].count / (double) numrows; } MemoryContextSwitchTo(old_context); stats->stakind[slot_idx] = STATISTIC_KIND_MCV; stats->staop[slot_idx] = stats->eqopr; stats->stanumbers[slot_idx] = mcv_freqs; stats->numnumbers[slot_idx] = num_mcv; stats->stavalues[slot_idx] = mcv_values; stats->numvalues[slot_idx] = num_mcv; slot_idx++; } /* * Generate a histogram slot entry if there are at least two * distinct values not accounted for in the MCV list. 
(This * ensures the histogram won't collapse to empty or a singleton.) */ num_hist = ndistinct - num_mcv; if (num_hist > num_bins) num_hist = num_bins + 1; if (num_hist >= 2) { MemoryContext old_context; Datum *hist_values; int nvals; /* Sort the MCV items into position order to speed next loop */ qsort((void *) track, num_mcv, sizeof(ScalarMCVItem), compare_mcvs); /* * Collapse out the MCV items from the values[] array. * * Note we destroy the values[] array here... but we don't need * it for anything more. We do, however, still need * values_cnt. nvals will be the number of remaining entries * in values[]. */ if (num_mcv > 0) { int src, dest; int j; src = dest = 0; j = 0; /* index of next interesting MCV item */ while (src < values_cnt) { int ncopy; if (j < num_mcv) { int first = track[j].first; if (src >= first) { /* advance past this MCV item */ src = first + track[j].count; j++; continue; } ncopy = first - src; } else ncopy = values_cnt - src; memmove(&values[dest], &values[src], ncopy * sizeof(ScalarItem)); src += ncopy; dest += ncopy; } nvals = dest; } else nvals = values_cnt; Assert(nvals >= num_hist); /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(anl_context); hist_values = (Datum *) palloc(num_hist * sizeof(Datum)); for (i = 0; i < num_hist; i++) { int pos; pos = (i * (nvals - 1)) / (num_hist - 1); hist_values[i] = datumCopy(values[pos].value, stats->attr->attbyval, stats->attr->attlen); } MemoryContextSwitchTo(old_context); stats->stakind[slot_idx] = STATISTIC_KIND_HISTOGRAM; stats->staop[slot_idx] = stats->ltopr; stats->stavalues[slot_idx] = hist_values; stats->numvalues[slot_idx] = num_hist; slot_idx++; } /* Generate a correlation entry if there are multiple values */ if (values_cnt > 1) { MemoryContext old_context; float4 *corrs; double corr_xsum, corr_x2sum; /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(anl_context); corrs = (float4 *) palloc(sizeof(float4)); 
MemoryContextSwitchTo(old_context); /*---------- * Since we know the x and y value sets are both * 0, 1, ..., values_cnt-1 * we have sum(x) = sum(y) = * (values_cnt-1)*values_cnt / 2 * and sum(x^2) = sum(y^2) = * (values_cnt-1)*values_cnt*(2*values_cnt-1) / 6. *---------- */ corr_xsum = ((double) (values_cnt - 1)) * ((double) values_cnt) / 2.0; corr_x2sum = ((double) (values_cnt - 1)) * ((double) values_cnt) * (double) (2 * values_cnt - 1) / 6.0; /* And the correlation coefficient reduces to */ corrs[0] = (values_cnt * corr_xysum - corr_xsum * corr_xsum) / (values_cnt * corr_x2sum - corr_xsum * corr_xsum); stats->stakind[slot_idx] = STATISTIC_KIND_CORRELATION; stats->staop[slot_idx] = stats->ltopr; stats->stanumbers[slot_idx] = corrs; stats->numnumbers[slot_idx] = 1; slot_idx++; } } /* We don't need to bother cleaning up any of our temporary palloc's */}/* * qsort comparator for sorting ScalarItems * * Aside from sorting the items, we update the datumCmpTupnoLink[] array * whenever two ScalarItems are found to contain equal datums. The array * is indexed by tupno; for each ScalarItem, it contains the highest * tupno that that item's datum has been found to be equal to. This allows * us to avoid additional comparisons in compute_scalar_stats(). */static intcompare_scalars(const void *a, const void *b){ Datum da = ((ScalarItem *) a)->value; int ta = ((ScalarItem *) a)->tupno; Datum db = ((ScalarItem *) b)->value; int tb = ((ScalarItem *) b)->tupno; int32 compare; compare = ApplySortFunction(datumCmpFn, datumCmpFnKind, da, false, db, false); if (compare != 0) return compare; /* * The two datums are equal, so update datumCmpTupnoLink[]. 
*/ if (datumCmpTupnoLink[ta] < tb) datumCmpTupnoLink[ta] = tb; if (datumCmpTupnoLink[tb] < ta) datumCmpTupnoLink[tb] = ta; /* * For equal datums, sort by tupno */ return ta - tb;}/* * qsort comparator for sorting ScalarMCVItems by position */static intcompare_mcvs(const void *a, const void *b){ int da = ((ScalarMCVItem *) a)->first; int db = ((ScalarMCVItem *) b)->first; return da - db;}/* * update_attstats() -- update attribute statistics for one relation * * Statistics are stored in several places: the pg_class row for the * relation has stats about the whole relation, and there is a * pg_statistic row for each (non-system) attribute that has ever * been analyzed. The pg_class values are updated by VACUUM, not here. * * pg_statistic rows are just added or updated normally. This means * that pg_statistic will probably contain some deleted rows at the * completion of a vacuum cycle, unless it happens to get vacuumed last. * * To keep things simple, we punt for pg_statistic, and don't try * to compute or store rows for pg_statistic itself in pg_statistic. * This could possibly be made to work, but it's not worth the trouble. * Note analyze_rel() has seen to it that we won't come here when * vacuuming pg_statistic itself. * * Note: if two backends concurrently try to analyze the same relation, * the second one is likely to fail here with a "tuple concurrently * updated" error. This is slightly annoying, but no real harm is done. * We could prevent the problem by using a stronger lock on the * relation for ANALYZE (ie, ShareUpdateExclusiveLock instead * of AccessShareLock); but that cure seems worse than the disease, * especially now that ANALYZE doesn't start a new transaction * for each relation. The lock could be held for a long time... 
*/

/*
 * Parameters:
 *	relid		 - OID stored as starelid in every row written here
 *	natts		 - number of entries in vacattrstats[]
 *	vacattrstats - per-attribute statistics; entries whose stats_valid
 *				   flag is false are skipped
 *
 * For each remaining entry, one pg_statistic row is built and then either
 * replaces the existing row for (relid, attnum) or is inserted as new.
 */
static void
update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats)
{
	Relation	sd;
	int			attno;

	sd = heap_openr(StatisticRelationName, RowExclusiveLock);

	for (attno = 0; attno < natts; attno++)
	{
		VacAttrStats *stats = vacattrstats[attno];
		HeapTuple	stup,
					oldtup;
		int			i,
					k,
					n;
		Datum		values[Natts_pg_statistic];
		char		nulls[Natts_pg_statistic];
		char		replaces[Natts_pg_statistic];

		/* Ignore attr if we weren't able to collect stats */
		if (!stats->stats_valid)
			continue;

		/*
		 * Construct a new pg_statistic tuple
		 *
		 * Start with every column marked not-null (' ') and to-be-replaced
		 * ('r'); empty slots are switched to null ('n') further down.
		 */
		for (i = 0; i < Natts_pg_statistic; ++i)
		{
			nulls[i] = ' ';
			replaces[i] = 'r';
		}

		/*
		 * Fill values[] strictly in column order; "i" is the running
		 * column index and is advanced once per column.
		 */
		i = 0;
		values[i++] = ObjectIdGetDatum(relid);	/* starelid */
		values[i++] = Int16GetDatum(stats->attnum);		/* staattnum */
		values[i++] = Float4GetDatum(stats->stanullfrac);		/* stanullfrac */
		values[i++] = Int32GetDatum(stats->stawidth);	/* stawidth */
		values[i++] = Float4GetDatum(stats->stadistinct);		/* stadistinct */
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
			values[i++] = Int16GetDatum(stats->stakind[k]);		/* stakindN */
		}
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
			values[i++] = ObjectIdGetDatum(stats->staop[k]);	/* staopN */
		}
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
			int			nnum = stats->numnumbers[k];

			if (nnum > 0)
			{
				Datum	   *numdatums = (Datum *) palloc(nnum * sizeof(Datum));
				ArrayType  *arry;

				for (n = 0; n < nnum; n++)
					numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]);
				/* XXX knows more than it should about type float4: */
				arry = construct_array(numdatums, nnum,
									   FLOAT4OID,
									   sizeof(float4), false, 'i');
				values[i++] = PointerGetDatum(arry);	/* stanumbersN */
			}
			else
			{
				/* no numbers in this slot: store a null column */
				nulls[i] = 'n';
				values[i++] = (Datum) 0;
			}
		}
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
			if (stats->numvalues[k] > 0)
			{
				ArrayType  *arry;

				arry = construct_array(stats->stavalues[k],
									   stats->numvalues[k],
									   stats->attr->atttypid,
									   stats->attrtype->typlen,
									   stats->attrtype->typbyval,
									   stats->attrtype->typalign);
				values[i++] = PointerGetDatum(arry);	/* stavaluesN */
			}
			else
			{
				/* no values in this slot: store a null column */
				nulls[i] = 'n';
				values[i++] = (Datum) 0;
			}
		}

		/* Is there already a pg_statistic tuple for this attribute? */
		oldtup = SearchSysCache(STATRELATT,
								ObjectIdGetDatum(relid),
								Int16GetDatum(stats->attnum),
								0, 0);

		if (HeapTupleIsValid(oldtup))
		{
			/* Yes, replace it */
			stup = heap_modifytuple(oldtup,
									sd,
									values,
									nulls,
									replaces);
			ReleaseSysCache(oldtup);
			simple_heap_update(sd, &stup->t_self, stup);
		}
		else
		{
			/* No, insert new tuple */
			stup = heap_formtuple(sd->rd_att, values, nulls);
			simple_heap_insert(sd, stup);
		}

		/* update indexes too */
		CatalogUpdateIndexes(sd, stup);

		heap_freetuple(stup);
	}

	heap_close(sd, RowExclusiveLock);
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?