analyze.c
来自「PostgreSQL 8.1.4的源码 适用于Linux下的开源数据库系统」· C语言 代码 · 共 2,242 行 · 第 1/5 页
C
2,242 行
eqfunc = oprfuncid(func_operator); ReleaseSysCache(func_operator); } if (!OidIsValid(eqfunc)) return false; /* Is there a "<" operator with suitable semantics? */ func_operator = ordering_oper(attr->atttypid, true); if (func_operator != NULL) { ltopr = oprid(func_operator); ReleaseSysCache(func_operator); } /* Save the operator info for compute_stats routines */ mystats = (StdAnalyzeData *) palloc(sizeof(StdAnalyzeData)); mystats->eqopr = eqopr; mystats->eqfunc = eqfunc; mystats->ltopr = ltopr; stats->extra_data = mystats; /* * Determine which standard statistics algorithm to use */ if (OidIsValid(ltopr)) { /* Seems to be a scalar datatype */ stats->compute_stats = compute_scalar_stats; /*-------------------- * The following choice of minrows is based on the paper * "Random sampling for histogram construction: how much is enough?" * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in * Proceedings of ACM SIGMOD International Conference on Management * of Data, 1998, Pages 436-447. Their Corollary 1 to Theorem 5 * says that for table size n, histogram size k, maximum relative * error in bin size f, and error probability gamma, the minimum * random sample size is * r = 4 * k * ln(2*n/gamma) / f^2 * Taking f = 0.5, gamma = 0.01, n = 1 million rows, we obtain * r = 305.82 * k * Note that because of the log function, the dependence on n is * quite weak; even at n = 1 billion, a 300*k sample gives <= 0.59 * bin size error with probability 0.99. So there's no real need to * scale for n, which is a good thing because we don't necessarily * know it at this point. *-------------------- */ stats->minrows = 300 * attr->attstattarget; } else { /* Can't do much but the minimal stuff */ stats->compute_stats = compute_minimal_stats; /* Might as well use the same minrows as above */ stats->minrows = 300 * attr->attstattarget; } return true;}/* * compute_minimal_stats() -- compute minimal column statistics * * We use this when we can find only an "=" operator for the datatype. * * We determine the fraction of non-null rows, the average width, the * most common values, and the (estimated) number of distinct values. * * The most common values are determined by brute force: we keep a list * of previously seen values, ordered by number of times seen, as we scan * the samples. A newly seen value is inserted just after the last * multiply-seen value, causing the bottommost (oldest) singly-seen value * to drop off the list. The accuracy of this method, and also its cost, * depend mainly on the length of the list we are willing to keep. */static voidcompute_minimal_stats(VacAttrStatsP stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows){ int i; int null_cnt = 0; int nonnull_cnt = 0; int toowide_cnt = 0; double total_width = 0; bool is_varlena = (!stats->attr->attbyval && stats->attr->attlen == -1); bool is_varwidth = (!stats->attr->attbyval && stats->attr->attlen < 0); FmgrInfo f_cmpeq; typedef struct { Datum value; int count; } TrackItem; TrackItem *track; int track_cnt, track_max; int num_mcv = stats->attr->attstattarget; StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data; /* * We track up to 2*n values for an n-element MCV list; but at least 10 */ track_max = 2 * num_mcv; if (track_max < 10) track_max = 10; track = (TrackItem *) palloc(track_max * sizeof(TrackItem)); track_cnt = 0; fmgr_info(mystats->eqfunc, &f_cmpeq); for (i = 0; i < samplerows; i++) { Datum value; bool isnull; bool match; int firstcount1, j; vacuum_delay_point(); value = fetchfunc(stats, i, &isnull); /* Check for null/nonnull */ if (isnull) { null_cnt++; continue; } nonnull_cnt++; /* * If it's a variable-width field, add up widths for average width * calculation. Note that if the value is toasted, we use the toasted * width. We don't bother with this calculation if it's a fixed-width * type. */ if (is_varlena) { total_width += VARSIZE(DatumGetPointer(value)); /* * If the value is toasted, we want to detoast it just once to * avoid repeated detoastings and resultant excess memory usage * during the comparisons. Also, check to see if the value is * excessively wide, and if so don't detoast at all --- just * ignore the value. */ if (toast_raw_datum_size(value) > WIDTH_THRESHOLD) { toowide_cnt++; continue; } value = PointerGetDatum(PG_DETOAST_DATUM(value)); } else if (is_varwidth) { /* must be cstring */ total_width += strlen(DatumGetCString(value)) + 1; } /* * See if the value matches anything we're already tracking. */ match = false; firstcount1 = track_cnt; for (j = 0; j < track_cnt; j++) { if (DatumGetBool(FunctionCall2(&f_cmpeq, value, track[j].value))) { match = true; break; } if (j < firstcount1 && track[j].count == 1) firstcount1 = j; } if (match) { /* Found a match */ track[j].count++; /* This value may now need to "bubble up" in the track list */ while (j > 0 && track[j].count > track[j - 1].count) { swapDatum(track[j].value, track[j - 1].value); swapInt(track[j].count, track[j - 1].count); j--; } } else { /* No match. Insert at head of count-1 list */ if (track_cnt < track_max) track_cnt++; for (j = track_cnt - 1; j > firstcount1; j--) { track[j].value = track[j - 1].value; track[j].count = track[j - 1].count; } if (firstcount1 < track_cnt) { track[firstcount1].value = value; track[firstcount1].count = 1; } } } /* We can only compute real stats if we found some non-null values. */ if (nonnull_cnt > 0) { int nmultiple, summultiple; stats->stats_valid = true; /* Do the simple null-frac and width stats */ stats->stanullfrac = (double) null_cnt / (double) samplerows; if (is_varwidth) stats->stawidth = total_width / (double) nonnull_cnt; else stats->stawidth = stats->attrtype->typlen; /* Count the number of values we found multiple times */ summultiple = 0; for (nmultiple = 0; nmultiple < track_cnt; nmultiple++) { if (track[nmultiple].count == 1) break; summultiple += track[nmultiple].count; } if (nmultiple == 0) { /* If we found no repeated values, assume it's a unique column */ stats->stadistinct = -1.0; } else if (track_cnt < track_max && toowide_cnt == 0 && nmultiple == track_cnt) { /* * Our track list includes every value in the sample, and every * value appeared more than once. Assume the column has just * these values. */ stats->stadistinct = track_cnt; } else { /*---------- * Estimate the number of distinct values using the estimator * proposed by Haas and Stokes in IBM Research Report RJ 10025: * n*d / (n - f1 + f1*n/N) * where f1 is the number of distinct values that occurred * exactly once in our sample of n rows (from a total of N), * and d is the total number of distinct values in the sample. * This is their Duj1 estimator; the other estimators they * recommend are considerably more complex, and are numerically * very unstable when n is much smaller than N. * * We assume (not very reliably!) that all the multiply-occurring * values are reflected in the final track[] list, and the other * nonnull values all appeared but once. (XXX this usually * results in a drastic overestimate of ndistinct. Can we do * any better?) *---------- */ int f1 = nonnull_cnt - summultiple; int d = f1 + nmultiple; double numer, denom, stadistinct; numer = (double) samplerows *(double) d; denom = (double) (samplerows - f1) + (double) f1 *(double) samplerows / totalrows; stadistinct = numer / denom; /* Clamp to sane range in case of roundoff error */ if (stadistinct < (double) d) stadistinct = (double) d; if (stadistinct > totalrows) stadistinct = totalrows; stats->stadistinct = floor(stadistinct + 0.5); } /* * If we estimated the number of distinct values at more than 10% of * the total row count (a very arbitrary limit), then assume that * stadistinct should scale with the row count rather than be a fixed * value. */ if (stats->stadistinct > 0.1 * totalrows) stats->stadistinct = -(stats->stadistinct / totalrows); /* * Decide how many values are worth storing as most-common values. If * we are able to generate a complete MCV list (all the values in the * sample will fit, and we think these are all the ones in the table), * then do so. Otherwise, store only those values that are * significantly more common than the (estimated) average. We set the * threshold rather arbitrarily at 25% more than average, with at * least 2 instances in the sample. */ if (track_cnt < track_max && toowide_cnt == 0 && stats->stadistinct > 0 && track_cnt <= num_mcv) { /* Track list includes all values seen, and all will fit */ num_mcv = track_cnt; } else { double ndistinct = stats->stadistinct; double avgcount, mincount; if (ndistinct < 0) ndistinct = -ndistinct * totalrows; /* estimate # of occurrences in sample of a typical value */ avgcount = (double) samplerows / ndistinct; /* set minimum threshold count to store a value */ mincount = avgcount * 1.25; if (mincount < 2) mincount = 2; if (num_mcv > track_cnt) num_mcv = track_cnt; for (i = 0; i < num_mcv; i++) { if (track[i].count < mincount) { num_mcv = i; break; } } } /* Generate MCV slot entry */ if (num_mcv > 0) { MemoryContext old_context; Datum *mcv_values; float4 *mcv_freqs; /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum)); mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4)); for (i = 0; i < num_mcv; i++) { mcv_values[i] = datumCopy(track[i].value, stats->attr->attbyval, stats->attr->attlen); mcv_freqs[i] = (double) track[i].count / (double) samplerows; } MemoryContextSwitchTo(old_context); stats->stakind[0] = STATISTIC_KIND_MCV; stats->staop[0] = mystats->eqopr; stats->stanumbers[0] = mcv_freqs; stats->numnumbers[0] = num_mcv; stats->stavalues[0] = mcv_values; stats->numvalues[0] = num_mcv; } } else if (null_cnt > 0) { /* We found only nulls; assume the column is entirely null */ stats->stats_valid = true; stats->stanullfrac = 1.0; if (is_varwidth) stats->stawidth = 0; /* "unknown" */ else stats->stawidth = stats->attrtype->typlen; stats->stadistinct = 0.0; /* "unknown" */ } /* We don't need to bother cleaning up any of our temporary palloc's */}/* * compute_scalar_stats() -- compute column statistics * * We use this when we can find "=" and "<" operators for the datatype. * * We determine the fraction of non-null rows, the average width, the * most common values, the (estimated) number of distinct values, the * distribution histogram, and the correlation of physical to logical order. * * The desired stats can be determined fairly easily after sorting the * data values into order. */static voidcompute_scalar_stats(VacAttrStatsP stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows){ int i; int null_cnt = 0; int nonnull_cnt = 0; int toowide_cnt = 0; double total_width = 0; bool is_varlena = (!stats->attr->attbyval && stats->attr->attlen == -1); bool is_varwidth = (!stats->attr->attbyval && stats->attr->attlen < 0); double corr_xysum; RegProcedure cmpFn; SortFunctionKind cmpFnKind; FmgrInfo f_cmpfn; ScalarItem *values; int values_cnt = 0; int *tupnoLink; ScalarMCVItem *track; int track_cnt = 0; int num_mcv = stats->attr->attstattarget; int num_bins = stats->attr->attstattarget; StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data; values = (ScalarItem *) palloc(samplerows * sizeof(ScalarItem)); tupnoLink = (int *) palloc(samplerows * sizeof(int)); track = (ScalarMCVItem *) palloc(num_mcv * sizeof(ScalarMCVItem)); SelectSortFunction(mystats->ltopr, &cmpFn, &cmpFnKind); fmgr_info(cmpFn, &f_cmpfn); /* Initial scan to find sortable values */ for (i = 0; i < samplerows; i++) { Datum value; bool isnull; vacuum_delay_point(); value = fetchfunc(stats, i, &isnull); /* Check for null/nonnull */ if (isnull) { null_cnt++; continue; } nonnull_cnt++; /* * If it's a variable-width field, add up widths for average width
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?