analyze.c
来自「PostgreSQL7.4.6 for Linux」· C语言 代码 · 共 1,819 行 · 第 1/4 页
C
1,819 行
int firstcount1, j; CHECK_FOR_INTERRUPTS(); value = heap_getattr(tuple, stats->attnum, tupDesc, &isnull); /* Check for null/nonnull */ if (isnull) { null_cnt++; continue; } nonnull_cnt++; /* * If it's a variable-width field, add up widths for average width * calculation. Note that if the value is toasted, we use the * toasted width. We don't bother with this calculation if it's a * fixed-width type. */ if (is_varlena) { total_width += VARSIZE(DatumGetPointer(value)); /* * If the value is toasted, we want to detoast it just once to * avoid repeated detoastings and resultant excess memory * usage during the comparisons. Also, check to see if the * value is excessively wide, and if so don't detoast at all * --- just ignore the value. */ if (toast_raw_datum_size(value) > WIDTH_THRESHOLD) { toowide_cnt++; continue; } value = PointerGetDatum(PG_DETOAST_DATUM(value)); } else if (is_varwidth) { /* must be cstring */ total_width += strlen(DatumGetCString(value)) + 1; } /* * See if the value matches anything we're already tracking. */ match = false; firstcount1 = track_cnt; for (j = 0; j < track_cnt; j++) { if (DatumGetBool(FunctionCall2(&f_cmpeq, value, track[j].value))) { match = true; break; } if (j < firstcount1 && track[j].count == 1) firstcount1 = j; } if (match) { /* Found a match */ track[j].count++; /* This value may now need to "bubble up" in the track list */ while (j > 0 && track[j].count > track[j - 1].count) { swapDatum(track[j].value, track[j - 1].value); swapInt(track[j].count, track[j - 1].count); j--; } } else { /* No match. Insert at head of count-1 list */ if (track_cnt < track_max) track_cnt++; for (j = track_cnt - 1; j > firstcount1; j--) { track[j].value = track[j - 1].value; track[j].count = track[j - 1].count; } if (firstcount1 < track_cnt) { track[firstcount1].value = value; track[firstcount1].count = 1; } } } /* We can only compute valid stats if we found some non-null values. */ if (nonnull_cnt > 0) { int nmultiple, summultiple; stats->stats_valid = true; /* Do the simple null-frac and width stats */ stats->stanullfrac = (double) null_cnt / (double) numrows; if (is_varwidth) stats->stawidth = total_width / (double) nonnull_cnt; else stats->stawidth = stats->attrtype->typlen; /* Count the number of values we found multiple times */ summultiple = 0; for (nmultiple = 0; nmultiple < track_cnt; nmultiple++) { if (track[nmultiple].count == 1) break; summultiple += track[nmultiple].count; } if (nmultiple == 0) { /* If we found no repeated values, assume it's a unique column */ stats->stadistinct = -1.0; } else if (track_cnt < track_max && toowide_cnt == 0 && nmultiple == track_cnt) { /* * Our track list includes every value in the sample, and * every value appeared more than once. Assume the column has * just these values. */ stats->stadistinct = track_cnt; } else { /*---------- * Estimate the number of distinct values using the estimator * proposed by Haas and Stokes in IBM Research Report RJ 10025: * n*d / (n - f1 + f1*n/N) * where f1 is the number of distinct values that occurred * exactly once in our sample of n rows (from a total of N), * and d is the total number of distinct values in the sample. * This is their Duj1 estimator; the other estimators they * recommend are considerably more complex, and are numerically * very unstable when n is much smaller than N. * * We assume (not very reliably!) that all the multiply-occurring * values are reflected in the final track[] list, and the other * nonnull values all appeared but once. (XXX this usually * results in a drastic overestimate of ndistinct. Can we do * any better?) *---------- */ int f1 = nonnull_cnt - summultiple; int d = f1 + nmultiple; double numer, denom, stadistinct; numer = (double) numrows *(double) d; denom = (double) (numrows - f1) + (double) f1 *(double) numrows / totalrows; stadistinct = numer / denom; /* Clamp to sane range in case of roundoff error */ if (stadistinct < (double) d) stadistinct = (double) d; if (stadistinct > totalrows) stadistinct = totalrows; stats->stadistinct = floor(stadistinct + 0.5); } /* * If we estimated the number of distinct values at more than 10% * of the total row count (a very arbitrary limit), then assume * that stadistinct should scale with the row count rather than be * a fixed value. */ if (stats->stadistinct > 0.1 * totalrows) stats->stadistinct = -(stats->stadistinct / totalrows); /* * Decide how many values are worth storing as most-common values. * If we are able to generate a complete MCV list (all the values * in the sample will fit, and we think these are all the ones in * the table), then do so. Otherwise, store only those values * that are significantly more common than the (estimated) * average. We set the threshold rather arbitrarily at 25% more * than average, with at least 2 instances in the sample. */ if (track_cnt < track_max && toowide_cnt == 0 && stats->stadistinct > 0 && track_cnt <= num_mcv) { /* Track list includes all values seen, and all will fit */ num_mcv = track_cnt; } else { double ndistinct = stats->stadistinct; double avgcount, mincount; if (ndistinct < 0) ndistinct = -ndistinct * totalrows; /* estimate # of occurrences in sample of a typical value */ avgcount = (double) numrows / ndistinct; /* set minimum threshold count to store a value */ mincount = avgcount * 1.25; if (mincount < 2) mincount = 2; if (num_mcv > track_cnt) num_mcv = track_cnt; for (i = 0; i < num_mcv; i++) { if (track[i].count < mincount) { num_mcv = i; break; } } } /* Generate MCV slot entry */ if (num_mcv > 0) { MemoryContext old_context; Datum *mcv_values; float4 *mcv_freqs; /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(anl_context); mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum)); mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4)); for (i = 0; i < num_mcv; i++) { mcv_values[i] = datumCopy(track[i].value, stats->attr->attbyval, stats->attr->attlen); mcv_freqs[i] = (double) track[i].count / (double) numrows; } MemoryContextSwitchTo(old_context); stats->stakind[0] = STATISTIC_KIND_MCV; stats->staop[0] = stats->eqopr; stats->stanumbers[0] = mcv_freqs; stats->numnumbers[0] = num_mcv; stats->stavalues[0] = mcv_values; stats->numvalues[0] = num_mcv; } } /* We don't need to bother cleaning up any of our temporary palloc's */}/* * compute_scalar_stats() -- compute column statistics * * We use this when we can find "=" and "<" operators for the datatype. * * We determine the fraction of non-null rows, the average width, the * most common values, the (estimated) number of distinct values, the * distribution histogram, and the correlation of physical to logical order. * * The desired stats can be determined fairly easily after sorting the * data values into order. */static voidcompute_scalar_stats(VacAttrStats *stats, TupleDesc tupDesc, double totalrows, HeapTuple *rows, int numrows){ int i; int null_cnt = 0; int nonnull_cnt = 0; int toowide_cnt = 0; double total_width = 0; bool is_varlena = (!stats->attr->attbyval && stats->attr->attlen == -1); bool is_varwidth = (!stats->attr->attbyval && stats->attr->attlen < 0); double corr_xysum; RegProcedure cmpFn; SortFunctionKind cmpFnKind; FmgrInfo f_cmpfn; ScalarItem *values; int values_cnt = 0; int *tupnoLink; ScalarMCVItem *track; int track_cnt = 0; int num_mcv = stats->attr->attstattarget; int num_bins = stats->attr->attstattarget; values = (ScalarItem *) palloc(numrows * sizeof(ScalarItem)); tupnoLink = (int *) palloc(numrows * sizeof(int)); track = (ScalarMCVItem *) palloc(num_mcv * sizeof(ScalarMCVItem)); SelectSortFunction(stats->ltopr, &cmpFn, &cmpFnKind); fmgr_info(cmpFn, &f_cmpfn); /* Initial scan to find sortable values */ for (i = 0; i < numrows; i++) { HeapTuple tuple = rows[i]; Datum value; bool isnull; CHECK_FOR_INTERRUPTS(); value = heap_getattr(tuple, stats->attnum, tupDesc, &isnull); /* Check for null/nonnull */ if (isnull) { null_cnt++; continue; } nonnull_cnt++; /* * If it's a variable-width field, add up widths for average width * calculation. Note that if the value is toasted, we use the * toasted width. We don't bother with this calculation if it's a * fixed-width type. */ if (is_varlena) { total_width += VARSIZE(DatumGetPointer(value)); /* * If the value is toasted, we want to detoast it just once to * avoid repeated detoastings and resultant excess memory * usage during the comparisons. Also, check to see if the * value is excessively wide, and if so don't detoast at all * --- just ignore the value. */ if (toast_raw_datum_size(value) > WIDTH_THRESHOLD) { toowide_cnt++; continue; } value = PointerGetDatum(PG_DETOAST_DATUM(value)); } else if (is_varwidth) { /* must be cstring */ total_width += strlen(DatumGetCString(value)) + 1; } /* Add it to the list to be sorted */ values[values_cnt].value = value; values[values_cnt].tupno = values_cnt; tupnoLink[values_cnt] = values_cnt; values_cnt++; } /* We can only compute valid stats if we found some sortable values. */ if (values_cnt > 0) { int ndistinct, /* # distinct values in sample */ nmultiple, /* # that appear multiple times */ num_hist, dups_cnt; int slot_idx = 0; /* Sort the collected values */ datumCmpFn = &f_cmpfn; datumCmpFnKind = cmpFnKind; datumCmpTupnoLink = tupnoLink; qsort((void *) values, values_cnt, sizeof(ScalarItem), compare_scalars); /* * Now scan the values in order, find the most common ones, and * also accumulate ordering-correlation statistics. * * To determine which are most common, we first have to count the * number of duplicates of each value. The duplicates are * adjacent in the sorted list, so a brute-force approach is to * compare successive datum values until we find two that are not * equal. However, that requires N-1 invocations of the datum * comparison routine, which are completely redundant with work * that was done during the sort. (The sort algorithm must at * some point have compared each pair of items that are adjacent * in the sorted order; otherwise it could not know that it's * ordered the pair correctly.) We exploit this by having * compare_scalars remember the highest tupno index that each * ScalarItem has been found equal to. At the end of the sort, a * ScalarItem's tupnoLink will still point to itself if and only * if it is the last item of its group of duplicates (since the * group will be ordered by tupno). */ corr_xysum = 0; ndistinct = 0; nmultiple = 0; dups_cnt = 0; for (i = 0; i < values_cnt; i++) { int tupno = values[i].tupno; corr_xysum += ((double) i) * ((double) tupno); dups_cnt++; if (tupnoLink[tupno] == tupno) { /* Reached end of duplicates of this value */ ndistinct++; if (dups_cnt > 1) { nmultiple++; if (track_cnt < num_mcv || dups_cnt > track[track_cnt - 1].count) { /* * Found a new item for the mcv list; find its * position, bubbling down old items if needed. * Loop invariant is that j points at an empty/ * replaceable slot. */ int j; if (track_cnt < num_mcv) track_cnt++; for (j = track_cnt - 1; j > 0; j--) { if (dups_cnt <= track[j - 1].count) break; track[j].count = track[j - 1].count; track[j].first = track[j - 1].first; } track[j].count = dups_cnt; track[j].first = i + 1 - dups_cnt; } } dups_cnt = 0; } } stats->stats_valid = true; /* Do the simple null-frac and width stats */ stats->stanullfrac = (double) null_cnt / (double) numrows; if (is_varwidth) stats->stawidth = total_width / (double) nonnull_cnt; else stats->stawidth = stats->attrtype->typlen; if (nmultiple == 0) { /* If we found no repeated values, assume it's a unique column */ stats->stadistinct = -1.0; } else if (toowide_cnt == 0 && nmultiple == ndistinct) { /* * Every value in the sample appeared more than once. Assume * the column has just these values. */ stats->stadistinct = ndistinct;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?