analyze.c
来自「PostgreSQL 8.1.4的源码 适用于Linux下的开源数据库系统」· C语言 代码 · 共 2,242 行 · 第 1/5 页
C
2,242 行
* calculation. Note that if the value is toasted, we use the toasted * width. We don't bother with this calculation if it's a fixed-width * type. */ if (is_varlena) { total_width += VARSIZE(DatumGetPointer(value)); /* * If the value is toasted, we want to detoast it just once to * avoid repeated detoastings and resultant excess memory usage * during the comparisons. Also, check to see if the value is * excessively wide, and if so don't detoast at all --- just * ignore the value. */ if (toast_raw_datum_size(value) > WIDTH_THRESHOLD) { toowide_cnt++; continue; } value = PointerGetDatum(PG_DETOAST_DATUM(value)); } else if (is_varwidth) { /* must be cstring */ total_width += strlen(DatumGetCString(value)) + 1; } /* Add it to the list to be sorted */ values[values_cnt].value = value; values[values_cnt].tupno = values_cnt; tupnoLink[values_cnt] = values_cnt; values_cnt++; } /* We can only compute real stats if we found some sortable values. */ if (values_cnt > 0) { int ndistinct, /* # distinct values in sample */ nmultiple, /* # that appear multiple times */ num_hist, dups_cnt; int slot_idx = 0; /* Sort the collected values */ datumCmpFn = &f_cmpfn; datumCmpFnKind = cmpFnKind; datumCmpTupnoLink = tupnoLink; qsort((void *) values, values_cnt, sizeof(ScalarItem), compare_scalars); /* * Now scan the values in order, find the most common ones, and also * accumulate ordering-correlation statistics. * * To determine which are most common, we first have to count the * number of duplicates of each value. The duplicates are adjacent in * the sorted list, so a brute-force approach is to compare successive * datum values until we find two that are not equal. However, that * requires N-1 invocations of the datum comparison routine, which are * completely redundant with work that was done during the sort. (The * sort algorithm must at some point have compared each pair of items * that are adjacent in the sorted order; otherwise it could not know * that it's ordered the pair correctly.) We exploit this by having * compare_scalars remember the highest tupno index that each * ScalarItem has been found equal to. At the end of the sort, a * ScalarItem's tupnoLink will still point to itself if and only if it * is the last item of its group of duplicates (since the group will * be ordered by tupno). */ corr_xysum = 0; ndistinct = 0; nmultiple = 0; dups_cnt = 0; for (i = 0; i < values_cnt; i++) { int tupno = values[i].tupno; corr_xysum += ((double) i) * ((double) tupno); dups_cnt++; if (tupnoLink[tupno] == tupno) { /* Reached end of duplicates of this value */ ndistinct++; if (dups_cnt > 1) { nmultiple++; if (track_cnt < num_mcv || dups_cnt > track[track_cnt - 1].count) { /* * Found a new item for the mcv list; find its * position, bubbling down old items if needed. Loop * invariant is that j points at an empty/ replaceable * slot. */ int j; if (track_cnt < num_mcv) track_cnt++; for (j = track_cnt - 1; j > 0; j--) { if (dups_cnt <= track[j - 1].count) break; track[j].count = track[j - 1].count; track[j].first = track[j - 1].first; } track[j].count = dups_cnt; track[j].first = i + 1 - dups_cnt; } } dups_cnt = 0; } } stats->stats_valid = true; /* Do the simple null-frac and width stats */ stats->stanullfrac = (double) null_cnt / (double) samplerows; if (is_varwidth) stats->stawidth = total_width / (double) nonnull_cnt; else stats->stawidth = stats->attrtype->typlen; if (nmultiple == 0) { /* If we found no repeated values, assume it's a unique column */ stats->stadistinct = -1.0; } else if (toowide_cnt == 0 && nmultiple == ndistinct) { /* * Every value in the sample appeared more than once. Assume the * column has just these values. */ stats->stadistinct = ndistinct; } else { /*---------- * Estimate the number of distinct values using the estimator * proposed by Haas and Stokes in IBM Research Report RJ 10025: * n*d / (n - f1 + f1*n/N) * where f1 is the number of distinct values that occurred * exactly once in our sample of n rows (from a total of N), * and d is the total number of distinct values in the sample. * This is their Duj1 estimator; the other estimators they * recommend are considerably more complex, and are numerically * very unstable when n is much smaller than N. * * Overwidth values are assumed to have been distinct. *---------- */ int f1 = ndistinct - nmultiple + toowide_cnt; int d = f1 + nmultiple; double numer, denom, stadistinct; numer = (double) samplerows *(double) d; denom = (double) (samplerows - f1) + (double) f1 *(double) samplerows / totalrows; stadistinct = numer / denom; /* Clamp to sane range in case of roundoff error */ if (stadistinct < (double) d) stadistinct = (double) d; if (stadistinct > totalrows) stadistinct = totalrows; stats->stadistinct = floor(stadistinct + 0.5); } /* * If we estimated the number of distinct values at more than 10% of * the total row count (a very arbitrary limit), then assume that * stadistinct should scale with the row count rather than be a fixed * value. */ if (stats->stadistinct > 0.1 * totalrows) stats->stadistinct = -(stats->stadistinct / totalrows); /* * Decide how many values are worth storing as most-common values. If * we are able to generate a complete MCV list (all the values in the * sample will fit, and we think these are all the ones in the table), * then do so. Otherwise, store only those values that are * significantly more common than the (estimated) average. We set the * threshold rather arbitrarily at 25% more than average, with at * least 2 instances in the sample. Also, we won't suppress values * that have a frequency of at least 1/K where K is the intended * number of histogram bins; such values might otherwise cause us to * emit duplicate histogram bin boundaries. */ if (track_cnt == ndistinct && toowide_cnt == 0 && stats->stadistinct > 0 && track_cnt <= num_mcv) { /* Track list includes all values seen, and all will fit */ num_mcv = track_cnt; } else { double ndistinct = stats->stadistinct; double avgcount, mincount, maxmincount; if (ndistinct < 0) ndistinct = -ndistinct * totalrows; /* estimate # of occurrences in sample of a typical value */ avgcount = (double) samplerows / ndistinct; /* set minimum threshold count to store a value */ mincount = avgcount * 1.25; if (mincount < 2) mincount = 2; /* don't let threshold exceed 1/K, however */ maxmincount = (double) samplerows / (double) num_bins; if (mincount > maxmincount) mincount = maxmincount; if (num_mcv > track_cnt) num_mcv = track_cnt; for (i = 0; i < num_mcv; i++) { if (track[i].count < mincount) { num_mcv = i; break; } } } /* Generate MCV slot entry */ if (num_mcv > 0) { MemoryContext old_context; Datum *mcv_values; float4 *mcv_freqs; /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum)); mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4)); for (i = 0; i < num_mcv; i++) { mcv_values[i] = datumCopy(values[track[i].first].value, stats->attr->attbyval, stats->attr->attlen); mcv_freqs[i] = (double) track[i].count / (double) samplerows; } MemoryContextSwitchTo(old_context); stats->stakind[slot_idx] = STATISTIC_KIND_MCV; stats->staop[slot_idx] = mystats->eqopr; stats->stanumbers[slot_idx] = mcv_freqs; stats->numnumbers[slot_idx] = num_mcv; stats->stavalues[slot_idx] = mcv_values; stats->numvalues[slot_idx] = num_mcv; slot_idx++; } /* * Generate a histogram slot entry if there are at least two distinct * values not accounted for in the MCV list. (This ensures the * histogram won't collapse to empty or a singleton.) */ num_hist = ndistinct - num_mcv; if (num_hist > num_bins) num_hist = num_bins + 1; if (num_hist >= 2) { MemoryContext old_context; Datum *hist_values; int nvals; /* Sort the MCV items into position order to speed next loop */ qsort((void *) track, num_mcv, sizeof(ScalarMCVItem), compare_mcvs); /* * Collapse out the MCV items from the values[] array. * * Note we destroy the values[] array here... but we don't need it * for anything more. We do, however, still need values_cnt. * nvals will be the number of remaining entries in values[]. */ if (num_mcv > 0) { int src, dest; int j; src = dest = 0; j = 0; /* index of next interesting MCV item */ while (src < values_cnt) { int ncopy; if (j < num_mcv) { int first = track[j].first; if (src >= first) { /* advance past this MCV item */ src = first + track[j].count; j++; continue; } ncopy = first - src; } else ncopy = values_cnt - src; memmove(&values[dest], &values[src], ncopy * sizeof(ScalarItem)); src += ncopy; dest += ncopy; } nvals = dest; } else nvals = values_cnt; Assert(nvals >= num_hist); /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); hist_values = (Datum *) palloc(num_hist * sizeof(Datum)); for (i = 0; i < num_hist; i++) { int pos; pos = (i * (nvals - 1)) / (num_hist - 1); hist_values[i] = datumCopy(values[pos].value, stats->attr->attbyval, stats->attr->attlen); } MemoryContextSwitchTo(old_context); stats->stakind[slot_idx] = STATISTIC_KIND_HISTOGRAM; stats->staop[slot_idx] = mystats->ltopr; stats->stavalues[slot_idx] = hist_values; stats->numvalues[slot_idx] = num_hist; slot_idx++; } /* Generate a correlation entry if there are multiple values */ if (values_cnt > 1) { MemoryContext old_context; float4 *corrs; double corr_xsum, corr_x2sum; /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); corrs = (float4 *) palloc(sizeof(float4)); MemoryContextSwitchTo(old_context); /*---------- * Since we know the x and y value sets are both * 0, 1, ..., values_cnt-1 * we have sum(x) = sum(y) = * (values_cnt-1)*values_cnt / 2 * and sum(x^2) = sum(y^2) = * (values_cnt-1)*values_cnt*(2*values_cnt-1) / 6. *---------- */ corr_xsum = ((double) (values_cnt - 1)) * ((double) values_cnt) / 2.0; corr_x2sum = ((double) (values_cnt - 1)) * ((double) values_cnt) * (double) (2 * values_cnt - 1) / 6.0; /* And the correlation coefficient reduces to */ corrs[0] = (values_cnt * corr_xysum - corr_xsum * corr_xsum) / (values_cnt * corr_x2sum - corr_xsum * corr_xsum); stats->stakind[slot_idx] = STATISTIC_KIND_CORRELATION; stats->staop[slot_idx] = mystats->ltopr; stats->stanumbers[slot_idx] = corrs; stats->numnumbers[slot_idx] = 1; slot_idx++; } } else if (nonnull_cnt == 0 && null_cnt > 0) { /* We found only nulls; assume the column is entirely null */ stats->stats_valid = true; stats->stanullfrac = 1.0; if (is_varwidth) stats->stawidth = 0; /* "unknown" */ else stats->stawidth = stats->attrtype->typlen; stats->stadistinct = 0.0; /* "unknown" */ } /* We don't need to bother cleaning up any of our temporary palloc's */}/* * qsort comparator for sorting ScalarItems * * Aside from sorting the items, we update the datumCmpTupnoLink[] array * whenever two ScalarItems are found to contain equal datums. The array * is indexed by tupno; for each ScalarItem, it contains the highest * tupno that that item's datum has been found to be equal to. This allows * us to avoid additional comparisons in compute_scalar_stats(). */static intcompare_scalars(const void *a, const void *b){ Datum da = ((ScalarItem *) a)->value; int ta = ((ScalarItem *) a)->tupno; Datum db = ((ScalarItem *) b)->value; int tb = ((ScalarItem *) b)->tupno; int32 compare; compare = ApplySortFunction(datumCmpFn, datumCmpFnKind, da, false, db, false); if (compare != 0) return compare; /* * The two datums are equal, so update datumCmpTupnoLink[]. */ if (datumCmpTupnoLink[ta] < tb) datumCmpTupnoLink[ta] = tb; if (datumCmpTupnoLink[tb] < ta) datumCmpTupnoLink[tb] = ta; /* * For equal datums, sort by tupno */ return ta - tb;}/* * qsort comparator for sorting ScalarMCVItems by position */static intcompare_mcvs(const void *a, const void *b){ int da = ((ScalarMCVItem *) a)->first; int db = ((ScalarMCVItem *) b)->first; return da - db;}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?