Remove useless "rough estimate" path from mcelem_array_contained_selec.

The code in this function that tried to cope with a missing count histogram
was quite ineffective for anything except a perfectly flat distribution.
Furthermore, since we were already punting for missing MCELEM slot, it's
rather useless to sweat over missing DECHIST: there are no cases where
ANALYZE will create the first but not the second.  So just simplify the
code by punting rather than pretending we can do something useful.
pull/3/head
Tom Lane 14 years ago
parent 4fb694aebc
commit e2eed78910
  1 changed file with 138 lines changed:
      src/backend/utils/adt/array_selfuncs.c

@@ -242,8 +242,7 @@ scalararraysel_containment(PlannerInfo *root,
}
/*
* arraycontsel -- restriction selectivity for "arraycolumn @> const",
* "arraycolumn && const" or "arraycolumn <@ const"
* arraycontsel -- restriction selectivity for array @>, &&, <@ operators
*/
Datum
arraycontsel(PG_FUNCTION_ARGS)
@@ -323,8 +322,7 @@ arraycontsel(PG_FUNCTION_ARGS)
}
/*
* arraycontjoinsel -- join selectivity for "arraycolumn @> const",
* "arraycolumn && const" or "arraycolumn <@ const"
* arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
*/
Datum
arraycontjoinsel(PG_FUNCTION_ARGS)
@@ -744,6 +742,10 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
if (numbers == NULL || nnumbers != nmcelem + 3)
return DEFAULT_CONTAIN_SEL;
/* Can't do much without a count histogram, either */
if (hist == NULL || nhist < 3)
return DEFAULT_CONTAIN_SEL;
/*
* Grab some of the summary statistics that compute_array_stats() stores:
* lowest frequency, frequency of null elements, and average distinct
@@ -751,11 +753,7 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
*/
minfreq = numbers[nmcelem];
nullelem_freq = numbers[nmcelem + 2];
if (hist && nhist > 0)
avg_count = hist[nhist - 1];
else
avg_count = 10.0f; /* default assumption */
avg_count = hist[nhist - 1];
/*
* "rest" will be the sum of the frequencies of all elements not
@@ -853,83 +851,71 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
*/
mult *= exp(-rest);
/* Check we have nonempty distinct element count histogram */
if (hist && nhist >= 3)
{
/*----------
* Using the distinct element count histogram requires
* O(unique_nitems * (nmcelem + unique_nitems))
* operations. Beyond a certain computational cost threshold, it's
* reasonable to sacrifice accuracy for decreased planning time.
* We limit the number of operations to EFFORT * nmcelem; since
* nmcelem is limited by the column's statistics target, the work
* done is user-controllable.
*
* If the number of operations would be too large, we can reduce it
* without losing all accuracy by reducing unique_nitems and
* considering only the most-common elements of the constant array.
* To make the results exactly match what we would have gotten with
* only those elements to start with, we'd have to remove any
* discarded elements' frequencies from "mult", but since this is only
* an approximation anyway, we don't bother with that. Therefore it's
* sufficient to qsort elem_selec[] and take the largest elements.
* (They will no longer match up with the elements of array_data[],
* but we don't care.)
*----------
*/
/*----------
* Using the distinct element count histogram requires
* O(unique_nitems * (nmcelem + unique_nitems))
* operations. Beyond a certain computational cost threshold, it's
* reasonable to sacrifice accuracy for decreased planning time. We limit
* the number of operations to EFFORT * nmcelem; since nmcelem is limited
* by the column's statistics target, the work done is user-controllable.
*
* If the number of operations would be too large, we can reduce it
* without losing all accuracy by reducing unique_nitems and considering
* only the most-common elements of the constant array. To make the
* results exactly match what we would have gotten with only those
* elements to start with, we'd have to remove any discarded elements'
* frequencies from "mult", but since this is only an approximation
* anyway, we don't bother with that. Therefore it's sufficient to qsort
* elem_selec[] and take the largest elements. (They will no longer match
* up with the elements of array_data[], but we don't care.)
*----------
*/
#define EFFORT 100
if ((nmcelem + unique_nitems) > 0 &&
unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
{
/*
* Use the quadratic formula to solve for largest allowable N;
* we have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
*/
double b = (double) nmcelem;
int n;
n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
/* Sort, then take just the first n elements */
qsort(elem_selec, unique_nitems, sizeof(float),
float_compare_desc);
unique_nitems = n;
}
if ((nmcelem + unique_nitems) > 0 &&
unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
{
/*
* Calculate probabilities of each distinct element count for both
* mcelems and constant elements. At this point, assume independent
* element occurrence.
* Use the quadratic formula to solve for largest allowable N. We
* have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
*/
dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
double b = (double) nmcelem;
int n;
/* ignore hist[nhist-1], which is the avg not a histogram member */
hist_part = calc_hist(hist, nhist - 1, unique_nitems);
n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
selec = 0.0f;
for (i = 0; i <= unique_nitems; i++)
{
/*
* mult * dist[i] / mcelem_dist[i] gives us probability of qual
* matching from assumption of independent element occurrence with
* the condition that distinct element count = i.
*/
if (mcelem_dist[i] > 0)
selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
}
pfree(dist);
pfree(mcelem_dist);
pfree(hist_part);
/* Sort, then take just the first n elements */
qsort(elem_selec, unique_nitems, sizeof(float),
float_compare_desc);
unique_nitems = n;
}
else
/*
* Calculate probabilities of each distinct element count for both
* mcelems and constant elements. At this point, assume independent
* element occurrence.
*/
dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
/* ignore hist[nhist-1], which is the average not a histogram member */
hist_part = calc_hist(hist, nhist - 1, unique_nitems);
selec = 0.0f;
for (i = 0; i <= unique_nitems; i++)
{
/* We don't have histogram. Use a rough estimate. */
selec = mult;
/*
* mult * dist[i] / mcelem_dist[i] gives us probability of qual
* matching from assumption of independent element occurrence with
* the condition that distinct element count = i.
*/
if (mcelem_dist[i] > 0)
selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
}
pfree(dist);
pfree(mcelem_dist);
pfree(hist_part);
pfree(elem_selec);
/* Take into account occurrence of NULL element. */

Loading…
Cancel
Save