Formerly we only applied extended statistics to an OR clause as part
of the clauselist_selectivity() code path for an OR clause appearing
in an implicitly-ANDed list of clauses. This meant that it could only
use extended statistics if all sub-clauses of the OR clause were
covered by a single extended statistics object.
Instead, teach clause_selectivity() how to apply extended statistics
to an OR clause by handling its ORed list of sub-clauses in a similar
manner to an implicitly-ANDed list of sub-clauses, but with different
combination rules. This allows one or more extended statistics objects
to be used to estimate all or part of the list of sub-clauses. Any
remaining sub-clauses are then treated as if they are independent.
Additionally, to avoid double-application of extended statistics, this
introduces "extended" versions of clause_selectivity() and
clauselist_selectivity(), which include an option to ignore extended
statistics. This replaces the old clauselist_selectivity_simple()
function which failed to completely ignore extended statistics when
called from the extended statistics code.
A known limitation of the current infrastructure is that an AND clause
under an OR clause is not treated as compatible with extended
statistics (because we don't build RestrictInfos for such sub-AND
clauses). Thus, for example, "(a=1 AND b=1) OR (a=2 AND b=2)" will
currently be treated as two independent AND clauses (each of which may
be estimated using extended statistics), but extended statistics will
not currently be used to account for any possible overlap between
those clauses. Improving that is left as a task for the future.
Original patch by Tomas Vondra, with additional improvements by me.
Discussion: https://postgr.es/m/20200113230008.g67iyk4cs3xbnjju@development
@ -1113,6 +1113,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
200 | 200
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
estimated | actual
-----------+--------
200 | 200
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
estimated | actual
-----------+--------
@ -1173,13 +1179,6 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < ALL (ARRAY
100 | 100
(1 row)
-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
estimated | actual
-----------+--------
343 | 200
(1 row)
-- check change of unrelated column type does not reset the MCV statistics
ALTER TABLE mcv_lists ALTER COLUMN d TYPE VARCHAR(64);
SELECT d.stxdmcv IS NOT NULL
@ -1477,6 +1476,134 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND
1 | 0
(1 row)
-- mcv covering just a small fraction of data
CREATE TABLE mcv_lists_partial (
a INT,
b INT,
c INT
);
-- 10 frequent groups, each with 100 elements
INSERT INTO mcv_lists_partial (a, b, c)
SELECT
mod(i,10),
mod(i,10),
mod(i,10)
FROM generate_series(0,999) s(i);
-- 100 groups that will make it to the MCV list (includes the 10 frequent ones)
INSERT INTO mcv_lists_partial (a, b, c)
SELECT
i,
i,
i
FROM generate_series(0,99) s(i);
-- 4000 groups in total, most of which won't make it (just a single item)
INSERT INTO mcv_lists_partial (a, b, c)
SELECT
i,
i,
i
FROM generate_series(0,3999) s(i);
ANALYZE mcv_lists_partial;
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0');
estimated | actual
-----------+--------
1 | 102
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0');
estimated | actual
-----------+--------
300 | 102
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10');
estimated | actual
-----------+--------
1 | 2
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10');
estimated | actual
-----------+--------
6 | 2
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10');
estimated | actual
-----------+--------
1 | 0
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10');
estimated | actual
-----------+--------
204 | 104
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0 AND c = 0) OR (a = 1 AND b = 1 AND c = 1) OR (a = 2 AND b = 2 AND c = 2)');
estimated | actual
-----------+--------
1 | 306
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0) OR (a = 0 AND c = 0) OR (b = 0 AND c = 0)');
estimated | actual
-----------+--------
6 | 102
(1 row)
CREATE STATISTICS mcv_lists_partial_stats (mcv) ON a, b, c
FROM mcv_lists_partial;
ANALYZE mcv_lists_partial;
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0');
estimated | actual
-----------+--------
102 | 102
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0');
estimated | actual
-----------+--------
96 | 102
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10');
estimated | actual
-----------+--------
2 | 2
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10');
estimated | actual
-----------+--------
2 | 2
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10');
estimated | actual
-----------+--------
1 | 0
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10');
estimated | actual
-----------+--------
102 | 104
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0 AND c = 0) OR (a = 1 AND b = 1 AND c = 1) OR (a = 2 AND b = 2 AND c = 2)');
estimated | actual
-----------+--------
300 | 306
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0) OR (a = 0 AND c = 0) OR (b = 0 AND c = 0)');
estimated | actual
-----------+--------
306 | 102
(1 row)
DROP TABLE mcv_lists_partial;
-- check the ability to use multiple MCV lists
CREATE TABLE mcv_lists_multi (
a INTEGER,
@ -1506,12 +1633,36 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AN
102 | 714
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0');
estimated | actual
-----------+--------
143 | 142
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0');
estimated | actual
-----------+--------
1571 | 1572
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
estimated | actual
-----------+--------
4 | 142
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)');
estimated | actual
-----------+--------
298 | 1572
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0');
estimated | actual
-----------+--------
2649 | 1572
(1 row)
-- create separate MCV statistics
CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi;
CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi;
@ -1528,12 +1679,36 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AN
714 | 714
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0');
estimated | actual
-----------+--------
143 | 142
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0');
estimated | actual
-----------+--------
1571 | 1572
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
estimated | actual
-----------+--------
143 | 142
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)');
estimated | actual
-----------+--------
1571 | 1572
(1 row)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0');
estimated | actual
-----------+--------
1714 | 1572
(1 row)
DROP TABLE mcv_lists_multi;
-- Permission tests. Users should not be able to see specific data values in
-- the extended statistics, if they lack permission to see those values in
@ -561,6 +561,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 4 >= a AND ''0
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)');
@ -581,9 +583,6 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < ALL (ARRAY
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists WHERE a < ALL (ARRAY[4, 5]) AND b IN (''1'', ''2'', NULL, ''3'') AND c > ANY (ARRAY[1, 2, NULL, 3])');
-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
-- check change of unrelated column type does not reset the MCV statistics
ALTERTABLEmcv_listsALTERCOLUMNdTYPEVARCHAR(64);
@ -777,6 +776,78 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c');
-- mcv covering just a small fraction of data
CREATETABLEmcv_lists_partial(
aINT,
bINT,
cINT
);
-- 10 frequent groups, each with 100 elements
INSERTINTOmcv_lists_partial(a,b,c)
SELECT
mod(i,10),
mod(i,10),
mod(i,10)
FROMgenerate_series(0,999)s(i);
-- 100 groups that will make it to the MCV list (includes the 10 frequent ones)
INSERTINTOmcv_lists_partial(a,b,c)
SELECT
i,
i,
i
FROMgenerate_series(0,99)s(i);
-- 4000 groups in total, most of which won't make it (just a single item)
INSERTINTOmcv_lists_partial(a,b,c)
SELECT
i,
i,
i
FROMgenerate_series(0,3999)s(i);
ANALYZEmcv_lists_partial;
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0 AND c = 0) OR (a = 1 AND b = 1 AND c = 1) OR (a = 2 AND b = 2 AND c = 2)');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0) OR (a = 0 AND c = 0) OR (b = 0 AND c = 0)');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0 AND c = 0) OR (a = 1 AND b = 1 AND c = 1) OR (a = 2 AND b = 2 AND c = 2)');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0) OR (a = 0 AND c = 0) OR (b = 0 AND c = 0)');
DROPTABLEmcv_lists_partial;
-- check the ability to use multiple MCV lists
CREATETABLEmcv_lists_multi(
aINTEGER,
@ -799,7 +870,11 @@ ANALYZE mcv_lists_multi;
-- estimates without any mcv statistics
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)');
SELECT*FROMcheck_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0');