Consider collation when proving subquery uniqueness

rel_is_distinct_for()'s RTE_SUBQUERY branch passed only the equality
operator from each join clause to query_is_distinct_for(), discarding
the operator's input collation.  query_is_distinct_for() then verified
opfamily compatibility but never checked collations, so a DISTINCT /
GROUP BY / set-op operating under one collation was trusted to prove
uniqueness for a comparison performed under an unrelated collation.
As with the recent fix in relation_has_unique_index_for(), this is
unsound for nondeterministic collations and yields wrong query results
in any optimization that consumes the proof.

Fix by carrying each clause's operator input collation into
query_is_distinct_for() and validating it at every check-site against
the subquery target expression's collation.

Back-patch to all supported branches.  query_is_distinct_for() is
declared in an installed header, so on stable branches the existing
two-list signature is retained as a thin wrapper that forwards to a
new collation-aware entry point; external callers continue to receive
the historical collation-blind answer.

Author: Richard Guo <guofenglinux@gmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/CAMbWs4_XUUSTyzCaRjUeeahWNqi=8ZOA5Q4coi8zUVEDSBkM6A@mail.gmail.com
Backpatch-through: 14
REL_18_STABLE
Richard Guo 3 days ago
parent b62f514ac5
commit bed3ffbf9d
  1. 177
      src/backend/optimizer/plan/analyzejoins.c
  2. 181
      src/test/regress/expected/collate.icu.utf8.out
  3. 58
      src/test/regress/sql/collate.icu.utf8.sql
  4. 1
      src/tools/pgindent/typedefs.list

@ -34,6 +34,20 @@
#include "rewrite/rewriteManip.h"
#include "utils/lsyscache.h"
/*
* One element of the list passed to query_is_distinct_for_with_collations().
* Each entry names a subquery output column that the caller needs to be
* distinct over, plus the upper-level equality operator and its input
* collation, so that the subquery's own DISTINCT/GROUP BY/set-op clauses can
* be compared for compatibility.
*/
typedef struct DistinctColInfo
{
int colno; /* subquery output column resno */
Oid opid; /* upper-level equality operator */
Oid collid; /* input collation of opid */
} DistinctColInfo;
/*
* Utility structure. A sorting procedure is needed to simplify the search
* of SJE-candidate baserels referencing the same database relation. Having
@ -65,7 +79,9 @@ static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved);
static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel);
static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel,
List *clause_list, List **extra_clauses);
static Oid distinct_col_search(int colno, List *colnos, List *opids);
static bool query_is_distinct_for_with_collations(Query *query,
List *distinct_cols);
static DistinctColInfo *distinct_col_search(int colno, List *distinct_cols);
static bool is_innerrel_unique_for(PlannerInfo *root,
Relids joinrelids,
Relids outerrelids,
@ -1018,15 +1034,17 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
{
Index relid = rel->relid;
Query *subquery = root->simple_rte_array[relid]->subquery;
List *colnos = NIL;
List *opids = NIL;
List *distinct_cols = NIL;
ListCell *l;
/*
* Build the argument lists for query_is_distinct_for: a list of
* output column numbers that the query needs to be distinct over, and
* a list of equality operators that the output columns need to be
* distinct according to.
* Build the argument list for query_is_distinct_for_with_collations:
* a list of DistinctColInfo entries, each holding an output column
* number that the query needs to be distinct over, the equality
* operator that the column needs to be distinct according to, and
* that operator's input collation. The collation matters because the
* subquery's own DISTINCT / GROUP BY / set-op proves uniqueness under
* its own collation, which need not agree with the operator's.
*
* (XXX we are not considering restriction clauses attached to the
* subquery; is that worth doing?)
@ -1034,18 +1052,18 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
foreach(l, clause_list)
{
RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
Oid op;
OpExpr *opexpr;
Var *var;
DistinctColInfo *dcinfo;
/*
* Get the equality operator we need uniqueness according to.
* (This might be a cross-type operator and thus not exactly the
* same operator the subquery would consider; that's all right
* since query_is_distinct_for can resolve such cases.) The
* caller's mergejoinability test should have selected only
* OpExprs.
* The caller's mergejoinability test should have selected only
* OpExprs. The operator might be a cross-type operator and thus
* not exactly the same operator the subquery would consider;
* that's all right since query_is_distinct_for_with_collations
* can resolve such cases.
*/
op = castNode(OpExpr, rinfo->clause)->opno;
opexpr = castNode(OpExpr, rinfo->clause);
/* caller identified the inner side for us */
if (rinfo->outer_is_left)
@ -1069,11 +1087,14 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
var->varno != relid || var->varlevelsup != 0)
continue;
colnos = lappend_int(colnos, var->varattno);
opids = lappend_oid(opids, op);
dcinfo = palloc(sizeof(DistinctColInfo));
dcinfo->colno = var->varattno;
dcinfo->opid = opexpr->opno;
dcinfo->collid = opexpr->inputcollid;
distinct_cols = lappend(distinct_cols, dcinfo);
}
if (query_is_distinct_for(subquery, colnos, opids))
if (query_is_distinct_for_with_collations(subquery, distinct_cols))
return true;
}
return false;
@ -1111,31 +1132,70 @@ query_supports_distinctness(Query *query)
}
/*
* query_is_distinct_for - does query never return duplicates of the
* specified columns?
* query_is_distinct_for - ABI-preserving wrapper around
* query_is_distinct_for_with_collations().
*
* The original signature took parallel colnos/opids lists and did not
* consider collations. External callers built against earlier minor
* releases continue to call it with the historical (collation-blind)
* semantics; we forward with InvalidOid collations, which makes the
* collation check a no-op (see collations_agree_on_equality()).
*/
bool
query_is_distinct_for(Query *query, List *colnos, List *opids)
{
List *distinct_cols = NIL;
ListCell *lc1;
ListCell *lc2;
Assert(list_length(colnos) == list_length(opids));
forboth(lc1, colnos, lc2, opids)
{
DistinctColInfo *dcinfo = palloc(sizeof(DistinctColInfo));
dcinfo->colno = lfirst_int(lc1);
dcinfo->opid = lfirst_oid(lc2);
dcinfo->collid = InvalidOid;
distinct_cols = lappend(distinct_cols, dcinfo);
}
return query_is_distinct_for_with_collations(query, distinct_cols);
}
/*
* query_is_distinct_for_with_collations - does query never return duplicates
* of the specified columns?
*
* query is a not-yet-planned subquery (in current usage, it's always from
* a subquery RTE, which the planner avoids scribbling on).
*
* colnos is an integer list of output column numbers (resno's). We are
* interested in whether rows consisting of just these columns are certain
* to be distinct. "Distinctness" is defined according to whether the
* corresponding upper-level equality operators listed in opids would think
* the values are distinct. (Note: the opids entries could be cross-type
* operators, and thus not exactly the equality operators that the subquery
* would use itself. We use equality_ops_are_compatible() to check
* compatibility. That looks at opfamily membership for index AMs that have
* declared that they support consistent equality semantics within an
* distinct_cols is a list of DistinctColInfo, one per requested output column.
* Each entry names the subquery output column number we want distinct, the
* upper-level equality operator we'll compare values with, and that operator's
* input collation. We are interested in whether rows consisting of just these
* columns are certain to be distinct.
*
* "Distinctness" is defined according to whether the corresponding upper-level
* equality operators would think the values are distinct. (Note: each opid
* could be a cross-type operator, and thus not exactly the equality operator
* that the subquery would use itself. We use equality_ops_are_compatible() to
* check compatibility. That looks at opfamily membership for index AMs that
* have declared that they support consistent equality semantics within an
* opfamily, and so should give trustworthy answers for all operators that we
* might need to deal with here.)
*
* The collid must also agree on equality with the collation the subquery's own
* DISTINCT/GROUP BY/set-op uses to deduplicate the column, else the subquery's
* distinctness does not carry over to the caller's equality semantics. Two
* collations agree on equality if they match or if both are deterministic (in
* which case both reduce equality to byte-equality; see CREATE COLLATION).
*/
bool
query_is_distinct_for(Query *query, List *colnos, List *opids)
static bool
query_is_distinct_for_with_collations(Query *query, List *distinct_cols)
{
ListCell *l;
Oid opid;
Assert(list_length(colnos) == list_length(opids));
DistinctColInfo *dcinfo;
/*
* DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
@ -1151,9 +1211,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
TargetEntry *tle = get_sortgroupclause_tle(sgc,
query->targetList);
opid = distinct_col_search(tle->resno, colnos, opids);
if (!OidIsValid(opid) ||
!equality_ops_are_compatible(opid, sgc->eqop))
dcinfo = distinct_col_search(tle->resno, distinct_cols);
if (dcinfo == NULL ||
!equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
!collations_agree_on_equality(dcinfo->collid,
exprCollation((Node *) tle->expr)))
break; /* exit early if no match */
}
if (l == NULL) /* had matches for all? */
@ -1182,9 +1244,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
TargetEntry *tle = get_sortgroupclause_tle(sgc,
query->targetList);
opid = distinct_col_search(tle->resno, colnos, opids);
if (!OidIsValid(opid) ||
!equality_ops_are_compatible(opid, sgc->eqop))
dcinfo = distinct_col_search(tle->resno, distinct_cols);
if (dcinfo == NULL ||
!equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
!collations_agree_on_equality(dcinfo->collid,
exprCollation((Node *) tle->expr)))
break; /* exit early if no match */
}
if (l == NULL) /* had matches for all? */
@ -1250,9 +1314,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
sgc = (SortGroupClause *) lfirst(lg);
lg = lnext(topop->groupClauses, lg);
opid = distinct_col_search(tle->resno, colnos, opids);
if (!OidIsValid(opid) ||
!equality_ops_are_compatible(opid, sgc->eqop))
dcinfo = distinct_col_search(tle->resno, distinct_cols);
if (dcinfo == NULL ||
!equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
!collations_agree_on_equality(dcinfo->collid,
exprCollation((Node *) tle->expr)))
break; /* exit early if no match */
}
if (l == NULL) /* had matches for all? */
@ -1272,24 +1338,23 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
}
/*
* distinct_col_search - subroutine for query_is_distinct_for
* distinct_col_search - subroutine for query_is_distinct_for_with_collations
*
* If colno is in colnos, return the corresponding element of opids,
* else return InvalidOid. (Ordinarily colnos would not contain duplicates,
* but if it does, we arbitrarily select the first match.)
* If colno matches the colno field of an entry in distinct_cols, return a
* pointer to that entry; else return NULL. (Ordinarily distinct_cols would
* not contain duplicate colnos, but if it does, we arbitrarily select the
* first match.)
*/
static Oid
distinct_col_search(int colno, List *colnos, List *opids)
static DistinctColInfo *
distinct_col_search(int colno, List *distinct_cols)
{
ListCell *lc1,
*lc2;
forboth(lc1, colnos, lc2, opids)
foreach_ptr(DistinctColInfo, dcinfo, distinct_cols)
{
if (colno == lfirst_int(lc1))
return lfirst_oid(lc2);
if (dcinfo->colno == colno)
return dcinfo;
}
return InvalidOid;
return NULL;
}

@ -1767,6 +1767,187 @@ ORDER BY 1;
ghi
(4 rows)
--
-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness
-- under a different collation, so the planner must not use such a proof for
-- any optimization.
--
-- Ensure that we do not use inner-unique join execution
EXPLAIN (VERBOSE, COSTS OFF)
SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
QUERY PLAN
---------------------------------------------------------------------------
Sort
Output: t1.x, test3cs.x
Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
-> Hash Join
Output: t1.x, test3cs.x
Hash Cond: ((t1.x)::text = (test3cs.x)::text)
-> Seq Scan on collate_tests.test1cs t1
Output: t1.x
-> Hash
Output: test3cs.x
-> HashAggregate
Output: test3cs.x
Group Key: test3cs.x
-> Seq Scan on collate_tests.test3cs
Output: test3cs.x
(15 rows)
SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
x | x
-----+-----
abc | abc
abc | ABC
ABC | abc
ABC | ABC
def | def
ghi | ghi
(6 rows)
-- Same with GROUP BY
EXPLAIN (VERBOSE, COSTS OFF)
SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
QUERY PLAN
---------------------------------------------------------------------------
Sort
Output: t1.x, test3cs.x
Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
-> Hash Join
Output: t1.x, test3cs.x
Hash Cond: ((t1.x)::text = (test3cs.x)::text)
-> Seq Scan on collate_tests.test1cs t1
Output: t1.x
-> Hash
Output: test3cs.x
-> HashAggregate
Output: test3cs.x
Group Key: test3cs.x
-> Seq Scan on collate_tests.test3cs
Output: test3cs.x
(15 rows)
SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
x | x
-----+-----
abc | abc
abc | ABC
ABC | abc
ABC | ABC
def | def
ghi | ghi
(6 rows)
-- Same with set-op
EXPLAIN (VERBOSE, COSTS OFF)
SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
QUERY PLAN
---------------------------------------------------------------------------
Sort
Output: t1.x, test3cs.x
Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
-> Hash Join
Output: t1.x, test3cs.x
Hash Cond: ((test3cs.x)::text = (t1.x)::text)
-> HashAggregate
Output: test3cs.x
Group Key: test3cs.x
-> Append
-> Seq Scan on collate_tests.test3cs
Output: test3cs.x
-> Seq Scan on collate_tests.test3cs test3cs_1
Output: test3cs_1.x
-> Hash
Output: t1.x
-> Seq Scan on collate_tests.test1cs t1
Output: t1.x
(18 rows)
SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
x | x
-----+-----
abc | abc
abc | ABC
ABC | abc
ABC | ABC
def | def
ghi | ghi
(6 rows)
-- Ensure that left-join is not removed
EXPLAIN (COSTS OFF)
SELECT t1.* FROM test3cs t1
LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
ORDER BY 1;
QUERY PLAN
-----------------------------------------------
Sort
Sort Key: t1.x COLLATE case_sensitive
-> Hash Left Join
Hash Cond: (t1.x = (test3cs.x)::text)
-> Seq Scan on test3cs t1
-> Hash
-> HashAggregate
Group Key: test3cs.x
-> Seq Scan on test3cs
(9 rows)
SELECT t1.* FROM test3cs t1
LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
ORDER BY 1;
x
-----
abc
abc
ABC
ABC
def
ghi
(6 rows)
-- Ensure that semijoin is not reduced to innerjoin
EXPLAIN (COSTS OFF)
SELECT * FROM test3cs t1
WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive)
ORDER BY 1;
QUERY PLAN
-------------------------------------------------------
Sort
Sort Key: t1.x COLLATE case_sensitive
-> Hash Semi Join
Hash Cond: ((t1.x)::text = (test3cs.x)::text)
-> Seq Scan on test3cs t1
-> Hash
-> HashAggregate
Group Key: test3cs.x
-> Seq Scan on test3cs
(9 rows)
SELECT * FROM test3cs t1
WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive)
ORDER BY 1;
x
-----
abc
ABC
def
ghi
(4 rows)
CREATE TABLE test1ci (x text COLLATE case_insensitive);
CREATE TABLE test2ci (x text COLLATE case_insensitive);
CREATE TABLE test3ci (x text COLLATE case_insensitive);

@ -653,6 +653,64 @@ SELECT * FROM test3cs t1
WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
ORDER BY 1;
--
-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness
-- under a different collation, so the planner must not use such a proof for
-- any optimization.
--
-- Ensure that we do not use inner-unique join execution
EXPLAIN (VERBOSE, COSTS OFF)
SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
-- Same with GROUP BY
EXPLAIN (VERBOSE, COSTS OFF)
SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
-- Same with set-op
EXPLAIN (VERBOSE, COSTS OFF)
SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive
ORDER BY 1, 2;
-- Ensure that left-join is not removed
EXPLAIN (COSTS OFF)
SELECT t1.* FROM test3cs t1
LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
ORDER BY 1;
SELECT t1.* FROM test3cs t1
LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
ORDER BY 1;
-- Ensure that semijoin is not reduced to innerjoin
EXPLAIN (COSTS OFF)
SELECT * FROM test3cs t1
WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive)
ORDER BY 1;
SELECT * FROM test3cs t1
WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
WHERE t1.x = t2.x COLLATE case_insensitive)
ORDER BY 1;
CREATE TABLE test1ci (x text COLLATE case_insensitive);
CREATE TABLE test2ci (x text COLLATE case_insensitive);
CREATE TABLE test3ci (x text COLLATE case_insensitive);

@ -648,6 +648,7 @@ DiscardMode
DiscardStmt
DispatchOption
DistanceValue
DistinctColInfo
DistinctExpr
DoState
DoStmt

Loading…
Cancel
Save