From bed3ffbf9d952be6c7d739d068cdce44c046dfb7 Mon Sep 17 00:00:00 2001 From: Richard Guo Date: Tue, 5 May 2026 10:27:06 +0900 Subject: [PATCH] Consider collation when proving subquery uniqueness rel_is_distinct_for()'s RTE_SUBQUERY branch passed only the equality operator from each join clause to query_is_distinct_for(), discarding the operator's input collation. query_is_distinct_for() then verified opfamily compatibility but never checked collations, so a DISTINCT / GROUP BY / set-op operating under one collation was trusted to prove uniqueness for a comparison performed under an unrelated collation. As with the recent fix in relation_has_unique_index_for(), this is unsound for nondeterministic collations and yields wrong query results in any optimization that consumes the proof. Fix by carrying each clause's operator input collation into query_is_distinct_for() and validating it at every check-site against the subquery target expression's collation. Back-patch to all supported branches. query_is_distinct_for() is declared in an installed header, so on stable branches the existing two-list signature is retained as a thin wrapper that forwards to a new collation-aware entry point; external callers continue to receive the historical collation-blind answer. Author: Richard Guo Reviewed-by: Tom Lane Discussion: https://postgr.es/m/CAMbWs4_XUUSTyzCaRjUeeahWNqi=8ZOA5Q4coi8zUVEDSBkM6A@mail.gmail.com Backpatch-through: 14 --- src/backend/optimizer/plan/analyzejoins.c | 177 +++++++++++------ .../regress/expected/collate.icu.utf8.out | 181 ++++++++++++++++++ src/test/regress/sql/collate.icu.utf8.sql | 58 ++++++ src/tools/pgindent/typedefs.list | 1 + 4 files changed, 361 insertions(+), 56 deletions(-) diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index 1ab9749b78c..9f546b4d3a6 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -34,6 +34,20 @@ #include "rewrite/rewriteManip.h" #include "utils/lsyscache.h" +/* + * One element of the list passed to query_is_distinct_for_with_collations(). + * Each entry names a subquery output column that the caller needs to be + * distinct over, plus the upper-level equality operator and its input + * collation, so that the subquery's own DISTINCT/GROUP BY/set-op clauses can + * be compared for compatibility. + */ +typedef struct DistinctColInfo +{ + int colno; /* subquery output column resno */ + Oid opid; /* upper-level equality operator */ + Oid collid; /* input collation of opid */ +} DistinctColInfo; + /* * Utility structure. A sorting procedure is needed to simplify the search * of SJE-candidate baserels referencing the same database relation. Having @@ -65,7 +79,9 @@ static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved); static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel); static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list, List **extra_clauses); -static Oid distinct_col_search(int colno, List *colnos, List *opids); +static bool query_is_distinct_for_with_collations(Query *query, + List *distinct_cols); +static DistinctColInfo *distinct_col_search(int colno, List *distinct_cols); static bool is_innerrel_unique_for(PlannerInfo *root, Relids joinrelids, Relids outerrelids, @@ -1018,15 +1034,17 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list, { Index relid = rel->relid; Query *subquery = root->simple_rte_array[relid]->subquery; - List *colnos = NIL; - List *opids = NIL; + List *distinct_cols = NIL; ListCell *l; /* - * Build the argument lists for query_is_distinct_for: a list of - * output column numbers that the query needs to be distinct over, and - * a list of equality operators that the output columns need to be - * distinct according to. + * Build the argument list for query_is_distinct_for_with_collations: + * a list of DistinctColInfo entries, each holding an output column + * number that the query needs to be distinct over, the equality + * operator that the column needs to be distinct according to, and + * that operator's input collation. The collation matters because the + * subquery's own DISTINCT / GROUP BY / set-op proves uniqueness under + * its own collation, which need not agree with the operator's. * * (XXX we are not considering restriction clauses attached to the * subquery; is that worth doing?) @@ -1034,18 +1052,18 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list, foreach(l, clause_list) { RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); - Oid op; + OpExpr *opexpr; Var *var; + DistinctColInfo *dcinfo; /* - * Get the equality operator we need uniqueness according to. - * (This might be a cross-type operator and thus not exactly the - * same operator the subquery would consider; that's all right - * since query_is_distinct_for can resolve such cases.) The - * caller's mergejoinability test should have selected only - * OpExprs. + * The caller's mergejoinability test should have selected only + * OpExprs. The operator might be a cross-type operator and thus + * not exactly the same operator the subquery would consider; + * that's all right since query_is_distinct_for_with_collations + * can resolve such cases. */ - op = castNode(OpExpr, rinfo->clause)->opno; + opexpr = castNode(OpExpr, rinfo->clause); /* caller identified the inner side for us */ if (rinfo->outer_is_left) @@ -1069,11 +1087,14 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list, var->varno != relid || var->varlevelsup != 0) continue; - colnos = lappend_int(colnos, var->varattno); - opids = lappend_oid(opids, op); + dcinfo = palloc(sizeof(DistinctColInfo)); + dcinfo->colno = var->varattno; + dcinfo->opid = opexpr->opno; + dcinfo->collid = opexpr->inputcollid; + distinct_cols = lappend(distinct_cols, dcinfo); } - if (query_is_distinct_for(subquery, colnos, opids)) + if (query_is_distinct_for_with_collations(subquery, distinct_cols)) return true; } return false; @@ -1111,31 +1132,70 @@ query_supports_distinctness(Query *query) } /* - * query_is_distinct_for - does query never return duplicates of the - * specified columns? + * query_is_distinct_for - ABI-preserving wrapper around + * query_is_distinct_for_with_collations(). + * + * The original signature took parallel colnos/opids lists and did not + * consider collations. External callers built against earlier minor + * releases continue to call it with the historical (collation-blind) + * semantics; we forward with InvalidOid collations, which makes the + * collation check a no-op (see collations_agree_on_equality()). + */ +bool +query_is_distinct_for(Query *query, List *colnos, List *opids) +{ + List *distinct_cols = NIL; + ListCell *lc1; + ListCell *lc2; + + Assert(list_length(colnos) == list_length(opids)); + + forboth(lc1, colnos, lc2, opids) + { + DistinctColInfo *dcinfo = palloc(sizeof(DistinctColInfo)); + + dcinfo->colno = lfirst_int(lc1); + dcinfo->opid = lfirst_oid(lc2); + dcinfo->collid = InvalidOid; + distinct_cols = lappend(distinct_cols, dcinfo); + } + + return query_is_distinct_for_with_collations(query, distinct_cols); +} + +/* + * query_is_distinct_for_with_collations - does query never return duplicates + * of the specified columns? * * query is a not-yet-planned subquery (in current usage, it's always from * a subquery RTE, which the planner avoids scribbling on). * - * colnos is an integer list of output column numbers (resno's). We are - * interested in whether rows consisting of just these columns are certain - * to be distinct. "Distinctness" is defined according to whether the - * corresponding upper-level equality operators listed in opids would think - * the values are distinct. (Note: the opids entries could be cross-type - * operators, and thus not exactly the equality operators that the subquery - * would use itself. We use equality_ops_are_compatible() to check - * compatibility. That looks at opfamily membership for index AMs that have - * declared that they support consistent equality semantics within an + * distinct_cols is a list of DistinctColInfo, one per requested output column. + * Each entry names the subquery output column number we want distinct, the + * upper-level equality operator we'll compare values with, and that operator's + * input collation. We are interested in whether rows consisting of just these + * columns are certain to be distinct. + * + * "Distinctness" is defined according to whether the corresponding upper-level + * equality operators would think the values are distinct. (Note: each opid + * could be a cross-type operator, and thus not exactly the equality operator + * that the subquery would use itself. We use equality_ops_are_compatible() to + * check compatibility. That looks at opfamily membership for index AMs that + * have declared that they support consistent equality semantics within an * opfamily, and so should give trustworthy answers for all operators that we * might need to deal with here.) + * + * The collid must also agree on equality with the collation the subquery's own + * DISTINCT/GROUP BY/set-op uses to deduplicate the column, else the subquery's + * distinctness does not carry over to the caller's equality semantics. Two + * collations agree on equality if they match or if both are deterministic (in + * which case both reduce equality to byte-equality; see CREATE COLLATION). */ -bool -query_is_distinct_for(Query *query, List *colnos, List *opids) +static bool +query_is_distinct_for_with_collations(Query *query, List *distinct_cols) { ListCell *l; - Oid opid; - - Assert(list_length(colnos) == list_length(opids)); + DistinctColInfo *dcinfo; /* * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the @@ -1151,9 +1211,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) TargetEntry *tle = get_sortgroupclause_tle(sgc, query->targetList); - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) + dcinfo = distinct_col_search(tle->resno, distinct_cols); + if (dcinfo == NULL || + !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) || + !collations_agree_on_equality(dcinfo->collid, + exprCollation((Node *) tle->expr))) break; /* exit early if no match */ } if (l == NULL) /* had matches for all? */ @@ -1182,9 +1244,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) TargetEntry *tle = get_sortgroupclause_tle(sgc, query->targetList); - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) + dcinfo = distinct_col_search(tle->resno, distinct_cols); + if (dcinfo == NULL || + !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) || + !collations_agree_on_equality(dcinfo->collid, + exprCollation((Node *) tle->expr))) break; /* exit early if no match */ } if (l == NULL) /* had matches for all? */ @@ -1250,9 +1314,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) sgc = (SortGroupClause *) lfirst(lg); lg = lnext(topop->groupClauses, lg); - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) + dcinfo = distinct_col_search(tle->resno, distinct_cols); + if (dcinfo == NULL || + !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) || + !collations_agree_on_equality(dcinfo->collid, + exprCollation((Node *) tle->expr))) break; /* exit early if no match */ } if (l == NULL) /* had matches for all? */ @@ -1272,24 +1338,23 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) } /* - * distinct_col_search - subroutine for query_is_distinct_for + * distinct_col_search - subroutine for query_is_distinct_for_with_collations * - * If colno is in colnos, return the corresponding element of opids, - * else return InvalidOid. (Ordinarily colnos would not contain duplicates, - * but if it does, we arbitrarily select the first match.) + * If colno matches the colno field of an entry in distinct_cols, return a + * pointer to that entry; else return NULL. (Ordinarily distinct_cols would + * not contain duplicate colnos, but if it does, we arbitrarily select the + * first match.) */ -static Oid -distinct_col_search(int colno, List *colnos, List *opids) +static DistinctColInfo * +distinct_col_search(int colno, List *distinct_cols) { - ListCell *lc1, - *lc2; - - forboth(lc1, colnos, lc2, opids) + foreach_ptr(DistinctColInfo, dcinfo, distinct_cols) { - if (colno == lfirst_int(lc1)) - return lfirst_oid(lc2); + if (dcinfo->colno == colno) + return dcinfo; } - return InvalidOid; + + return NULL; } diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 779240521dc..17876844a45 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1767,6 +1767,187 @@ ORDER BY 1; ghi (4 rows) +-- +-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness +-- under a different collation, so the planner must not use such a proof for +-- any optimization. +-- +-- Ensure that we do not use inner-unique join execution +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Output: t1.x, test3cs.x + Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive + -> Hash Join + Output: t1.x, test3cs.x + Hash Cond: ((t1.x)::text = (test3cs.x)::text) + -> Seq Scan on collate_tests.test1cs t1 + Output: t1.x + -> Hash + Output: test3cs.x + -> HashAggregate + Output: test3cs.x + Group Key: test3cs.x + -> Seq Scan on collate_tests.test3cs + Output: test3cs.x +(15 rows) + +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + x | x +-----+----- + abc | abc + abc | ABC + ABC | abc + ABC | ABC + def | def + ghi | ghi +(6 rows) + +-- Same with GROUP BY +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Output: t1.x, test3cs.x + Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive + -> Hash Join + Output: t1.x, test3cs.x + Hash Cond: ((t1.x)::text = (test3cs.x)::text) + -> Seq Scan on collate_tests.test1cs t1 + Output: t1.x + -> Hash + Output: test3cs.x + -> HashAggregate + Output: test3cs.x + Group Key: test3cs.x + -> Seq Scan on collate_tests.test3cs + Output: test3cs.x +(15 rows) + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + x | x +-----+----- + abc | abc + abc | ABC + ABC | abc + ABC | ABC + def | def + ghi | ghi +(6 rows) + +-- Same with set-op +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Output: t1.x, test3cs.x + Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive + -> Hash Join + Output: t1.x, test3cs.x + Hash Cond: ((test3cs.x)::text = (t1.x)::text) + -> HashAggregate + Output: test3cs.x + Group Key: test3cs.x + -> Append + -> Seq Scan on collate_tests.test3cs + Output: test3cs.x + -> Seq Scan on collate_tests.test3cs test3cs_1 + Output: test3cs_1.x + -> Hash + Output: t1.x + -> Seq Scan on collate_tests.test1cs t1 + Output: t1.x +(18 rows) + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + x | x +-----+----- + abc | abc + abc | ABC + ABC | abc + ABC | ABC + def | def + ghi | ghi +(6 rows) + +-- Ensure that left-join is not removed +EXPLAIN (COSTS OFF) +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + QUERY PLAN +----------------------------------------------- + Sort + Sort Key: t1.x COLLATE case_sensitive + -> Hash Left Join + Hash Cond: (t1.x = (test3cs.x)::text) + -> Seq Scan on test3cs t1 + -> Hash + -> HashAggregate + Group Key: test3cs.x + -> Seq Scan on test3cs +(9 rows) + +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + x +----- + abc + abc + ABC + ABC + def + ghi +(6 rows) + +-- Ensure that semijoin is not reduced to innerjoin +EXPLAIN (COSTS OFF) +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + QUERY PLAN +------------------------------------------------------- + Sort + Sort Key: t1.x COLLATE case_sensitive + -> Hash Semi Join + Hash Cond: ((t1.x)::text = (test3cs.x)::text) + -> Seq Scan on test3cs t1 + -> Hash + -> HashAggregate + Group Key: test3cs.x + -> Seq Scan on test3cs +(9 rows) + +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + x +----- + abc + ABC + def + ghi +(4 rows) + CREATE TABLE test1ci (x text COLLATE case_insensitive); CREATE TABLE test2ci (x text COLLATE case_insensitive); CREATE TABLE test3ci (x text COLLATE case_insensitive); diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 2a5b76c6c8c..7488624401b 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -653,6 +653,64 @@ SELECT * FROM test3cs t1 WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive) ORDER BY 1; +-- +-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness +-- under a different collation, so the planner must not use such a proof for +-- any optimization. +-- + +-- Ensure that we do not use inner-unique join execution +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +-- Same with GROUP BY +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +-- Same with set-op +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +-- Ensure that left-join is not removed +EXPLAIN (COSTS OFF) +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + +-- Ensure that semijoin is not reduced to innerjoin +EXPLAIN (COSTS OFF) +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + CREATE TABLE test1ci (x text COLLATE case_insensitive); CREATE TABLE test2ci (x text COLLATE case_insensitive); CREATE TABLE test3ci (x text COLLATE case_insensitive); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 817af488a0d..c2fef51cd7b 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -648,6 +648,7 @@ DiscardMode DiscardStmt DispatchOption DistanceValue +DistinctColInfo DistinctExpr DoState DoStmt