diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index 8b4163f1d67..0e6969c5d5a 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -32,6 +32,20 @@ #include "optimizer/tlist.h" #include "utils/lsyscache.h" +/* + * One element of the list passed to query_is_distinct_for_with_collations(). + * Each entry names a subquery output column that the caller needs to be + * distinct over, plus the upper-level equality operator and its input + * collation, so that the subquery's own DISTINCT/GROUP BY/set-op clauses can + * be compared for compatibility. + */ +typedef struct DistinctColInfo +{ + int colno; /* subquery output column resno */ + Oid opid; /* upper-level equality operator */ + Oid collid; /* input collation of opid */ +} DistinctColInfo; + /* local functions */ static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo); static void remove_rel_from_query(PlannerInfo *root, int relid, @@ -40,7 +54,9 @@ static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved); static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel); static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list); -static Oid distinct_col_search(int colno, List *colnos, List *opids); +static bool query_is_distinct_for_with_collations(Query *query, + List *distinct_cols); +static DistinctColInfo *distinct_col_search(int colno, List *distinct_cols); static bool is_innerrel_unique_for(PlannerInfo *root, Relids joinrelids, Relids outerrelids, @@ -660,15 +676,17 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list) { Index relid = rel->relid; Query *subquery = root->simple_rte_array[relid]->subquery; - List *colnos = NIL; - List *opids = NIL; + List *distinct_cols = NIL; ListCell *l; /* - * Build the argument lists for query_is_distinct_for: a list of - * output column numbers that the query needs to be distinct over, and - * a list of equality operators that the output columns need to be - * distinct according to. + * Build the argument list for query_is_distinct_for_with_collations: + * a list of DistinctColInfo entries, each holding an output column + * number that the query needs to be distinct over, the equality + * operator that the column needs to be distinct according to, and + * that operator's input collation. The collation matters because the + * subquery's own DISTINCT / GROUP BY / set-op proves uniqueness under + * its own collation, which need not agree with the operator's. * * (XXX we are not considering restriction clauses attached to the * subquery; is that worth doing?) @@ -676,18 +694,18 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list) foreach(l, clause_list) { RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); - Oid op; + OpExpr *opexpr; Var *var; + DistinctColInfo *dcinfo; /* - * Get the equality operator we need uniqueness according to. - * (This might be a cross-type operator and thus not exactly the - * same operator the subquery would consider; that's all right - * since query_is_distinct_for can resolve such cases.) The - * caller's mergejoinability test should have selected only - * OpExprs. + * The caller's mergejoinability test should have selected only + * OpExprs. The operator might be a cross-type operator and thus + * not exactly the same operator the subquery would consider; + * that's all right since query_is_distinct_for_with_collations + * can resolve such cases. */ - op = castNode(OpExpr, rinfo->clause)->opno; + opexpr = castNode(OpExpr, rinfo->clause); /* caller identified the inner side for us */ if (rinfo->outer_is_left) @@ -711,11 +729,14 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list) var->varno != relid || var->varlevelsup != 0) continue; - colnos = lappend_int(colnos, var->varattno); - opids = lappend_oid(opids, op); + dcinfo = palloc(sizeof(DistinctColInfo)); + dcinfo->colno = var->varattno; + dcinfo->opid = opexpr->opno; + dcinfo->collid = opexpr->inputcollid; + distinct_cols = lappend(distinct_cols, dcinfo); } - if (query_is_distinct_for(subquery, colnos, opids)) + if (query_is_distinct_for_with_collations(subquery, distinct_cols)) return true; } return false; @@ -753,31 +774,71 @@ query_supports_distinctness(Query *query) } /* - * query_is_distinct_for - does query never return duplicates of the - * specified columns? + * query_is_distinct_for - ABI-preserving wrapper around + * query_is_distinct_for_with_collations(). * - * query is a not-yet-planned subquery (in current usage, it's always from - * a subquery RTE, which the planner avoids scribbling on). - * - * colnos is an integer list of output column numbers (resno's). We are - * interested in whether rows consisting of just these columns are certain - * to be distinct. "Distinctness" is defined according to whether the - * corresponding upper-level equality operators listed in opids would think - * the values are distinct. (Note: the opids entries could be cross-type - * operators, and thus not exactly the equality operators that the subquery - * would use itself. We use equality_ops_are_compatible() to check - * compatibility. That looks at btree or hash opfamily membership, and so - * should give trustworthy answers for all operators that we might need - * to deal with here.) + * The original signature took parallel colnos/opids lists and did not + * consider collations. External callers built against earlier minor + * releases continue to call it with the historical (collation-blind) + * semantics; we forward with InvalidOid collations, which makes the + * collation check a no-op (see collations_agree_on_equality()). */ bool query_is_distinct_for(Query *query, List *colnos, List *opids) { - ListCell *l; - Oid opid; + List *distinct_cols = NIL; + ListCell *lc1; + ListCell *lc2; Assert(list_length(colnos) == list_length(opids)); + forboth(lc1, colnos, lc2, opids) + { + DistinctColInfo *dcinfo = palloc(sizeof(DistinctColInfo)); + + dcinfo->colno = lfirst_int(lc1); + dcinfo->opid = lfirst_oid(lc2); + dcinfo->collid = InvalidOid; + distinct_cols = lappend(distinct_cols, dcinfo); + } + + return query_is_distinct_for_with_collations(query, distinct_cols); +} + +/* + * query_is_distinct_for_with_collations - does query never return duplicates + * of the specified columns? + * + * query is a not-yet-planned subquery (in current usage, it's always from + * a subquery RTE, which the planner avoids scribbling on). + * + * distinct_cols is a list of DistinctColInfo, one per requested output column. + * Each entry names the subquery output column number we want distinct, the + * upper-level equality operator we'll compare values with, and that operator's + * input collation. We are interested in whether rows consisting of just these + * columns are certain to be distinct. + * + * "Distinctness" is defined according to whether the corresponding upper-level + * equality operators would think the values are distinct. (Note: each opid + * could be a cross-type operator, and thus not exactly the equality operator + * that the subquery would use itself. We use equality_ops_are_compatible() to + * check compatibility. That looks at opfamily membership for index AMs that + * have declared that they support consistent equality semantics within an + * opfamily, and so should give trustworthy answers for all operators that we + * might need to deal with here.) + * + * The collid must also agree on equality with the collation the subquery's own + * DISTINCT/GROUP BY/set-op uses to deduplicate the column, else the subquery's + * distinctness does not carry over to the caller's equality semantics. Two + * collations agree on equality if they match or if both are deterministic (in + * which case both reduce equality to byte-equality; see CREATE COLLATION). + */ +static bool +query_is_distinct_for_with_collations(Query *query, List *distinct_cols) +{ + ListCell *l; + DistinctColInfo *dcinfo; + /* * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the * columns in the DISTINCT clause appear in colnos and operator semantics @@ -792,9 +853,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) TargetEntry *tle = get_sortgroupclause_tle(sgc, query->targetList); - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) + dcinfo = distinct_col_search(tle->resno, distinct_cols); + if (dcinfo == NULL || + !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) || + !collations_agree_on_equality(dcinfo->collid, + exprCollation((Node *) tle->expr))) break; /* exit early if no match */ } if (l == NULL) /* had matches for all? */ @@ -823,9 +886,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) TargetEntry *tle = get_sortgroupclause_tle(sgc, query->targetList); - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) + dcinfo = distinct_col_search(tle->resno, distinct_cols); + if (dcinfo == NULL || + !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) || + !collations_agree_on_equality(dcinfo->collid, + exprCollation((Node *) tle->expr))) break; /* exit early if no match */ } if (l == NULL) /* had matches for all? */ @@ -891,9 +956,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) sgc = (SortGroupClause *) lfirst(lg); lg = lnext(topop->groupClauses, lg); - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) + dcinfo = distinct_col_search(tle->resno, distinct_cols); + if (dcinfo == NULL || + !equality_ops_are_compatible(dcinfo->opid, sgc->eqop) || + !collations_agree_on_equality(dcinfo->collid, + exprCollation((Node *) tle->expr))) break; /* exit early if no match */ } if (l == NULL) /* had matches for all? */ @@ -913,24 +980,27 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) } /* - * distinct_col_search - subroutine for query_is_distinct_for + * distinct_col_search - subroutine for query_is_distinct_for_with_collations * - * If colno is in colnos, return the corresponding element of opids, - * else return InvalidOid. (Ordinarily colnos would not contain duplicates, - * but if it does, we arbitrarily select the first match.) + * If colno matches the colno field of an entry in distinct_cols, return a + * pointer to that entry; else return NULL. (Ordinarily distinct_cols would + * not contain duplicate colnos, but if it does, we arbitrarily select the + * first match.) */ -static Oid -distinct_col_search(int colno, List *colnos, List *opids) +static DistinctColInfo * +distinct_col_search(int colno, List *distinct_cols) { - ListCell *lc1, - *lc2; + ListCell *lc; - forboth(lc1, colnos, lc2, opids) + foreach(lc, distinct_cols) { - if (colno == lfirst_int(lc1)) - return lfirst_oid(lc2); + DistinctColInfo *dcinfo = (DistinctColInfo *) lfirst(lc); + + if (dcinfo->colno == colno) + return dcinfo; } - return InvalidOid; + + return NULL; } diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index c7d7122bd2b..4179fc1b78d 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1512,6 +1512,187 @@ ORDER BY 1; ghi (4 rows) +-- +-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness +-- under a different collation, so the planner must not use such a proof for +-- any optimization. +-- +-- Ensure that we do not use inner-unique join execution +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Output: t1.x, test3cs.x + Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive + -> Hash Join + Output: t1.x, test3cs.x + Hash Cond: ((t1.x)::text = (test3cs.x)::text) + -> Seq Scan on collate_tests.test1cs t1 + Output: t1.x + -> Hash + Output: test3cs.x + -> HashAggregate + Output: test3cs.x + Group Key: test3cs.x + -> Seq Scan on collate_tests.test3cs + Output: test3cs.x +(15 rows) + +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + x | x +-----+----- + abc | abc + abc | ABC + ABC | abc + ABC | ABC + def | def + ghi | ghi +(6 rows) + +-- Same with GROUP BY +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Output: t1.x, test3cs.x + Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive + -> Hash Join + Output: t1.x, test3cs.x + Hash Cond: ((t1.x)::text = (test3cs.x)::text) + -> Seq Scan on collate_tests.test1cs t1 + Output: t1.x + -> Hash + Output: test3cs.x + -> HashAggregate + Output: test3cs.x + Group Key: test3cs.x + -> Seq Scan on collate_tests.test3cs + Output: test3cs.x +(15 rows) + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + x | x +-----+----- + abc | abc + abc | ABC + ABC | abc + ABC | ABC + def | def + ghi | ghi +(6 rows) + +-- Same with set-op +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Output: t1.x, test3cs.x + Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive + -> Hash Join + Output: t1.x, test3cs.x + Hash Cond: ((test3cs.x)::text = (t1.x)::text) + -> HashAggregate + Output: test3cs.x + Group Key: test3cs.x + -> Append + -> Seq Scan on collate_tests.test3cs + Output: test3cs.x + -> Seq Scan on collate_tests.test3cs test3cs_1 + Output: test3cs_1.x + -> Hash + Output: t1.x + -> Seq Scan on collate_tests.test1cs t1 + Output: t1.x +(18 rows) + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + x | x +-----+----- + abc | abc + abc | ABC + ABC | abc + ABC | ABC + def | def + ghi | ghi +(6 rows) + +-- Ensure that left-join is not removed +EXPLAIN (COSTS OFF) +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + QUERY PLAN +----------------------------------------------- + Sort + Sort Key: t1.x COLLATE case_sensitive + -> Hash Left Join + Hash Cond: (t1.x = (test3cs.x)::text) + -> Seq Scan on test3cs t1 + -> Hash + -> HashAggregate + Group Key: test3cs.x + -> Seq Scan on test3cs +(9 rows) + +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + x +----- + abc + abc + ABC + ABC + def + ghi +(6 rows) + +-- Ensure that semijoin is not reduced to innerjoin +EXPLAIN (COSTS OFF) +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + QUERY PLAN +------------------------------------------------------- + Sort + Sort Key: t1.x COLLATE case_sensitive + -> Hash Semi Join + Hash Cond: ((t1.x)::text = (test3cs.x)::text) + -> Seq Scan on test3cs t1 + -> Hash + -> HashAggregate + Group Key: test3cs.x + -> Seq Scan on test3cs +(9 rows) + +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + x +----- + abc + ABC + def + ghi +(4 rows) + CREATE TABLE test1ci (x text COLLATE case_insensitive); CREATE TABLE test2ci (x text COLLATE case_insensitive); CREATE TABLE test3ci (x text COLLATE case_insensitive); diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index a0f140eb090..537a49fb43c 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -571,6 +571,64 @@ SELECT * FROM test3cs t1 WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive) ORDER BY 1; +-- +-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness +-- under a different collation, so the planner must not use such a proof for +-- any optimization. +-- + +-- Ensure that we do not use inner-unique join execution +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +-- Same with GROUP BY +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +-- Same with set-op +EXPLAIN (VERBOSE, COSTS OFF) +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2 +WHERE t1.x = t2.x COLLATE case_insensitive +ORDER BY 1, 2; + +-- Ensure that left-join is not removed +EXPLAIN (COSTS OFF) +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + +SELECT t1.* FROM test3cs t1 + LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive +ORDER BY 1; + +-- Ensure that semijoin is not reduced to innerjoin +EXPLAIN (COSTS OFF) +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + +SELECT * FROM test3cs t1 + WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2 + WHERE t1.x = t2.x COLLATE case_insensitive) +ORDER BY 1; + CREATE TABLE test1ci (x text COLLATE case_insensitive); CREATE TABLE test2ci (x text COLLATE case_insensitive); CREATE TABLE test3ci (x text COLLATE case_insensitive); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 863d1fa8c39..ce192fcf4d0 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -557,6 +557,7 @@ DisableTimeoutParams DiscardMode DiscardStmt DistanceValue +DistinctColInfo DistinctExpr DoStmt DocRepresentation