Consider collation when proving subquery uniqueness

rel_is_distinct_for()'s RTE_SUBQUERY branch passed only the equality operator from each join clause to query_is_distinct_for(), discarding the operator's input collation. query_is_distinct_for() then verified opfamily compatibility but never checked collations, so a DISTINCT / GROUP BY / set-op operating under one collation was trusted to prove uniqueness for a comparison performed under an unrelated collation. As with the recent fix in relation_has_unique_index_for(), this is unsound for nondeterministic collations and yields wrong query results in any optimization that consumes the proof. Fix by carrying each clause's operator input collation into query_is_distinct_for() and validating it at every check-site against the subquery target expression's collation. Back-patch to all supported branches. query_is_distinct_for() is declared in an installed header, so on stable branches the existing two-list signature is retained as a thin wrapper that forwards to a new collation-aware entry point; external callers continue to receive the historical collation-blind answer. Author: Richard Guo <guofenglinux@gmail.com> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://postgr.es/m/CAMbWs4_XUUSTyzCaRjUeeahWNqi=8ZOA5Q4coi8zUVEDSBkM6A@mail.gmail.com Backpatch-through: 14
3 days ago · bed3ffbf9d
parent b62f514ac5
commit bed3ffbf9d
4 changed files with 361 additions and 56 deletions
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@ -34,6 +34,20 @@
 #include "rewrite/rewriteManip.h"
 #include "utils/lsyscache.h"

+/*
+ * One element of the list passed to query_is_distinct_for_with_collations().
+ * Each entry names a subquery output column that the caller needs to be
+ * distinct over, plus the upper-level equality operator and its input
+ * collation, so that the subquery's own DISTINCT/GROUP BY/set-op clauses can
+ * be compared for compatibility.
+ */
+typedef struct DistinctColInfo
+{
+	int			colno;			/* subquery output column resno */
+	Oid			opid;			/* upper-level equality operator */
+	Oid			collid;			/* input collation of opid */
+} DistinctColInfo;
+
 /*
 * Utility structure.  A sorting procedure is needed to simplify the search
 * of SJE-candidate baserels referencing the same database relation.  Having
@ -65,7 +79,9 @@ static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved);
 static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel);
 static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel,
 								List *clause_list, List **extra_clauses);
-static Oid	distinct_col_search(int colno, List *colnos, List *opids);
+static bool query_is_distinct_for_with_collations(Query *query,
+												  List *distinct_cols);
+static DistinctColInfo *distinct_col_search(int colno, List *distinct_cols);
 static bool is_innerrel_unique_for(PlannerInfo *root,
 								   Relids joinrelids,
 								   Relids outerrelids,
@ -1018,15 +1034,17 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
 	{
 		Index		relid = rel->relid;
 		Query	   *subquery = root->simple_rte_array[relid]->subquery;
-		List	   *colnos = NIL;
-		List	   *opids = NIL;
+		List	   *distinct_cols = NIL;
 		ListCell   *l;

 		/*
-		 * Build the argument lists for query_is_distinct_for: a list of
-		 * output column numbers that the query needs to be distinct over, and
-		 * a list of equality operators that the output columns need to be
-		 * distinct according to.
+		 * Build the argument list for query_is_distinct_for_with_collations:
+		 * a list of DistinctColInfo entries, each holding an output column
+		 * number that the query needs to be distinct over, the equality
+		 * operator that the column needs to be distinct according to, and
+		 * that operator's input collation.  The collation matters because the
+		 * subquery's own DISTINCT / GROUP BY / set-op proves uniqueness under
+		 * its own collation, which need not agree with the operator's.
 		 *
 		 * (XXX we are not considering restriction clauses attached to the
 		 * subquery; is that worth doing?)
@ -1034,18 +1052,18 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
 		foreach(l, clause_list)
 		{
 			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
-			Oid			op;
+			OpExpr	   *opexpr;
 			Var		   *var;
+			DistinctColInfo *dcinfo;

 			/*
-			 * Get the equality operator we need uniqueness according to.
-			 * (This might be a cross-type operator and thus not exactly the
-			 * same operator the subquery would consider; that's all right
-			 * since query_is_distinct_for can resolve such cases.)  The
-			 * caller's mergejoinability test should have selected only
-			 * OpExprs.
+			 * The caller's mergejoinability test should have selected only
+			 * OpExprs.  The operator might be a cross-type operator and thus
+			 * not exactly the same operator the subquery would consider;
+			 * that's all right since query_is_distinct_for_with_collations
+			 * can resolve such cases.
 			 */
-			op = castNode(OpExpr, rinfo->clause)->opno;
+			opexpr = castNode(OpExpr, rinfo->clause);

 			/* caller identified the inner side for us */
 			if (rinfo->outer_is_left)
@ -1069,11 +1087,14 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list,
 				var->varno != relid || var->varlevelsup != 0)
 				continue;

-			colnos = lappend_int(colnos, var->varattno);
-			opids = lappend_oid(opids, op);
+			dcinfo = palloc(sizeof(DistinctColInfo));
+			dcinfo->colno = var->varattno;
+			dcinfo->opid = opexpr->opno;
+			dcinfo->collid = opexpr->inputcollid;
+			distinct_cols = lappend(distinct_cols, dcinfo);
 		}

-		if (query_is_distinct_for(subquery, colnos, opids))
+		if (query_is_distinct_for_with_collations(subquery, distinct_cols))
 			return true;
 	}
 	return false;
@ -1111,31 +1132,70 @@ query_supports_distinctness(Query *query)
 }

 /*
- * query_is_distinct_for - does query never return duplicates of the
- *		specified columns?
+ * query_is_distinct_for - ABI-preserving wrapper around
+ *		query_is_distinct_for_with_collations().
+ *
+ * The original signature took parallel colnos/opids lists and did not
+ * consider collations.  External callers built against earlier minor
+ * releases continue to call it with the historical (collation-blind)
+ * semantics; we forward with InvalidOid collations, which makes the
+ * collation check a no-op (see collations_agree_on_equality()).
+ */
+bool
+query_is_distinct_for(Query *query, List *colnos, List *opids)
+{
+	List	   *distinct_cols = NIL;
+	ListCell   *lc1;
+	ListCell   *lc2;
+
+	Assert(list_length(colnos) == list_length(opids));
+
+	forboth(lc1, colnos, lc2, opids)
+	{
+		DistinctColInfo *dcinfo = palloc(sizeof(DistinctColInfo));
+
+		dcinfo->colno = lfirst_int(lc1);
+		dcinfo->opid = lfirst_oid(lc2);
+		dcinfo->collid = InvalidOid;
+		distinct_cols = lappend(distinct_cols, dcinfo);
+	}
+
+	return query_is_distinct_for_with_collations(query, distinct_cols);
+}
+
+/*
+ * query_is_distinct_for_with_collations - does query never return duplicates
+ *		of the specified columns?
 *
 * query is a not-yet-planned subquery (in current usage, it's always from
 * a subquery RTE, which the planner avoids scribbling on).
 *
- * colnos is an integer list of output column numbers (resno's).  We are
- * interested in whether rows consisting of just these columns are certain
- * to be distinct.  "Distinctness" is defined according to whether the
- * corresponding upper-level equality operators listed in opids would think
- * the values are distinct.  (Note: the opids entries could be cross-type
- * operators, and thus not exactly the equality operators that the subquery
- * would use itself.  We use equality_ops_are_compatible() to check
- * compatibility.  That looks at opfamily membership for index AMs that have
- * declared that they support consistent equality semantics within an
+ * distinct_cols is a list of DistinctColInfo, one per requested output column.
+ * Each entry names the subquery output column number we want distinct, the
+ * upper-level equality operator we'll compare values with, and that operator's
+ * input collation.  We are interested in whether rows consisting of just these
+ * columns are certain to be distinct.
+ *
+ * "Distinctness" is defined according to whether the corresponding upper-level
+ * equality operators would think the values are distinct.  (Note: each opid
+ * could be a cross-type operator, and thus not exactly the equality operator
+ * that the subquery would use itself.  We use equality_ops_are_compatible() to
+ * check compatibility.  That looks at opfamily membership for index AMs that
+ * have declared that they support consistent equality semantics within an
 * opfamily, and so should give trustworthy answers for all operators that we
 * might need to deal with here.)
+ *
+ * The collid must also agree on equality with the collation the subquery's own
+ * DISTINCT/GROUP BY/set-op uses to deduplicate the column, else the subquery's
+ * distinctness does not carry over to the caller's equality semantics.  Two
+ * collations agree on equality if they match or if both are deterministic (in
+ * which case both reduce equality to byte-equality; see CREATE COLLATION).
 */
-bool
-query_is_distinct_for(Query *query, List *colnos, List *opids)
+static bool
+query_is_distinct_for_with_collations(Query *query, List *distinct_cols)
 {
 	ListCell   *l;
-	Oid			opid;
-
-	Assert(list_length(colnos) == list_length(opids));
+	DistinctColInfo *dcinfo;

 	/*
 	 * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
@ -1151,9 +1211,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 			TargetEntry *tle = get_sortgroupclause_tle(sgc,
 													   query->targetList);

-			opid = distinct_col_search(tle->resno, colnos, opids);
-			if (!OidIsValid(opid) ||
-				!equality_ops_are_compatible(opid, sgc->eqop))
+			dcinfo = distinct_col_search(tle->resno, distinct_cols);
+			if (dcinfo == NULL ||
+				!equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
+				!collations_agree_on_equality(dcinfo->collid,
+											  exprCollation((Node *) tle->expr)))
 				break;			/* exit early if no match */
 		}
 		if (l == NULL)			/* had matches for all? */
@ -1182,9 +1244,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 			TargetEntry *tle = get_sortgroupclause_tle(sgc,
 													   query->targetList);

-			opid = distinct_col_search(tle->resno, colnos, opids);
-			if (!OidIsValid(opid) ||
-				!equality_ops_are_compatible(opid, sgc->eqop))
+			dcinfo = distinct_col_search(tle->resno, distinct_cols);
+			if (dcinfo == NULL ||
+				!equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
+				!collations_agree_on_equality(dcinfo->collid,
+											  exprCollation((Node *) tle->expr)))
 				break;			/* exit early if no match */
 		}
 		if (l == NULL)			/* had matches for all? */
@ -1250,9 +1314,11 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 				sgc = (SortGroupClause *) lfirst(lg);
 				lg = lnext(topop->groupClauses, lg);

-				opid = distinct_col_search(tle->resno, colnos, opids);
-				if (!OidIsValid(opid) ||
-					!equality_ops_are_compatible(opid, sgc->eqop))
+				dcinfo = distinct_col_search(tle->resno, distinct_cols);
+				if (dcinfo == NULL ||
+					!equality_ops_are_compatible(dcinfo->opid, sgc->eqop) ||
+					!collations_agree_on_equality(dcinfo->collid,
+												  exprCollation((Node *) tle->expr)))
 					break;		/* exit early if no match */
 			}
 			if (l == NULL)		/* had matches for all? */
@ -1272,24 +1338,23 @@ query_is_distinct_for(Query *query, List *colnos, List *opids)
 }

 /*
- * distinct_col_search - subroutine for query_is_distinct_for
+ * distinct_col_search - subroutine for query_is_distinct_for_with_collations
 *
- * If colno is in colnos, return the corresponding element of opids,
- * else return InvalidOid.  (Ordinarily colnos would not contain duplicates,
- * but if it does, we arbitrarily select the first match.)
+ * If colno matches the colno field of an entry in distinct_cols, return a
+ * pointer to that entry; else return NULL.  (Ordinarily distinct_cols would
+ * not contain duplicate colnos, but if it does, we arbitrarily select the
+ * first match.)
 */
-static Oid
-distinct_col_search(int colno, List *colnos, List *opids)
+static DistinctColInfo *
+distinct_col_search(int colno, List *distinct_cols)
 {
-	ListCell   *lc1,
-			   *lc2;
-
-	forboth(lc1, colnos, lc2, opids)
+	foreach_ptr(DistinctColInfo, dcinfo, distinct_cols)
 	{
-		if (colno == lfirst_int(lc1))
-			return lfirst_oid(lc2);
+		if (dcinfo->colno == colno)
+			return dcinfo;
 	}
-	return InvalidOid;
+
+	return NULL;
 }


--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@ -1767,6 +1767,187 @@ ORDER BY 1;
 ghi
 (4 rows)

+--
+-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness
+-- under a different collation, so the planner must not use such a proof for
+-- any optimization.
+--
+-- Ensure that we do not use inner-unique join execution
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ Sort
+   Output: t1.x, test3cs.x
+   Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, test3cs.x
+         Hash Cond: ((t1.x)::text = (test3cs.x)::text)
+         ->  Seq Scan on collate_tests.test1cs t1
+               Output: t1.x
+         ->  Hash
+               Output: test3cs.x
+               ->  HashAggregate
+                     Output: test3cs.x
+                     Group Key: test3cs.x
+                     ->  Seq Scan on collate_tests.test3cs
+                           Output: test3cs.x
+(15 rows)
+
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Same with GROUP BY
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ Sort
+   Output: t1.x, test3cs.x
+   Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, test3cs.x
+         Hash Cond: ((t1.x)::text = (test3cs.x)::text)
+         ->  Seq Scan on collate_tests.test1cs t1
+               Output: t1.x
+         ->  Hash
+               Output: test3cs.x
+               ->  HashAggregate
+                     Output: test3cs.x
+                     Group Key: test3cs.x
+                     ->  Seq Scan on collate_tests.test3cs
+                           Output: test3cs.x
+(15 rows)
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Same with set-op
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ Sort
+   Output: t1.x, test3cs.x
+   Sort Key: t1.x COLLATE case_sensitive, test3cs.x COLLATE case_sensitive
+   ->  Hash Join
+         Output: t1.x, test3cs.x
+         Hash Cond: ((test3cs.x)::text = (t1.x)::text)
+         ->  HashAggregate
+               Output: test3cs.x
+               Group Key: test3cs.x
+               ->  Append
+                     ->  Seq Scan on collate_tests.test3cs
+                           Output: test3cs.x
+                     ->  Seq Scan on collate_tests.test3cs test3cs_1
+                           Output: test3cs_1.x
+         ->  Hash
+               Output: t1.x
+               ->  Seq Scan on collate_tests.test1cs t1
+                     Output: t1.x
+(18 rows)
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+  x  |  x  
+-----+-----
+ abc | abc
+ abc | ABC
+ ABC | abc
+ ABC | ABC
+ def | def
+ ghi | ghi
+(6 rows)
+
+-- Ensure that left-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+                  QUERY PLAN                   
+-----------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive
+   ->  Hash Left Join
+         Hash Cond: (t1.x = (test3cs.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  HashAggregate
+                     Group Key: test3cs.x
+                     ->  Seq Scan on test3cs
+(9 rows)
+
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+  x  
+-----
+ abc
+ abc
+ ABC
+ ABC
+ def
+ ghi
+(6 rows)
+
+-- Ensure that semijoin is not reduced to innerjoin
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+                      QUERY PLAN                       
+-------------------------------------------------------
+ Sort
+   Sort Key: t1.x COLLATE case_sensitive
+   ->  Hash Semi Join
+         Hash Cond: ((t1.x)::text = (test3cs.x)::text)
+         ->  Seq Scan on test3cs t1
+         ->  Hash
+               ->  HashAggregate
+                     Group Key: test3cs.x
+                     ->  Seq Scan on test3cs
+(9 rows)
+
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+  x  
+-----
+ abc
+ ABC
+ def
+ ghi
+(4 rows)
+
 CREATE TABLE test1ci (x text COLLATE case_insensitive);
 CREATE TABLE test2ci (x text COLLATE case_insensitive);
 CREATE TABLE test3ci (x text COLLATE case_insensitive);
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@ -653,6 +653,64 @@ SELECT * FROM test3cs t1
  WHERE EXISTS (SELECT 1 FROM test3cs t2 WHERE t1.x = t2.x COLLATE case_insensitive)
 ORDER BY 1;

+--
+-- A DISTINCT / GROUP BY / set-op on a subquery does not prove uniqueness
+-- under a different collation, so the planner must not use such a proof for
+-- any optimization.
+--
+
+-- Ensure that we do not use inner-unique join execution
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, (SELECT DISTINCT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Same with GROUP BY
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs GROUP BY x) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Same with set-op
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+SELECT * FROM test1cs t1, (SELECT x FROM test3cs UNION SELECT x FROM test3cs) t2
+WHERE t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1, 2;
+
+-- Ensure that left-join is not removed
+EXPLAIN (COSTS OFF)
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+
+SELECT t1.* FROM test3cs t1
+       LEFT JOIN (SELECT DISTINCT x FROM test3cs) t2 ON t1.x = t2.x COLLATE case_insensitive
+ORDER BY 1;
+
+-- Ensure that semijoin is not reduced to innerjoin
+EXPLAIN (COSTS OFF)
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+
+SELECT * FROM test3cs t1
+  WHERE EXISTS (SELECT 1 FROM (SELECT DISTINCT x FROM test3cs) t2
+                WHERE t1.x = t2.x COLLATE case_insensitive)
+ORDER BY 1;
+
 CREATE TABLE test1ci (x text COLLATE case_insensitive);
 CREATE TABLE test2ci (x text COLLATE case_insensitive);
 CREATE TABLE test3ci (x text COLLATE case_insensitive);
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@ -648,6 +648,7 @@ DiscardMode
 DiscardStmt
 DispatchOption
 DistanceValue
+DistinctColInfo
 DistinctExpr
 DoState
 DoStmt