Support hashing for duplicate-elimination in INTERSECT and EXCEPT queries.

This completes my project of improving usage of hashing for duplicate elimination (aggregate functions with DISTINCT remain undone, but that's for some other day). As with the previous patches, this means we can INTERSECT/EXCEPT on datatypes that can hash but not sort, and it means that INTERSECT/EXCEPT without ORDER BY are no longer certain to produce sorted output.
17 years ago · 368df30427
parent 2d1d96b1ce
commit 368df30427
11 changed files with 593 additions and 203 deletions
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1994-5, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/explain.c,v 1.175 2008/05/14 19:10:29 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/explain.c,v 1.176 2008/08/07 03:04:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -558,19 +558,47 @@ explain_outNode(StringInfo str,
 			pname = "Unique";
 			break;
 		case T_SetOp:
-			switch (((SetOp *) plan)->cmd)
+			switch (((SetOp *) plan)->strategy)
 			{
-				case SETOPCMD_INTERSECT:
-					pname = "SetOp Intersect";
+				case SETOP_SORTED:
+					switch (((SetOp *) plan)->cmd)
+					{
+						case SETOPCMD_INTERSECT:
+							pname = "SetOp Intersect";
+							break;
+						case SETOPCMD_INTERSECT_ALL:
+							pname = "SetOp Intersect All";
+							break;
+						case SETOPCMD_EXCEPT:
+							pname = "SetOp Except";
+							break;
+						case SETOPCMD_EXCEPT_ALL:
+							pname = "SetOp Except All";
+							break;
+						default:
+							pname = "SetOp ???";
+							break;
+					}
 					break;
-				case SETOPCMD_INTERSECT_ALL:
-					pname = "SetOp Intersect All";
-					break;
-				case SETOPCMD_EXCEPT:
-					pname = "SetOp Except";
-					break;
-				case SETOPCMD_EXCEPT_ALL:
-					pname = "SetOp Except All";
+				case SETOP_HASHED:
+					switch (((SetOp *) plan)->cmd)
+					{
+						case SETOPCMD_INTERSECT:
+							pname = "HashSetOp Intersect";
+							break;
+						case SETOPCMD_INTERSECT_ALL:
+							pname = "HashSetOp Intersect All";
+							break;
+						case SETOPCMD_EXCEPT:
+							pname = "HashSetOp Except";
+							break;
+						case SETOPCMD_EXCEPT_ALL:
+							pname = "HashSetOp Except All";
+							break;
+						default:
+							pname = "HashSetOp ???";
+							break;
+					}
 					break;
 				default:
 					pname = "SetOp ???";
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@ -4,33 +4,38 @@
 *	  Routines to handle INTERSECT and EXCEPT selection
 *
 * The input of a SetOp node consists of tuples from two relations,
- * which have been combined into one dataset and sorted on all the nonjunk
- * attributes.	In addition there is a junk attribute that shows which
- * relation each tuple came from.  The SetOp node scans each group of
- * identical tuples to determine how many came from each input relation.
- * Then it is a simple matter to emit the output demanded by the SQL spec
- * for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ * which have been combined into one dataset, with a junk attribute added
+ * that shows which relation each tuple came from.  In SETOP_SORTED mode,
+ * the input has furthermore been sorted according to all the grouping
+ * columns (ie, all the non-junk attributes).  The SetOp node scans each
+ * group of identical tuples to determine how many came from each input
+ * relation.  Then it is a simple matter to emit the output demanded by the
+ * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ *
+ * In SETOP_HASHED mode, the input is delivered in no particular order.
+ * We build a hash table in memory with one entry for each group of
+ * identical tuples, and count the number of tuples in the group from
+ * each relation.  After seeing all the input, we scan the hashtable and
+ * generate the correct output using those counts.
 *
 * This node type is not used for UNION or UNION ALL, since those can be
 * implemented more cheaply (there's no need for the junk attribute to
 * identify the source relation).
 *
+ * Note that SetOp does no qual checking nor projection.  The delivered
+ * output tuples are just copies of the first-to-arrive tuple in each
+ * input group.
+ *
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.25 2008/01/01 19:45:49 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.26 2008/08/07 03:04:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
-/*
- * INTERFACE ROUTINES
- *		ExecSetOp		- filter input to generate INTERSECT/EXCEPT results
- *		ExecInitSetOp	- initialize node and subnodes..
- *		ExecEndSetOp	- shutdown node and subnodes
- */

 #include "postgres.h"

@ -39,6 +44,160 @@
 #include "utils/memutils.h"


+/*
+ * SetOpStatePerGroupData - per-group working state
+ *
+ * These values are working state that is initialized at the start of
+ * an input tuple group and updated for each input tuple.
+ *
+ * In SETOP_SORTED mode, we need only one of these structs, and it's kept in
+ * the plan state node.  In SETOP_HASHED mode, the hash table contains one
+ * of these for each tuple group.
+ */
+typedef struct SetOpStatePerGroupData
+{
+	long		numLeft;		/* number of left-input dups in group */
+	long		numRight;		/* number of right-input dups in group */
+} SetOpStatePerGroupData;
+
+/*
+ * To implement hashed mode, we need a hashtable that stores a
+ * representative tuple and the duplicate counts for each distinct set
+ * of grouping columns.  We compute the hash key from the grouping columns.
+ */
+typedef struct SetOpHashEntryData *SetOpHashEntry;
+
+typedef struct SetOpHashEntryData
+{
+	TupleHashEntryData shared;	/* common header for hash table entries */
+	SetOpStatePerGroupData pergroup;
+} SetOpHashEntryData;
+
+
+static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate);
+static void setop_fill_hash_table(SetOpState *setopstate);
+static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate);
+
+
+/*
+ * Initialize state for a new group of input values.
+ */
+static void
+initialize_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+	pergroup->numLeft = pergroup->numRight = 0;
+}
+
+/*
+ * Advance the appropriate counter for one input tuple.
+ */
+static void
+advance_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup,
+			   TupleTableSlot *inputslot)
+{
+	SetOp	   *node = (SetOp *) setopstate->ps.plan;
+	int			flag;
+	bool		isNull;
+
+	flag = DatumGetInt32(slot_getattr(inputslot,
+									  node->flagColIdx,
+									  &isNull));
+	Assert(!isNull);
+	if (flag)
+		pergroup->numRight++;
+	else
+		pergroup->numLeft++;
+}
+
+/*
+ * Initialize the hash table to empty.
+ */
+static void
+build_hash_table(SetOpState *setopstate)
+{
+	SetOp	   *node = (SetOp *) setopstate->ps.plan;
+
+	Assert(node->strategy == SETOP_HASHED);
+	Assert(node->numGroups > 0);
+
+	setopstate->hashtable = BuildTupleHashTable(node->numCols,
+												node->dupColIdx,
+												setopstate->eqfunctions,
+												setopstate->hashfunctions,
+												node->numGroups,
+												sizeof(SetOpHashEntryData),
+												setopstate->tableContext,
+												setopstate->tempContext);
+}
+
+/*
+ * Find or create a hashtable entry for the tuple group containing the
+ * given tuple.
+ */
+static SetOpHashEntry
+lookup_hash_entry(SetOpState *setopstate, TupleTableSlot *inputslot)
+{
+	SetOpHashEntry entry;
+	bool		isnew;
+
+	/* find or create the hashtable entry */
+	entry = (SetOpHashEntry) LookupTupleHashEntry(setopstate->hashtable,
+												  inputslot,
+												  &isnew);
+
+	if (isnew)
+	{
+		/* initialize counts for new tuple group */
+		initialize_counts(setopstate, &entry->pergroup);
+	}
+
+	/* Must reset temp context after each hashtable lookup */
+	MemoryContextReset(setopstate->tempContext);
+
+	return entry;
+}
+
+/*
+ * We've completed processing a tuple group.  Decide how many copies (if any)
+ * of its representative row to emit, and store the count into numOutput.
+ * This logic is straight from the SQL92 specification.
+ */
+static void
+set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+	SetOp	   *plannode = (SetOp *) setopstate->ps.plan;
+
+	switch (plannode->cmd)
+	{
+		case SETOPCMD_INTERSECT:
+			if (pergroup->numLeft > 0 && pergroup->numRight > 0)
+				setopstate->numOutput = 1;
+			else
+				setopstate->numOutput = 0;
+			break;
+		case SETOPCMD_INTERSECT_ALL:
+			setopstate->numOutput =
+				(pergroup->numLeft < pergroup->numRight) ?
+				pergroup->numLeft : pergroup->numRight;
+			break;
+		case SETOPCMD_EXCEPT:
+			if (pergroup->numLeft > 0 && pergroup->numRight == 0)
+				setopstate->numOutput = 1;
+			else
+				setopstate->numOutput = 0;
+			break;
+		case SETOPCMD_EXCEPT_ALL:
+			setopstate->numOutput =
+				(pergroup->numLeft < pergroup->numRight) ?
+				0 : (pergroup->numLeft - pergroup->numRight);
+			break;
+		default:
+			elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd);
+			break;
+	}
+}
+
+
 /* ----------------------------------------------------------------
 *		ExecSetOp
 * ----------------------------------------------------------------
@ -47,14 +206,7 @@ TupleTableSlot *				/* return: a tuple or NULL */
 ExecSetOp(SetOpState *node)
 {
 	SetOp	   *plannode = (SetOp *) node->ps.plan;
-	TupleTableSlot *resultTupleSlot;
-	PlanState  *outerPlan;
-
-	/*
-	 * get information from the node
-	 */
-	outerPlan = outerPlanState(node);
-	resultTupleSlot = node->ps.ps_ResultTupleSlot;
+	TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;

 	/*
 	 * If the previously-returned tuple needs to be returned more than once,
@ -66,142 +218,218 @@ ExecSetOp(SetOpState *node)
 		return resultTupleSlot;
 	}

-	/* Flag that we have no current tuple */
-	ExecClearTuple(resultTupleSlot);
+	/* Otherwise, we're done if we are out of groups */
+	if (node->setop_done)
+		return NULL;
+
+	/* Fetch the next tuple group according to the correct strategy */
+	if (plannode->strategy == SETOP_HASHED)
+	{
+		if (!node->table_filled)
+			setop_fill_hash_table(node);
+		return setop_retrieve_hash_table(node);
+	}
+	else
+		return setop_retrieve_direct(node);
+}
+
+/*
+ * ExecSetOp for non-hashed case
+ */
+static TupleTableSlot *
+setop_retrieve_direct(SetOpState *setopstate)
+{
+	SetOp	   *node = (SetOp *) setopstate->ps.plan;
+	PlanState  *outerPlan;
+	SetOpStatePerGroup pergroup;
+	TupleTableSlot *outerslot;
+	TupleTableSlot *resultTupleSlot;

 	/*
-	 * Absorb groups of duplicate tuples, counting them, and saving the first
-	 * of each group as a possible return value. At the end of each group,
-	 * decide whether to return anything.
-	 *
-	 * We assume that the tuples arrive in sorted order so we can detect
-	 * duplicates easily.
+	 * get state info from node
 	 */
-	for (;;)
-	{
-		TupleTableSlot *inputTupleSlot;
-		bool		endOfGroup;
+	outerPlan = outerPlanState(setopstate);
+	pergroup = setopstate->pergroup;
+	resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;

+	/*
+	 * We loop retrieving groups until we find one we should return
+	 */
+	while (!setopstate->setop_done)
+	{
 		/*
-		 * fetch a tuple from the outer subplan, unless we already did.
+		 * If we don't already have the first tuple of the new group, fetch it
+		 * from the outer plan.
 		 */
-		if (node->ps.ps_OuterTupleSlot == NULL &&
-			!node->subplan_done)
-		{
-			node->ps.ps_OuterTupleSlot =
-				ExecProcNode(outerPlan);
-			if (TupIsNull(node->ps.ps_OuterTupleSlot))
-				node->subplan_done = true;
-		}
-		inputTupleSlot = node->ps.ps_OuterTupleSlot;
-
-		if (TupIsNull(resultTupleSlot))
+		if (setopstate->grp_firstTuple == NULL)
 		{
-			/*
-			 * First of group: save a copy in result slot, and reset
-			 * duplicate-counters for new group.
-			 */
-			if (node->subplan_done)
-				return NULL;	/* no more tuples */
-			ExecCopySlot(resultTupleSlot, inputTupleSlot);
-			node->numLeft = 0;
-			node->numRight = 0;
-			endOfGroup = false;
-		}
-		else if (node->subplan_done)
-		{
-			/*
-			 * Reached end of input, so finish processing final group
-			 */
-			endOfGroup = true;
-		}
-		else
-		{
-			/*
-			 * Else test if the new tuple and the previously saved tuple
-			 * match.
-			 */
-			if (execTuplesMatch(inputTupleSlot,
-								resultTupleSlot,
-								plannode->numCols, plannode->dupColIdx,
-								node->eqfunctions,
-								node->tempContext))
-				endOfGroup = false;
+			outerslot = ExecProcNode(outerPlan);
+			if (!TupIsNull(outerslot))
+			{
+				/* Make a copy of the first input tuple */
+				setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot);
+			}
 			else
-				endOfGroup = true;
+			{
+				/* outer plan produced no tuples at all */
+				setopstate->setop_done = true;
+				return NULL;
+			}
 		}

-		if (endOfGroup)
+		/*
+		 * Store the copied first input tuple in the tuple table slot
+		 * reserved for it.  The tuple will be deleted when it is cleared
+		 * from the slot.
+		 */
+		ExecStoreTuple(setopstate->grp_firstTuple,
+					   resultTupleSlot,
+					   InvalidBuffer,
+					   true);
+		setopstate->grp_firstTuple = NULL;	/* don't keep two pointers */
+
+		/* Initialize working state for a new input tuple group */
+		initialize_counts(setopstate, pergroup);
+
+		/* Count the first input tuple */
+		advance_counts(setopstate, pergroup, resultTupleSlot);
+
+		/*
+		 * Scan the outer plan until we exhaust it or cross a group boundary.
+		 */
+		for (;;)
 		{
+			outerslot = ExecProcNode(outerPlan);
+			if (TupIsNull(outerslot))
+			{
+				/* no more outer-plan tuples available */
+				setopstate->setop_done = true;
+				break;
+			}
+
 			/*
-			 * We've reached the end of the group containing resultTuple.
-			 * Decide how many copies (if any) to emit.  This logic is
-			 * straight from the SQL92 specification.
+			 * Check whether we've crossed a group boundary.
 			 */
-			switch (plannode->cmd)
+			if (!execTuplesMatch(resultTupleSlot,
+								 outerslot,
+								 node->numCols, node->dupColIdx,
+								 setopstate->eqfunctions,
+								 setopstate->tempContext))
 			{
-				case SETOPCMD_INTERSECT:
-					if (node->numLeft > 0 && node->numRight > 0)
-						node->numOutput = 1;
-					else
-						node->numOutput = 0;
-					break;
-				case SETOPCMD_INTERSECT_ALL:
-					node->numOutput =
-						(node->numLeft < node->numRight) ?
-						node->numLeft : node->numRight;
-					break;
-				case SETOPCMD_EXCEPT:
-					if (node->numLeft > 0 && node->numRight == 0)
-						node->numOutput = 1;
-					else
-						node->numOutput = 0;
-					break;
-				case SETOPCMD_EXCEPT_ALL:
-					node->numOutput =
-						(node->numLeft < node->numRight) ?
-						0 : (node->numLeft - node->numRight);
-					break;
-				default:
-					elog(ERROR, "unrecognized set op: %d",
-						 (int) plannode->cmd);
-					break;
-			}
-			/* Fall out of for-loop if we have tuples to emit */
-			if (node->numOutput > 0)
+				/*
+				 * Save the first input tuple of the next group.
+				 */
+				setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot);
 				break;
-			/* Else flag that we have no current tuple, and loop around */
-			ExecClearTuple(resultTupleSlot);
+			}
+
+			/* Still in same group, so count this tuple */
+			advance_counts(setopstate, pergroup, outerslot);
 		}
-		else
+
+		/*
+		 * Done scanning input tuple group.  See if we should emit any
+		 * copies of result tuple, and if so return the first copy.
+		 */
+		set_output_count(setopstate, pergroup);
+
+		if (setopstate->numOutput > 0)
 		{
-			/*
-			 * Current tuple is member of same group as resultTuple. Count it
-			 * in the appropriate counter.
-			 */
-			int			flag;
-			bool		isNull;
-
-			flag = DatumGetInt32(slot_getattr(inputTupleSlot,
-											  plannode->flagColIdx,
-											  &isNull));
-			Assert(!isNull);
-			if (flag)
-				node->numRight++;
-			else
-				node->numLeft++;
-			/* Set flag to fetch a new input tuple, and loop around */
-			node->ps.ps_OuterTupleSlot = NULL;
+			setopstate->numOutput--;
+			return resultTupleSlot;
 		}
 	}

+	/* No more groups */
+	ExecClearTuple(resultTupleSlot);
+	return NULL;
+}
+
+/*
+ * ExecSetOp for hashed case: phase 1, read input and build hash table
+ */
+static void
+setop_fill_hash_table(SetOpState *setopstate)
+{
+	PlanState  *outerPlan;
+	SetOpHashEntry entry;
+	TupleTableSlot *outerslot;
+
+	/*
+	 * get state info from node
+	 */
+	outerPlan = outerPlanState(setopstate);
+
 	/*
-	 * If we fall out of loop, then we need to emit at least one copy of
-	 * resultTuple.
+	 * Process each outer-plan tuple, and then fetch the next one, until we
+	 * exhaust the outer plan.
 	 */
-	Assert(node->numOutput > 0);
-	node->numOutput--;
-	return resultTupleSlot;
+	for (;;)
+	{
+		outerslot = ExecProcNode(outerPlan);
+		if (TupIsNull(outerslot))
+			break;
+
+		/* Find or build hashtable entry for this tuple's group */
+		entry = lookup_hash_entry(setopstate, outerslot);
+
+		/* Advance the counts */
+		advance_counts(setopstate, &entry->pergroup, outerslot);
+	}
+
+	setopstate->table_filled = true;
+	/* Initialize to walk the hash table */
+	ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter);
+}
+
+/*
+ * ExecSetOp for hashed case: phase 2, retrieving groups from hash table
+ */
+static TupleTableSlot *
+setop_retrieve_hash_table(SetOpState *setopstate)
+{
+	SetOpHashEntry entry;
+	TupleTableSlot *resultTupleSlot;
+
+	/*
+	 * get state info from node
+	 */
+	resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
+
+	/*
+	 * We loop retrieving groups until we find one we should return
+	 */
+	while (!setopstate->setop_done)
+	{
+		/*
+		 * Find the next entry in the hash table
+		 */
+		entry = (SetOpHashEntry) ScanTupleHashTable(&setopstate->hashiter);
+		if (entry == NULL)
+		{
+			/* No more entries in hashtable, so done */
+			setopstate->setop_done = true;
+			return NULL;
+		}
+
+		/*
+		 * See if we should emit any copies of this tuple, and if so return
+		 * the first copy.
+		 */
+		set_output_count(setopstate, &entry->pergroup);
+
+		if (setopstate->numOutput > 0)
+		{
+			setopstate->numOutput--;
+			return ExecStoreMinimalTuple(entry->shared.firstTuple,
+										 resultTupleSlot,
+										 false);
+		}
+	}
+
+	/* No more groups */
+	ExecClearTuple(resultTupleSlot);
+	return NULL;
 }

 /* ----------------------------------------------------------------
@ -226,9 +454,14 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
 	setopstate->ps.plan = (Plan *) node;
 	setopstate->ps.state = estate;

-	setopstate->ps.ps_OuterTupleSlot = NULL;
-	setopstate->subplan_done = false;
+	setopstate->eqfunctions = NULL;
+	setopstate->hashfunctions = NULL;
+	setopstate->setop_done = false;
 	setopstate->numOutput = 0;
+	setopstate->pergroup = NULL;
+	setopstate->grp_firstTuple = NULL;
+	setopstate->hashtable = NULL;
+	setopstate->tableContext = NULL;

 	/*
 	 * Miscellaneous initialization
@ -244,6 +477,19 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
 							  ALLOCSET_DEFAULT_INITSIZE,
 							  ALLOCSET_DEFAULT_MAXSIZE);

+	/*
+	 * If hashing, we also need a longer-lived context to store the hash
+	 * table.  The table can't just be kept in the per-query context because
+	 * we want to be able to throw it away in ExecReScanSetOp.
+	 */
+	if (node->strategy == SETOP_HASHED)
+		setopstate->tableContext =
+			AllocSetContextCreate(CurrentMemoryContext,
+								  "SetOp hash table",
+								  ALLOCSET_DEFAULT_MINSIZE,
+								  ALLOCSET_DEFAULT_INITSIZE,
+								  ALLOCSET_DEFAULT_MAXSIZE);
+
 #define SETOP_NSLOTS 1

 	/*
@ -252,8 +498,13 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
 	ExecInitResultTupleSlot(estate, &setopstate->ps);

 	/*
-	 * then initialize outer plan
+	 * initialize child nodes
+	 *
+	 * If we are hashing then the child plan does not need
+	 * to handle REWIND efficiently; see ExecReScanSetOp.
 	 */
+	if (node->strategy == SETOP_HASHED)
+		eflags &= ~EXEC_FLAG_REWIND;
 	outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags);

 	/*
@ -264,11 +515,30 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
 	setopstate->ps.ps_ProjInfo = NULL;

 	/*
-	 * Precompute fmgr lookup data for inner loop
+	 * Precompute fmgr lookup data for inner loop. We need both equality and
+	 * hashing functions to do it by hashing, but only equality if not
+	 * hashing.
 	 */
-	setopstate->eqfunctions =
-		execTuplesMatchPrepare(node->numCols,
-							   node->dupOperators);
+	if (node->strategy == SETOP_HASHED)
+		execTuplesHashPrepare(node->numCols,
+							  node->dupOperators,
+							  &setopstate->eqfunctions,
+							  &setopstate->hashfunctions);
+	else
+		setopstate->eqfunctions =
+			execTuplesMatchPrepare(node->numCols,
+								   node->dupOperators);
+
+	if (node->strategy == SETOP_HASHED)
+	{
+		build_hash_table(setopstate);
+		setopstate->table_filled = false;
+	}
+	else
+	{
+		setopstate->pergroup =
+			(SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData));
+	}

 	return setopstate;
 }
@ -293,9 +563,11 @@ ExecEndSetOp(SetOpState *node)
 {
 	/* clean up tuple table */
 	ExecClearTuple(node->ps.ps_ResultTupleSlot);
-	node->ps.ps_OuterTupleSlot = NULL;

+	/* free subsidiary stuff including hashtable */
 	MemoryContextDelete(node->tempContext);
+	if (node->tableContext)
+		MemoryContextDelete(node->tableContext);

 	ExecEndNode(outerPlanState(node));
 }
@ -305,10 +577,50 @@ void
 ExecReScanSetOp(SetOpState *node, ExprContext *exprCtxt)
 {
 	ExecClearTuple(node->ps.ps_ResultTupleSlot);
-	node->ps.ps_OuterTupleSlot = NULL;
-	node->subplan_done = false;
+	node->setop_done = false;
 	node->numOutput = 0;

+	if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+	{
+		/*
+		 * In the hashed case, if we haven't yet built the hash table then we
+		 * can just return; nothing done yet, so nothing to undo. If subnode's
+		 * chgParam is not NULL then it will be re-scanned by ExecProcNode,
+		 * else no reason to re-scan it at all.
+		 */
+		if (!node->table_filled)
+			return;
+
+		/*
+		 * If we do have the hash table and the subplan does not have any
+		 * parameter changes, then we can just rescan the existing hash table;
+		 * no need to build it again.
+		 */
+		if (((PlanState *) node)->lefttree->chgParam == NULL)
+		{
+			ResetTupleHashIterator(node->hashtable, &node->hashiter);
+			return;
+		}
+	}
+
+	/* Release first tuple of group, if we have made a copy */
+	if (node->grp_firstTuple != NULL)
+	{
+		heap_freetuple(node->grp_firstTuple);
+		node->grp_firstTuple = NULL;
+	}
+
+	/* Release any hashtable storage */
+	if (node->tableContext)
+		MemoryContextResetAndDeleteChildren(node->tableContext);
+
+	/* And rebuild empty hashtable if needed */
+	if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+	{
+		build_hash_table(node);
+		node->table_filled = false;
+	}
+
 	/*
 	 * if chgParam of subnode is not null then plan will be re-scanned by
 	 * first ExecProcNode.
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@ -15,7 +15,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.397 2008/08/07 01:11:46 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.398 2008/08/07 03:04:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -652,10 +652,12 @@ _copySetOp(SetOp *from)
 	 * copy remainder of node
 	 */
 	COPY_SCALAR_FIELD(cmd);
+	COPY_SCALAR_FIELD(strategy);
 	COPY_SCALAR_FIELD(numCols);
 	COPY_POINTER_FIELD(dupColIdx, from->numCols * sizeof(AttrNumber));
 	COPY_POINTER_FIELD(dupOperators, from->numCols * sizeof(Oid));
 	COPY_SCALAR_FIELD(flagColIdx);
+	COPY_SCALAR_FIELD(numGroups);

 	return newnode;
 }
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.331 2008/08/07 01:11:48 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.332 2008/08/07 03:04:03 tgl Exp $
 *
 * NOTES
 *	  Every node type that can appear in stored rules' parsetrees *must*
@ -599,6 +599,7 @@ _outSetOp(StringInfo str, SetOp *node)
 	_outPlanInfo(str, (Plan *) node);

 	WRITE_ENUM_FIELD(cmd, SetOpCmd);
+	WRITE_ENUM_FIELD(strategy, SetOpStrategy);
 	WRITE_INT_FIELD(numCols);

 	appendStringInfo(str, " :dupColIdx");
@ -610,6 +611,7 @@ _outSetOp(StringInfo str, SetOp *node)
 		appendStringInfo(str, " %u", node->dupOperators[i]);

 	WRITE_INT_FIELD(flagColIdx);
+	WRITE_LONG_FIELD(numGroups);
 }

 static void
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@ -10,7 +10,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.242 2008/08/02 21:32:00 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.243 2008/08/07 03:04:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -3108,8 +3108,9 @@ make_unique(Plan *lefttree, List *distinctList)
 * already be sorted accordingly.
 */
 SetOp *
-make_setop(SetOpCmd cmd, Plan *lefttree,
-		   List *distinctList, AttrNumber flagColIdx)
+make_setop(SetOpCmd cmd, SetOpStrategy strategy, Plan *lefttree,
+		   List *distinctList, AttrNumber flagColIdx, long numGroups,
+		   double outputRows)
 {
 	SetOp	   *node = makeNode(SetOp);
 	Plan	   *plan = &node->plan;
@ -3120,20 +3121,13 @@ make_setop(SetOpCmd cmd, Plan *lefttree,
 	ListCell   *slitem;

 	copy_plan_costsize(plan, lefttree);
+	plan->plan_rows = outputRows;

 	/*
 	 * Charge one cpu_operator_cost per comparison per input tuple. We assume
 	 * all columns get compared at most of the tuples.
 	 */
-	plan->total_cost += cpu_operator_cost * plan->plan_rows * numCols;
-
-	/*
-	 * We make the unsupported assumption that there will be 10% as many
-	 * tuples out as in.  Any way to do better?
-	 */
-	plan->plan_rows *= 0.1;
-	if (plan->plan_rows < 1)
-		plan->plan_rows = 1;
+	plan->total_cost += cpu_operator_cost * lefttree->plan_rows * numCols;

 	plan->targetlist = lefttree->targetlist;
 	plan->qual = NIL;
@ -3160,10 +3154,12 @@ make_setop(SetOpCmd cmd, Plan *lefttree,
 	}

 	node->cmd = cmd;
+	node->strategy = strategy;
 	node->numCols = numCols;
 	node->dupColIdx = dupColIdx;
 	node->dupOperators = dupOperators;
 	node->flagColIdx = flagColIdx;
+	node->numGroups = numGroups;

 	return node;
 }
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@ -22,7 +22,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.150 2008/08/07 01:11:50 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.151 2008/08/07 03:04:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -60,6 +60,7 @@ static Plan *generate_union_plan(SetOperationStmt *op, PlannerInfo *root,
 					double tuple_fraction,
 					List *refnames_tlist, List **sortClauses);
 static Plan *generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
+					   double tuple_fraction,
 					   List *refnames_tlist, List **sortClauses);
 static List *recurse_union_children(Node *setOp, PlannerInfo *root,
 					   double tuple_fraction,
@ -229,7 +230,7 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
 									   refnames_tlist,
 									   sortClauses);
 		else
-			plan = generate_nonunion_plan(op, root,
+			plan = generate_nonunion_plan(op, root, tuple_fraction,
 										  refnames_tlist,
 										  sortClauses);

@ -341,6 +342,7 @@ generate_union_plan(SetOperationStmt *op, PlannerInfo *root,
 */
 static Plan *
 generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
+					   double tuple_fraction,
 					   List *refnames_tlist,
 					   List **sortClauses)
 {
@ -351,6 +353,10 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
 			   *groupList,
 			   *planlist,
 			   *child_sortclauses;
+	double		dNumDistinctRows;
+	double		dNumOutputRows;
+	long		numDistinctRows;
+	bool		use_hash;
 	SetOpCmd	cmd;

 	/* Recurse on children, ensuring their outputs are marked */
@ -393,10 +399,32 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
 		return plan;
 	}

+	/*
+	 * XXX for the moment, take the number of distinct groups as being the
+	 * total input size, ie, the worst case.  This is too conservative, but
+	 * we don't want to risk having the hashtable overrun memory; also,
+	 * it's not clear how to get a decent estimate of the true size.
+	 */
+	dNumDistinctRows = plan->plan_rows;
+
+	/* Also convert to long int --- but 'ware overflow! */
+	numDistinctRows = (long) Min(dNumDistinctRows, (double) LONG_MAX);
+
+	/*
+	 * The output size is taken as 10% of that, which is a completely bogus
+	 * guess, but it's what we've used historically.
+	 */
+	dNumOutputRows = ceil(dNumDistinctRows * 0.1);
+
 	/*
 	 * Decide whether to hash or sort, and add a sort node if needed.
 	 */
-	plan = (Plan *) make_sort_from_sortclauses(root, groupList, plan);
+	use_hash = choose_hashed_setop(root, groupList, plan,
+								   tuple_fraction, dNumDistinctRows,
+								   (op->op == SETOP_INTERSECT) ? "INTERSECT" : "EXCEPT");
+
+	if (!use_hash)
+		plan = (Plan *) make_sort_from_sortclauses(root, groupList, plan);

 	/*
 	 * Finally, add a SetOp plan node to generate the correct output.
@ -414,9 +442,12 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
 			cmd = SETOPCMD_INTERSECT;	/* keep compiler quiet */
 			break;
 	}
-	plan = (Plan *) make_setop(cmd, plan, groupList, list_length(op->colTypes) + 1);
+	plan = (Plan *) make_setop(cmd, use_hash ? SETOP_HASHED : SETOP_SORTED,
+							   plan, groupList, list_length(op->colTypes) + 1,
+							   numDistinctRows, dNumOutputRows);

-	*sortClauses = groupList;
+	/* Result is sorted only if we're not hashing */
+	*sortClauses = use_hash ? NIL : groupList;

 	return plan;
 }
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.185 2008/07/26 19:15:35 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.186 2008/08/07 03:04:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -1414,21 +1414,31 @@ typedef struct HashState
 /* ----------------
 *	 SetOpState information
 *
- *		SetOp nodes are used "on top of" sort nodes to discard
- *		duplicate tuples returned from the sort phase.	These are
- *		more complex than a simple Unique since we have to count
- *		how many duplicates to return.
+ *		Even in "sorted" mode, SetOp nodes are more complex than a simple
+ *		Unique, since we have to count how many duplicates to return.  But
+ *		we also support hashing, so this is really more like a cut-down
+ *		form of Agg.
 * ----------------
 */
+/* this struct is private in nodeSetOp.c: */
+typedef struct SetOpStatePerGroupData *SetOpStatePerGroup;
+
 typedef struct SetOpState
 {
 	PlanState	ps;				/* its first field is NodeTag */
-	FmgrInfo   *eqfunctions;	/* per-field lookup data for equality fns */
-	bool		subplan_done;	/* has subplan returned EOF? */
-	long		numLeft;		/* number of left-input dups of cur group */
-	long		numRight;		/* number of right-input dups of cur group */
+	FmgrInfo   *eqfunctions;	/* per-grouping-field equality fns */
+	FmgrInfo   *hashfunctions;	/* per-grouping-field hash fns */
+	bool		setop_done;		/* indicates completion of output scan */
 	long		numOutput;		/* number of dups left to output */
 	MemoryContext tempContext;	/* short-term context for comparisons */
+	/* these fields are used in SETOP_SORTED mode: */
+	SetOpStatePerGroup pergroup;	/* per-group working state */
+	HeapTuple	grp_firstTuple; /* copy of first tuple of current group */
+	/* these fields are used in SETOP_HASHED mode: */
+	TupleHashTable hashtable;	/* hash table with one entry per group */
+	MemoryContext tableContext;	/* memory context containing hash table */
+	bool		table_filled;	/* hash table filled yet? */
+	TupleHashIterator hashiter; /* for iterating through hash table */
 } SetOpState;

 /* ----------------
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.100 2008/04/13 20:51:21 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.101 2008/08/07 03:04:04 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -525,15 +525,23 @@ typedef enum SetOpCmd
 	SETOPCMD_EXCEPT_ALL
 } SetOpCmd;

+typedef enum SetOpStrategy
+{
+	SETOP_SORTED,				/* input must be sorted */
+	SETOP_HASHED				/* use internal hashtable */
+} SetOpStrategy;
+
 typedef struct SetOp
 {
 	Plan		plan;
 	SetOpCmd	cmd;			/* what to do */
+	SetOpStrategy strategy;		/* how to do it */
 	int			numCols;		/* number of columns to check for
 								 * duplicate-ness */
 	AttrNumber *dupColIdx;		/* their indexes in the target list */
 	Oid		   *dupOperators;	/* equality operators to compare with */
 	AttrNumber	flagColIdx;		/* where is the flag column, if any */
+	long		numGroups;		/* estimated number of groups in input */
 } SetOp;

 /* ----------------
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.108 2008/05/02 21:26:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.109 2008/08/07 03:04:04 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -61,8 +61,9 @@ extern Plan *materialize_finished_plan(Plan *subplan);
 extern Unique *make_unique(Plan *lefttree, List *distinctList);
 extern Limit *make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount,
 		   int64 offset_est, int64 count_est);
-extern SetOp *make_setop(SetOpCmd cmd, Plan *lefttree,
-		   List *distinctList, AttrNumber flagColIdx);
+extern SetOp *make_setop(SetOpCmd cmd, SetOpStrategy strategy, Plan *lefttree,
+		   List *distinctList, AttrNumber flagColIdx, long numGroups,
+		   double outputRows);
 extern Result *make_result(PlannerInfo *root, List *tlist,
 			Node *resconstantqual, Plan *subplan);
 extern bool is_projection_capable_plan(Plan *plan);
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@ -275,21 +275,21 @@ SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl;
 4567890123456789
 (3 rows)

-SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
        q2         
 -------------------
 -4567890123456789
               456
 (2 rows)

-SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl ORDER BY 1;
        q2         
 -------------------
 -4567890123456789
               456
 (2 rows)

-SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl ORDER BY 1;
        q2         
 -------------------
 -4567890123456789
@ -326,7 +326,7 @@ SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl;
  0
 (1 row)

-SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl;
+SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl ORDER BY 1;
          f1           
 -----------------------
 -1.2345678901234e+200
@ -369,14 +369,14 @@ SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2
 -4567890123456789
 (7 rows)

-SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl;
+SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
        q1         
 -------------------
 -4567890123456789
               456
 (2 rows)

-SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl)));
+SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1)));
        q1         
 -------------------
               123
@ -388,7 +388,7 @@ SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FR
               456
 (7 rows)

-(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl;
+(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
        q1         
 -------------------
 -4567890123456789
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@ -97,11 +97,11 @@ SELECT q2 FROM int8_tbl INTERSECT SELECT q1 FROM int8_tbl;

 SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl;

-SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;

-SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl ORDER BY 1;

-SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl ORDER BY 1;

 SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl;

@ -115,7 +115,7 @@ SELECT q1 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q2 FROM int8_tbl;

 SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl;

-SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl;
+SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl ORDER BY 1;

 --
 -- Operator precedence and (((((extra))))) parentheses
@ -127,11 +127,11 @@ SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2

 (((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl))) UNION ALL SELECT q2 FROM int8_tbl;

-SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl;
+SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;

-SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl)));
+SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1)));

-(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl;
+(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;

 --
 -- Subqueries with ORDER BY & LIMIT clauses