@@ -6,7 +6,7 @@
  * Copyright (c) 1994, Regents of the University of California
  *
  *
- *	  $Id: nodeHash.c,v 1.32 1999/04/07 23:33:30 tgl Exp $
+ *	  $Id: nodeHash.c,v 1.33 1999/05/06 00:30:46 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -38,12 +38,13 @@
 #include "utils/hsearch.h"
 
 extern int	NBuffers;
-static int	HashTBSize;
+
+#define HJ_TEMP_NAMELEN 16		/* max length for mk_hj_temp file names */
 
 static void mk_hj_temp(char *tempname);
 static int	hashFunc(Datum key, int len, bool byVal);
-static int	ExecHashPartition(Hash *node);
 static RelativeAddr hashTableAlloc(int size, HashJoinTable hashtable);
+static void *absHashTableAlloc(int size, HashJoinTable hashtable);
 static void ExecHashOverflowInsert(HashJoinTable hashtable,
 					   HashBucket bucket,
 					   HeapTuple heapTuple);
@@ -270,13 +271,19 @@ ExecEndHash(Hash *node)
 static RelativeAddr
 hashTableAlloc(int size, HashJoinTable hashtable)
 {
-	RelativeAddr p;
-
-	p = hashtable->top;
-	hashtable->top += size;
+	RelativeAddr p = hashtable->top;
+
+	hashtable->top += MAXALIGN(size);
 	return p;
 }
 
+static void *
+absHashTableAlloc(int size, HashJoinTable hashtable)
+{
+	RelativeAddr p = hashTableAlloc(size, hashtable);
+
+	return ABSADDR(p);
+}
+
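/*
 * [Reviewer note, not part of the patch] A sketch of the addressing scheme
 * the two helpers above serve.  The hash table lives in one palloc'd arena
 * and stores offsets instead of pointers; assuming the usual definitions in
 * executor/hashjoin.h (roughly ABSADDR(a) = (char *) hashtable + (a) and
 * RELADDR(a) = (RelativeAddr) ((char *) (a) - (char *) hashtable)),
 * hashTableAlloc() bumps hashtable->top and hands back the old offset, and
 * absHashTableAlloc() is the same allocation returned as a real pointer.
 * MAXALIGN-ing the request keeps every later offset aligned for any C type.
 */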
 /* ----------------------------------------------------------------
  *		ExecHashTableCreate
  *
@@ -290,9 +297,12 @@ HashJoinTable
 ExecHashTableCreate(Hash *node)
 {
 	Plan	   *outerNode;
+	int			HashTBSize;
 	int			nbatch;
 	int			ntuples;
 	int			tupsize;
+	int			pages;
+	int			sqrtpages;
 	IpcMemoryId shmid;
 	HashJoinTable hashtable;
 	HashBucket	bucket;
@@ -307,43 +317,72 @@ ExecHashTableCreate(Hash *node)
 	int		   *innerbatchSizes;
 	RelativeAddr tempname;
 
-	nbatch = -1;
-	HashTBSize = NBuffers / 2;
-	while (nbatch < 0)
-	{
-		/*
-		 * determine number of batches for the hashjoin
-		 */
-		HashTBSize *= 2;
-		nbatch = ExecHashPartition(node);
-	}
-
 	/* ----------------
-	 *	get information about the size of the relation
+	 *	Get information about the size of the relation to be hashed
+	 *	(it's the "outer" subtree of this node, but the inner relation of
+	 *	the hashjoin).
+	 *	Caution: this is only the planner's estimates, and so
+	 *	can't be trusted too far.  Apply a healthy fudge factor.
 	 * ----------------
 	 */
 	outerNode = outerPlan(node);
 	ntuples = outerNode->plan_size;
-	if (ntuples <= 0)
-		ntuples = 1000;			/* XXX just a hack */
+	if (ntuples <= 0)			/* force a plausible size if no info */
+		ntuples = 1000;
 	tupsize = outerNode->plan_width + sizeof(HeapTupleData);
+	pages = (int) ceil((double) ntuples * tupsize * FUDGE_FAC / BLCKSZ);
 
 	/*
-	 * totalbuckets is the total number of hash buckets needed for the
-	 * entire relation
+	 * Max hashtable size is NBuffers pages, but not less than
+	 * sqrt(estimated inner rel size), so as to avoid horrible performance.
+	 * XXX since the hashtable is not allocated in shared mem anymore,
+	 * it would probably be more appropriate to drive this from -S than -B.
 	 */
-	totalbuckets = ceil((double) ntuples / NTUP_PER_BUCKET);
-	bucketsize = LONGALIGN(NTUP_PER_BUCKET * tupsize + sizeof(*bucket));
+	sqrtpages = (int) ceil(sqrt((double) pages));
+	HashTBSize = NBuffers;
+	if (sqrtpages > HashTBSize)
+		HashTBSize = sqrtpages;
 
 	/*
-	 * nbuckets is the number of hash buckets for the first pass of hybrid
-	 * hashjoin
+	 * Count the number of hash buckets we want for the whole relation,
+	 * and the number we can actually fit in the allowed memory.
+	 * NOTE: FUDGE_FAC here determines the fraction of the hashtable space
+	 * saved for overflow records.  Need a better approach...
 	 */
-	nbuckets = (HashTBSize - nbatch) * BLCKSZ / (bucketsize * FUDGE_FAC);
-	if (totalbuckets < nbuckets)
-		totalbuckets = nbuckets;
-	if (nbatch == 0)
-		nbuckets = totalbuckets;
+	totalbuckets = (int) ceil((double) ntuples / NTUP_PER_BUCKET);
+	bucketsize = MAXALIGN(NTUP_PER_BUCKET * tupsize + sizeof(*bucket));
+	nbuckets = (int) ((HashTBSize * BLCKSZ) / (bucketsize * FUDGE_FAC));
+
+	if (totalbuckets <= nbuckets)
+	{
+		/* We have enough space, so no batching.  In theory we could
+		 * even reduce HashTBSize, but as long as we don't have a way
+		 * to deal with overflow-space overrun, best to leave the
+		 * extra space available for overflow.
+		 */
+		nbuckets = totalbuckets;
+		nbatch = 0;
+	}
+	else
+	{
+		/* Need to batch; compute how many batches we want to use.
+		 * Note that nbatch doesn't have to have anything to do with
+		 * the ratio totalbuckets/nbuckets; in fact, it is the number
+		 * of groups we will use for the part of the data that doesn't
+		 * fall into the first nbuckets hash buckets.
+		 */
+		nbatch = (int) ceil((double) (pages - HashTBSize) / HashTBSize);
+		if (nbatch <= 0)
+			nbatch = 1;
+	}
+
+	/* Now, totalbuckets is the number of (virtual) hashbuckets for the
+	 * whole relation, and nbuckets is the number of physical hashbuckets
+	 * we will use in the first pass.  Data falling into the first nbuckets
+	 * virtual hashbuckets gets handled in the first pass; everything else
+	 * gets divided into nbatch batches to be processed in additional
+	 * passes.
+	 */
+
 #ifdef HJDEBUG
 	printf("nbatch = %d, totalbuckets = %d, nbuckets = %d\n",
 		   nbatch, totalbuckets, nbuckets);
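/*
 * [Reviewer note, not part of the patch] A worked instance of the sizing
 * math above, with invented numbers: take ntuples = 100000, tupsize = 72,
 * BLCKSZ = 8192, NBuffers = 64, and assume FUDGE_FAC = 2.0 and
 * NTUP_PER_BUCKET = 10.  Then pages = ceil(100000 * 72 * 2.0 / 8192) = 1758
 * and sqrtpages = ceil(sqrt(1758)) = 42, so HashTBSize stays at 64.  With
 * bucketsize = MAXALIGN(10 * 72 + sizeof(*bucket)), say 736, we want
 * totalbuckets = 10000 but can fit only nbuckets = (64 * 8192) / (736 * 2.0)
 * = 356 physical buckets, so batching is needed:
 * nbatch = ceil((1758 - 64) / 64.0) = 27.
 */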
@@ -351,10 +390,11 @@ ExecHashTableCreate(Hash *node)
 	/* ----------------
 	 *	in non-parallel machines, we don't need to put the hash table
-	 *	in the shared memory.  We just palloc it.
+	 *	in the shared memory.  We just palloc it.  The space needed
+	 *	is the hash area itself plus nbatch+1 I/O buffer pages.
 	 * ----------------
 	 */
-	hashtable = (HashJoinTable) palloc((HashTBSize + 1) * BLCKSZ);
+	hashtable = (HashJoinTable) palloc((HashTBSize + nbatch + 1) * BLCKSZ);
 	shmid = 0;
 
 	if (hashtable == NULL)
@@ -367,13 +407,15 @@ ExecHashTableCreate(Hash *node)
 	hashtable->totalbuckets = totalbuckets;
 	hashtable->bucketsize = bucketsize;
 	hashtable->shmid = shmid;
-	hashtable->top = sizeof(HashTableData);
+	hashtable->top = MAXALIGN(sizeof(HashTableData));
 	hashtable->bottom = HashTBSize * BLCKSZ;
 
 	/*
-	 * hashtable->readbuf has to be long aligned!!!
+	 * hashtable->readbuf has to be maxaligned!!!
+	 * Note there are nbatch additional pages available after readbuf;
+	 * these are used for buffering the outgoing batch data.
 	 */
 	hashtable->readbuf = hashtable->bottom;
+	hashtable->batch = hashtable->bottom + BLCKSZ;
 	hashtable->nbatch = nbatch;
 	hashtable->curbatch = 0;
 	hashtable->pcount = hashtable->nprocess = 0;
@@ -383,13 +425,13 @@ ExecHashTableCreate(Hash *node)
 	 *	allocate and initialize the outer batches
 	 * ----------------
 	 */
-	outerbatchNames = (RelativeAddr *) ABSADDR(
-				hashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable));
-	outerbatchPos = (RelativeAddr *) ABSADDR(
-				hashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable));
+	outerbatchNames = (RelativeAddr *)
+		absHashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable);
+	outerbatchPos = (RelativeAddr *)
+		absHashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable);
 	for (i = 0; i < nbatch; i++)
 	{
-		tempname = hashTableAlloc(12, hashtable);
+		tempname = hashTableAlloc(HJ_TEMP_NAMELEN, hashtable);
 		mk_hj_temp(ABSADDR(tempname));
 		outerbatchNames[i] = tempname;
 		outerbatchPos[i] = -1;
@@ -400,15 +442,15 @@ ExecHashTableCreate(Hash *node)
 	 *	allocate and initialize the inner batches
 	 * ----------------
 	 */
-	innerbatchNames = (RelativeAddr *) ABSADDR(
-				hashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable));
-	innerbatchPos = (RelativeAddr *) ABSADDR(
-				hashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable));
-	innerbatchSizes = (int *) ABSADDR(
-				hashTableAlloc(nbatch * sizeof(int), hashtable));
+	innerbatchNames = (RelativeAddr *)
+		absHashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable);
+	innerbatchPos = (RelativeAddr *)
+		absHashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable);
+	innerbatchSizes = (int *)
+		absHashTableAlloc(nbatch * sizeof(int), hashtable);
 	for (i = 0; i < nbatch; i++)
 	{
-		tempname = hashTableAlloc(12, hashtable);
+		tempname = hashTableAlloc(HJ_TEMP_NAMELEN, hashtable);
 		mk_hj_temp(ABSADDR(tempname));
 		innerbatchNames[i] = tempname;
 		innerbatchPos[i] = -1;
@@ -427,9 +469,8 @@ ExecHashTableCreate(Hash *node)
 		hashtable->innerbatchSizes = (RelativeAddr) NULL;
 	}
 
-	hashtable->batch = (RelativeAddr) LONGALIGN(hashtable->top +
-												bucketsize * nbuckets);
-	hashtable->overflownext = hashtable->batch + nbatch * BLCKSZ;
+	hashtable->overflownext = hashtable->top + bucketsize * nbuckets;
+	Assert(hashtable->overflownext < hashtable->bottom);
 
 	/* ----------------
 	 *	initialize each hash bucket
 	 * ----------------
@@ -437,10 +478,10 @@ ExecHashTableCreate(Hash *node)
 	bucket = (HashBucket) ABSADDR(hashtable->top);
 	for (i = 0; i < nbuckets; i++)
 	{
-		bucket->top = RELADDR((char *) bucket + sizeof(*bucket));
+		bucket->top = RELADDR((char *) bucket + MAXALIGN(sizeof(*bucket)));
 		bucket->bottom = bucket->top;
 		bucket->firstotuple = bucket->lastotuple = -1;
-		bucket = (HashBucket) LONGALIGN(((char *) bucket + bucketsize));
+		bucket = (HashBucket) ((char *) bucket + bucketsize);
 	}
 
 	return hashtable;
 }
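/*
 * [Reviewer note, not part of the patch] After ExecHashTableCreate() the
 * palloc'd arena of (HashTBSize + nbatch + 1) * BLCKSZ bytes is laid out,
 * in RelativeAddr terms, roughly as:
 *
 *	0 .. top          header plus the batch name/pos/size arrays
 *	top ..            nbuckets fixed-size hash buckets
 *	overflownext ..   overflow-tuple space, growing toward bottom
 *	bottom (readbuf)  one page for reading tuples back from temp files
 *	batch ..          nbatch one-page output buffers for batched tuples
 *
 * which is what the Assert on overflownext < bottom is guarding.
 */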
@@ -494,18 +535,18 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		 */
 		bucket = (HashBucket)
 			(ABSADDR(hashtable->top) + bucketno * hashtable->bucketsize);
-		if ((char *) LONGALIGN(ABSADDR(bucket->bottom)) - (char *) bucket
+		if (((char *) MAXALIGN(ABSADDR(bucket->bottom)) - (char *) bucket)
 			+ heapTuple->t_len + HEAPTUPLESIZE > hashtable->bucketsize)
 			ExecHashOverflowInsert(hashtable, bucket, heapTuple);
 		else
 		{
-			memmove((char *) LONGALIGN(ABSADDR(bucket->bottom)),
+			memmove((char *) MAXALIGN(ABSADDR(bucket->bottom)),
 					heapTuple,
 					HEAPTUPLESIZE);
-			memmove((char *) LONGALIGN(ABSADDR(bucket->bottom)) + HEAPTUPLESIZE,
+			memmove((char *) MAXALIGN(ABSADDR(bucket->bottom)) + HEAPTUPLESIZE,
 					heapTuple->t_data,
 					heapTuple->t_len);
-			bucket->bottom = ((RelativeAddr) LONGALIGN(bucket->bottom) +
+			bucket->bottom = ((RelativeAddr) MAXALIGN(bucket->bottom) +
							  heapTuple->t_len + HEAPTUPLESIZE);
 		}
 	}
@@ -515,9 +556,8 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		 *	put the tuple into a tmp file for other batches
 		 * ----------------
 		 */
-		batchno = (float) (bucketno - hashtable->nbuckets) /
-			(float) (hashtable->totalbuckets - hashtable->nbuckets)
-			* nbatch;
+		batchno = (nbatch * (bucketno - hashtable->nbuckets)) /
+			(hashtable->totalbuckets - hashtable->nbuckets);
 		buffer = ABSADDR(hashtable->batch) + batchno * BLCKSZ;
 		batchSizes[batchno]++;
 		pos = (char *)
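/*
 * [Reviewer note, not part of the patch; motivation inferred] For
 * nbuckets <= bucketno < totalbuckets, the integer expression above
 * provably yields batchno in 0 .. nbatch-1, since the numerator is at
 * most nbatch * (totalbuckets - nbuckets - 1).  The old float version
 * depended on rounding behavior near 1.0, where a result of exactly
 * nbatch would have indexed one page past the last batch buffer.
 */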
@@ -614,19 +654,11 @@ ExecHashOverflowInsert(HashJoinTable hashtable,
 	 *	see if we run out of overflow space
 	 * ----------------
 	 */
-	newend = (RelativeAddr) LONGALIGN(hashtable->overflownext + sizeof(*otuple)
+	newend = (RelativeAddr) MAXALIGN(hashtable->overflownext + sizeof(*otuple)
 									  + heapTuple->t_len + HEAPTUPLESIZE);
 	if (newend > hashtable->bottom)
-	{
-		/* ------------------
-		 * XXX the temporary hack above doesn't work because things
-		 * above us don't know that we've moved the hash table!
-		 *		- Chris Dunlop, <chris@onthe.net.au>
-		 * ------------------
-		 */
 		elog(ERROR,
-			 "hash table out of memory. Use -B parameter to increase buffers.");
-	}
+			 "hash table out of memory. Use -B parameter to increase buffers.");
 
 	/* ----------------
 	 *	establish the overflow chain
@@ -647,7 +679,7 @@ ExecHashOverflowInsert(HashJoinTable hashtable,
 	 * ----------------
 	 */
 	otuple->next = -1;
-	otuple->tuple = RELADDR(LONGALIGN(((char *) otuple + sizeof(*otuple))));
+	otuple->tuple = RELADDR(MAXALIGN(((char *) otuple + sizeof(*otuple))));
 	memmove(ABSADDR(otuple->tuple),
 			heapTuple,
 			HEAPTUPLESIZE);
@@ -690,10 +722,10 @@ ExecScanHashBucket(HashJoinState *hjstate,
 	{
 		if (curtuple == NULL)
 			heapTuple = (HeapTuple)
-				LONGALIGN(ABSADDR(bucket->top));
+				MAXALIGN(ABSADDR(bucket->top));
 		else
 			heapTuple = (HeapTuple)
-				LONGALIGN(((char *) curtuple + curtuple->t_len + HEAPTUPLESIZE));
+				MAXALIGN(((char *) curtuple + curtuple->t_len + HEAPTUPLESIZE));
 
 		while (heapTuple < (HeapTuple) ABSADDR(bucket->bottom))
 		{
@@ -713,7 +745,7 @@ ExecScanHashBucket(HashJoinState *hjstate,
 				return heapTuple;
 
 			heapTuple = (HeapTuple)
-				LONGALIGN(((char *) heapTuple + heapTuple->t_len + HEAPTUPLESIZE));
+				MAXALIGN(((char *) heapTuple + heapTuple->t_len + HEAPTUPLESIZE));
 		}
 
 		if (firstotuple == NULL)
@@ -810,48 +842,12 @@ hashFunc(Datum key, int len, bool byVal)
 	return h % PRIME2;
 }
 
 /* ----------------------------------------------------------------
- *		ExecHashPartition
- *
- *		determine the number of batches needed for a hashjoin
- * ----------------------------------------------------------------
- */
-static int
-ExecHashPartition(Hash *node)
-{
-	Plan	   *outerNode;
-	int			b;
-	int			pages;
-	int			ntuples;
-	int			tupsize;
-
-	/*
-	 * get size information for plan node
-	 */
-	outerNode = outerPlan(node);
-	ntuples = outerNode->plan_size;
-	if (ntuples == 0)
-		ntuples = 1000;
-	tupsize = outerNode->plan_width + sizeof(HeapTupleData);
-	pages = ceil((double) ntuples * tupsize * FUDGE_FAC / BLCKSZ);
-
-	/*
-	 * if amount of buffer space below hashjoin threshold, return negative
-	 */
-	if (ceil(sqrt((double) pages)) > HashTBSize)
-		return -1;
-	if (pages <= HashTBSize)
-		b = 0;					/* fit in memory, no partitioning */
-	else
-		b = ceil((double) (pages - HashTBSize) / (double) (HashTBSize - 1));
-
-	return b;
-}
-
-/* ----------------------------------------------------------------
  *		ExecHashTableReset
  *
  *		reset hash table header for new batch
  *
  *		ntuples is the number of tuples in the inner relation's batch
  * ----------------------------------------------------------------
  */
 void
@@ -860,29 +856,42 @@ ExecHashTableReset(HashJoinTable hashtable, int ntuples)
 	int			i;
 	HashBucket	bucket;
 
-	hashtable->nbuckets = hashtable->totalbuckets
-		= ceil((double) ntuples / NTUP_PER_BUCKET);
+	/*
+	 * We can reset the number of hashbuckets since we are going to
+	 * recalculate the hash values of all the tuples in the new batch
+	 * anyway.  We might as well spread out the hash values as much as
+	 * we can within the available space.  Note we must set nbuckets
+	 * equal to totalbuckets since we will NOT generate any new output
+	 * batches after this point.
+	 */
+	hashtable->nbuckets = hashtable->totalbuckets =
+		(int) (hashtable->bottom / (hashtable->bucketsize * FUDGE_FAC));
+
+	/*
+	 * reinitialize the overflow area to empty, and reinit each hash bucket.
+	 */
+	hashtable->overflownext = hashtable->top + hashtable->bucketsize *
+		hashtable->nbuckets;
+	Assert(hashtable->overflownext < hashtable->bottom);
 
 	bucket = (HashBucket) ABSADDR(hashtable->top);
 	for (i = 0; i < hashtable->nbuckets; i++)
 	{
-		bucket->top = RELADDR((char *) bucket + sizeof(*bucket));
+		bucket->top = RELADDR((char *) bucket + MAXALIGN(sizeof(*bucket)));
 		bucket->bottom = bucket->top;
 		bucket->firstotuple = bucket->lastotuple = -1;
 		bucket = (HashBucket) ((char *) bucket + hashtable->bucketsize);
 	}
 
 	hashtable->pcount = hashtable->nprocess;
 }
 
-static int	hjtmpcnt = 0;
-
 static void
 mk_hj_temp(char *tempname)
 {
-	snprintf(tempname, strlen(tempname), "HJ%d.%d", (int) MyProcPid, hjtmpcnt);
+	static int	hjtmpcnt = 0;
+
+	snprintf(tempname, HJ_TEMP_NAMELEN, "HJ%d.%d", (int) MyProcPid, hjtmpcnt);
 	hjtmpcnt = (hjtmpcnt + 1) % 1000;
 }
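/*
 * [Reviewer note, not part of the patch] The mk_hj_temp() change fixes a
 * real bug: the old code passed strlen(tempname) as the snprintf() limit,
 * but tempname points at a freshly allocated, uninitialized buffer, so
 * its strlen() is undefined.  Sizing both the allocation (formerly a
 * hardwired 12) and the snprintf() bound with HJ_TEMP_NAMELEN (16) makes
 * the truncation limit match the actual buffer size.
 */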