@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.60 2006/10/04 00:29:48 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.61 2006/11/19 21:33:23 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages.  The opaque
@@ -32,9 +32,11 @@
#include "access/hash.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/lsyscache.h"
static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
static void _hash_splitbucket(Relation rel, Buffer metabuf,
                  Bucket obucket, Bucket nbucket,
                  BlockNumber start_oblkno,
@@ -102,21 +104,18 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
* requested buffer and its reference count has been incremented
* (ie, the buffer is "locked and pinned").
*
* XXX P_NEW is not used because, unlike the tree structures, we
* need the bucket blocks to be at certain block numbers.
* blkno == P_NEW is allowed, but it is caller's responsibility to
* ensure that only one process can extend the index at a time.
*
* All call sites should call either _hash_pageinit or _hash_checkpage
* All call sites should call either _hash_checkpage or _hash_pageinit
* on the returned page, depending on whether the block is expected
* to be new or not.
* to be valid or not.
*/
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;
    if (blkno == P_NEW)
        elog(ERROR, "hash AM does not use P_NEW");
    buf = ReadBuffer(rel, blkno);
    if (access != HASH_NOLOCK)
@@ -237,7 +236,14 @@ _hash_metapinit(Relation rel)
    if (ffactor < 10)
        ffactor = 10;
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
    /*
     * We initialize the metapage, the first two bucket pages, and the
     * first bitmap page in sequence, using P_NEW to cause smgrextend()
     * calls to occur.  This ensures that the smgr level has the right
     * idea of the physical index length.
     */
    metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
    Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
    pg = BufferGetPage(metabuf);
    _hash_pageinit(pg, BufferGetPageSize(metabuf));
@@ -290,7 +296,8 @@ _hash_metapinit(Relation rel)
     */
    for (i = 0; i <= 1; i++)
    {
        buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
        buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
        Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
        pg = BufferGetPage(buf);
        _hash_pageinit(pg, BufferGetPageSize(buf));
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -303,8 +310,7 @@ _hash_metapinit(Relation rel)
    }
    /*
     * Initialize first bitmap page.  Can't do this until we create the first
     * two buckets, else smgr will complain.
     * Initialize first bitmap page
     */
    _hash_initbitmap(rel, metap, 3);
@@ -339,6 +345,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
    Bucket      old_bucket;
    Bucket      new_bucket;
    uint32      spare_ndx;
    BlockNumber firstblock = InvalidBlockNumber;
    BlockNumber start_oblkno;
    BlockNumber start_nblkno;
    uint32      maxbucket;
@@ -376,6 +383,40 @@ _hash_expandtable(Relation rel, Buffer metabuf)
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
        goto fail;
    /*
     * Can't split anymore if maxbucket has reached its maximum possible value.
     *
     * Ideally we'd allow bucket numbers up to UINT_MAX - 1 (no higher because
     * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     * to half that because of overflow looping in _hash_log2() and
     * insufficient space in hashm_spares[].  It's moot anyway because an
     * index with 2^32 buckets would certainly overflow BlockNumber and
     * hence _hash_alloc_buckets() would fail, but if we supported buckets
     * smaller than a disk block then this would be an independent constraint.
     */
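    /*
     * For illustration: 0x7FFFFFFE is 2^31 - 2, i.e. roughly half of
     * UINT_MAX, matching the restriction described above.
     */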
    if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
        goto fail;
    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to allocate a new batch of bucket pages.
     */
    new_bucket = metap->hashm_maxbucket + 1;
    spare_ndx = _hash_log2(new_bucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
        /*
         * The number of buckets in the new splitpoint is equal to the
         * total number already in existence, i.e. new_bucket.  Currently
         * this maps one-to-one to blocks required, but someday we may need
         * a more complicated calculation here.
         */
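        /*
         * For example (illustrative numbers): with hashm_maxbucket = 3,
         * buckets 0..3 exist, new_bucket is 4, and _hash_log2(5) is 3; if
         * hashm_ovflpoint is still 2 we are starting splitpoint 3, which
         * holds buckets 4..7, so new_bucket (4) bucket pages get allocated.
         */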
        firstblock = _hash_alloc_buckets(rel, new_bucket);
        if (firstblock == InvalidBlockNumber)
            goto fail;          /* can't split due to BlockNumber overflow */
    }
    /*
     * Determine which bucket is to be split, and attempt to lock the old
     * bucket.  If we can't get the lock, give up.
@@ -389,7 +430,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     * lock.  This should be okay because no one else should be trying to lock
     * the new bucket yet...
     */
    new_bucket = metap->hashm_maxbucket + 1;
    old_bucket = (new_bucket & metap->hashm_lowmask);
    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
@@ -425,14 +465,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     * increases), we need to adjust the hashm_spares[] array and
     * hashm_ovflpoint so that future overflow pages will be created beyond
     * this new batch of bucket pages.
     *
     * XXX should initialize new bucket pages to prevent out-of-order page
     * creation?  Don't wanna do it right here though.
     */
    spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
        metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
        metap->hashm_ovflpoint = spare_ndx;
    }
@@ -440,6 +475,12 @@ _hash_expandtable(Relation rel, Buffer metabuf)
    /* now we can compute the new bucket's primary block number */
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
    /* if we added a splitpoint, should match result of _hash_alloc_buckets */
    if (firstblock != InvalidBlockNumber &&
        firstblock != start_nblkno)
        elog(PANIC, "unexpected hash relation size: %u, should be %u",
             firstblock, start_nblkno);
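    /*
     * Informally, why these should match: before the extension the index
     * consisted of the metapage, one page per pre-existing bucket (new_bucket
     * of them), and the overflow/bitmap pages already counted in
     * hashm_spares, so the relation length _hash_alloc_buckets() observed is
     * exactly the block number that BUCKET_TO_BLKNO computes for new_bucket.
     */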
    Assert(!_hash_has_active_scan(rel, new_bucket));
    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
@@ -487,6 +528,79 @@ fail:
}
/*
* _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
*
* This does not need to initialize the new bucket pages; we'll do that as
* each one is used by _hash_expandtable().  But we have to extend the logical
* EOF to the end of the splitpoint; otherwise the first overflow page
* allocated beyond the splitpoint will represent a noncontiguous access,
* which can confuse md.c (and will probably be forbidden by future changes
* to md.c).
*
* We do this by writing a page of zeroes at the end of the splitpoint range.
* We expect that the filesystem will ensure that the intervening pages read
* as zeroes too.  On many filesystems this "hole" will not be allocated
* immediately, which means that the index file may end up more fragmented
* than if we forced it all to be allocated now; but since we don't scan
* hash indexes sequentially anyway, that probably doesn't matter.
*
* XXX It's annoying that this code is executed with the metapage lock held.
* We need to interlock against _hash_getovflpage() adding a new overflow page
* concurrently, but it'd likely be better to use LockRelationForExtension
* for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
* so it may not be worth worrying about.
*
* Returns the first block number in the new splitpoint's range, or
* InvalidBlockNumber if allocation failed due to BlockNumber overflow.
*/
static BlockNumber
_hash_alloc_buckets(Relation rel, uint32 nblocks)
{
    BlockNumber firstblock;
    BlockNumber lastblock;
    BlockNumber endblock;
    char        zerobuf[BLCKSZ];
    /*
     * Since we hold metapage lock, no one else is either splitting or
     * allocating a new page in _hash_getovflpage(); hence it's safe to
     * assume that the relation length isn't changing under us.
     */
    firstblock = RelationGetNumberOfBlocks(rel);
    lastblock = firstblock + nblocks - 1;
    /*
     * Check for overflow in block number calculation; if so, we cannot
     * extend the index anymore.
     */
    if (lastblock < firstblock || lastblock == InvalidBlockNumber)
        return InvalidBlockNumber;
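    /*
     * For example (illustrative numbers): if firstblock were 0xFFFFFF00 and
     * nblocks were 0x200, lastblock would wrap around to a small value and
     * the first test above would fire; the second test catches lastblock
     * landing exactly on InvalidBlockNumber (0xFFFFFFFF).
     */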
    /* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
    MemSet(zerobuf, 0, sizeof(zerobuf));
    /*
     * XXX If the extension results in creation of new segment files, we have
     * to make sure that each non-last file is correctly filled out to
     * RELSEG_SIZE blocks.  This ought to be done inside mdextend, but
     * changing the smgr API seems best left for development cycle not late
     * beta.  Temporary fix for bug #2737.
     */
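    /*
     * For example (illustrative numbers, assuming the default RELSEG_SIZE of
     * 131072 blocks, i.e. 1GB segments with 8K blocks): if firstblock is
     * 131000 and nblocks is 1024, lastblock is 132023; the loop below writes
     * a zero page at block 131071, the last block of segment 0, and the
     * final smgrextend() then writes block 132023 in segment 1.
     */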
#ifndef LET_OS_MANAGE_FILESIZE
    for (endblock = firstblock | (RELSEG_SIZE - 1);
         endblock < lastblock;
         endblock += RELSEG_SIZE)
        smgrextend(rel->rd_smgr, endblock, zerobuf, rel->rd_istemp);
#endif
    smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
    return firstblock;
}
/*
* _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
*