Further work on connecting the free space map (which is still just a stub) into the rest of the system.

Adopt a cleaner approach to preventing deadlock in concurrent heap_updates: allow RelationGetBufferForTuple to select any page of the rel, and put the onus on it to lock both buffers in a consistent order. Remove no-longer-needed isExtend hack from API of ReleaseAndReadBuffer.
25 years ago · af5ced9cfd
parent 0eab92c0e6
commit af5ced9cfd
12 changed files with 379 additions and 231 deletions
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.120 2001/06/27 23:31:38 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.121 2001/06/29 21:08:23 tgl Exp $
 *
 *
 * INTERFACE ROUTINES
@ -202,8 +202,7 @@ heapgettup(Relation relation,

 		*buffer = ReleaseAndReadBuffer(*buffer,
 									   relation,
-									   ItemPointerGetBlockNumber(tid),
-									   false);
+									   ItemPointerGetBlockNumber(tid));
 		if (!BufferIsValid(*buffer))
 			elog(ERROR, "heapgettup: failed ReadBuffer");

@ -238,8 +237,7 @@ heapgettup(Relation relation,

 		*buffer = ReleaseAndReadBuffer(*buffer,
 									   relation,
-									   page,
-									   false);
+									   page);
 		if (!BufferIsValid(*buffer))
 			elog(ERROR, "heapgettup: failed ReadBuffer");

@ -280,8 +278,7 @@ heapgettup(Relation relation,

 		*buffer = ReleaseAndReadBuffer(*buffer,
 									   relation,
-									   page,
-									   false);
+									   page);
 		if (!BufferIsValid(*buffer))
 			elog(ERROR, "heapgettup: failed ReadBuffer");

@ -374,8 +371,7 @@ heapgettup(Relation relation,

 		*buffer = ReleaseAndReadBuffer(*buffer,
 									   relation,
-									   page,
-									   false);
+									   page);
 		if (!BufferIsValid(*buffer))
 			elog(ERROR, "heapgettup: failed ReadBuffer");

@ -1088,8 +1084,8 @@ heap_insert(Relation relation, HeapTuple tup)
 		heap_tuple_toast_attrs(relation, tup, NULL);
 #endif

-	/* Find buffer for this tuple */
-	buffer = RelationGetBufferForTuple(relation, tup->t_len, 0);
+	/* Find buffer to insert this tuple into */
+	buffer = RelationGetBufferForTuple(relation, tup->t_len, InvalidBuffer);

 	/* NO ELOG(ERROR) from here till changes are logged */
 	START_CRIT_SECTION();
@ -1501,18 +1497,16 @@ l2:
 		 * buffer locks on both old and new pages.  To avoid deadlock against
 		 * some other backend trying to get the same two locks in the other
 		 * order, we must be consistent about the order we get the locks in.
-		 * We use the rule "lock the higher-numbered page of the relation
+		 * We use the rule "lock the lower-numbered page of the relation
 		 * first".  To implement this, we must do RelationGetBufferForTuple
-		 * while not holding the lock on the old page, and we must tell it
-		 * to give us a page beyond the old page.
+		 * while not holding the lock on the old page, and we must rely on it
+		 * to get the locks on both pages in the correct order.
 		 */
 		if (newtupsize > pagefree)
 		{
 			/* Assume there's no chance to put newtup on same page. */
 			newbuf = RelationGetBufferForTuple(relation, newtup->t_len,
-											BufferGetBlockNumber(buffer) + 1);
-			/* Now reacquire lock on old tuple's page. */
-			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+											   buffer);
 		}
 		else
 		{
@ -1529,8 +1523,7 @@ l2:
 				 */
 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 				newbuf = RelationGetBufferForTuple(relation, newtup->t_len,
-											BufferGetBlockNumber(buffer) + 1);
-				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+												   buffer);
 			}
 			else
 			{
@ -1550,7 +1543,8 @@ l2:

 	/*
 	 * At this point newbuf and buffer are both pinned and locked,
-	 * and newbuf has enough space for the new tuple.
+	 * and newbuf has enough space for the new tuple.  If they are
+	 * the same buffer, only one pin is held.
 	 */

 	/* NO ELOG(ERROR) from here till changes are logged */
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Id: hio.c,v 1.40 2001/06/27 23:31:38 tgl Exp $
+ *	  $Id: hio.c,v 1.41 2001/06/29 21:08:23 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -17,6 +17,8 @@

 #include "access/heapam.h"
 #include "access/hio.h"
+#include "storage/freespace.h"
+

 /*
 * RelationPutHeapTuple - place tuple at specified page
@ -66,16 +68,29 @@ RelationPutHeapTuple(Relation relation,
 /*
 * RelationGetBufferForTuple
 *
- *	Returns exclusive-locked buffer with free space >= given len,
- *	being careful to select only a page at or beyond minblocknum
- *	in the relation.
+ *	Returns pinned and exclusive-locked buffer of a page in given relation
+ *	with free space >= given len.
+ *
+ *	If otherBuffer is not InvalidBuffer, then it references a previously
+ *	pinned buffer of another page in the same relation; on return, this
+ *	buffer will also be exclusive-locked.  (This case is used by heap_update;
+ *	the otherBuffer contains the tuple being updated.)
 *
- *	The minblocknum parameter is needed to prevent deadlock between
- *	concurrent heap_update operations; see heap_update for details.
- *	Pass zero if you don't particularly care which page you get.
+ *	The reason for passing otherBuffer is that if two backends are doing
+ *	concurrent heap_update operations, a deadlock could occur if they try
+ *	to lock the same two buffers in opposite orders.  To ensure that this
+ *	can't happen, we impose the rule that buffers of a relation must be
+ *	locked in increasing page number order.  This is most conveniently done
+ *	by having RelationGetBufferForTuple lock them both, with suitable care
+ *	for ordering.
 *
- *	Note that we use LockPage to lock relation for extension. We can
- *	do this as long as in all other places we use page-level locking
+ *	NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
+ *	same buffer we select for insertion of the new tuple (this could only
+ *	happen if space is freed in that page after heap_update finds there's not
+ *	enough there).  In that case, the page will be pinned and locked only once.
+ *
+ *	Note that we use LockPage(rel, 0) to lock relation for extension.
+ *	We can do this as long as in all other places we use page-level locking
 *	for indices only. Alternatively, we could define pseudo-table as
 *	we do for transactions with XactLockTable.
 *
@ -84,12 +99,13 @@ RelationPutHeapTuple(Relation relation,
 */
 Buffer
 RelationGetBufferForTuple(Relation relation, Size len,
-						  BlockNumber minblocknum)
+						  Buffer otherBuffer)
 {
 	Buffer		buffer = InvalidBuffer;
 	Page		pageHeader;
-	BlockNumber lastblock,
-				oldnblocks;
+	Size		pageFreeSpace;
+	BlockNumber targetBlock,
+				otherBlock;

 	len = MAXALIGN(len);		/* be conservative */

@ -100,36 +116,118 @@ RelationGetBufferForTuple(Relation relation, Size len,
 		elog(ERROR, "Tuple is too big: size %lu, max size %ld",
 			 (unsigned long) len, MaxTupleSize);

+	if (otherBuffer != InvalidBuffer)
+		otherBlock = BufferGetBlockNumber(otherBuffer);
+	else
+		otherBlock = InvalidBlockNumber; /* just to keep compiler quiet */
+
 	/*
-	 * First, use relcache's record of table length to guess where the
-	 * last page is, and try to put the tuple there.  This cached value
-	 * may be out of date, in which case we'll be inserting into a non-last
-	 * page, but that should be OK.  Note that in a newly created relcache
-	 * entry, rd_nblocks may be zero; if so, we'll set it correctly below.
+	 * We first try to put the tuple on the same page we last inserted a
+	 * tuple on, as cached in the relcache entry.  If that doesn't work,
+	 * we ask the shared Free Space Map to locate a suitable page.  Since
+	 * the FSM's info might be out of date, we have to be prepared to loop
+	 * around and retry multiple times.  (To insure this isn't an infinite
+	 * loop, we must update the FSM with the correct amount of free space on
+	 * each page that proves not to be suitable.)  If the FSM has no record of
+	 * a page with enough free space, we give up and extend the relation.
 	 */
-	if (relation->rd_nblocks > 0)
+
+	targetBlock = relation->rd_targblock;
+
+	if (targetBlock == InvalidBlockNumber)
+	{
+		/*
+		 * We have no cached target page, so ask the FSM for an initial
+		 * target.
+		 */
+		targetBlock = GetPageWithFreeSpace(&relation->rd_node, len);
+		/*
+		 * If the FSM knows nothing of the rel, try the last page before
+		 * we give up and extend.  This avoids one-tuple-per-page syndrome
+		 * during bootstrapping or in a recently-started system.
+		 */
+		if (targetBlock == InvalidBlockNumber)
+		{
+			BlockNumber	nblocks = RelationGetNumberOfBlocks(relation);
+
+			if (nblocks > 0)
+				targetBlock = nblocks - 1;
+		}
+	}
+
+	while (targetBlock != InvalidBlockNumber)
 	{
-		lastblock = relation->rd_nblocks - 1;
-		if (lastblock >= minblocknum)
+		/*
+		 * Read and exclusive-lock the target block, as well as the
+		 * other block if one was given, taking suitable care with
+		 * lock ordering and the possibility they are the same block.
+		 */
+		if (otherBuffer == InvalidBuffer)
+		{
+			/* easy case */
+			buffer = ReadBuffer(relation, targetBlock);
+			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+		}
+		else if (otherBlock == targetBlock)
+		{
+			/* also easy case */
+			buffer = otherBuffer;
+			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+		}
+		else if (otherBlock < targetBlock)
 		{
-			buffer = ReadBuffer(relation, lastblock);
+			/* lock other buffer first */
+			buffer = ReadBuffer(relation, targetBlock);
+			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-			pageHeader = (Page) BufferGetPage(buffer);
-			if (len <= PageGetFreeSpace(pageHeader))
-				return buffer;
-			/*
-			 * Doesn't fit, so we'll have to try someplace else.
-			 */
-			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
-			/* buffer release will happen below... */
 		}
+		else
+		{
+			/* lock target buffer first */
+			buffer = ReadBuffer(relation, targetBlock);
+			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+		}
+		/*
+		 * Now we can check to see if there's enough free space here.
+		 * If so, we're done.
+		 */
+		pageHeader = (Page) BufferGetPage(buffer);
+		pageFreeSpace = PageGetFreeSpace(pageHeader);
+		if (len <= pageFreeSpace)
+		{
+			/* use this page as future insert target, too */
+			relation->rd_targblock = targetBlock;
+			return buffer;
+		}
+		/*
+		 * Not enough space, so we must give up our page locks and
+		 * pin (if any) and prepare to look elsewhere.  We don't care
+		 * which order we unlock the two buffers in, so this can be
+		 * slightly simpler than the code above.
+		 */
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		if (otherBuffer == InvalidBuffer)
+		{
+			ReleaseBuffer(buffer);
+		}
+		else if (otherBlock != targetBlock)
+		{
+			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
+			ReleaseBuffer(buffer);
+		}
+		/*
+		 * Update FSM as to condition of this page, and ask for another
+		 * page to try.
+		 */
+		targetBlock = RecordAndGetPageWithFreeSpace(&relation->rd_node,
+													targetBlock,
+													pageFreeSpace,
+													len);
 	}

 	/*
-	 * Before extending relation, make sure no one else has done
-	 * so more recently than our last rd_nblocks update.  (If we
-	 * blindly extend the relation here, then probably most of the
-	 * page the other guy added will end up going to waste.)
+	 * Have to extend the relation.
 	 *
 	 * We have to use a lock to ensure no one else is extending the
 	 * rel at the same time, else we will both try to initialize the
@ -138,51 +236,27 @@ RelationGetBufferForTuple(Relation relation, Size len,
 	if (!relation->rd_myxactonly)
 		LockPage(relation, 0, ExclusiveLock);

-	oldnblocks = relation->rd_nblocks;
 	/*
 	 * XXX This does an lseek - rather expensive - but at the moment it is
 	 * the only way to accurately determine how many blocks are in a
 	 * relation.  Is it worth keeping an accurate file length in shared
 	 * memory someplace, rather than relying on the kernel to do it for us?
 	 */
-	relation->rd_nblocks = RelationGetNumberOfBlocks(relation);
+	buffer = ReadBuffer(relation, P_NEW);

-	if (relation->rd_nblocks > oldnblocks)
-	{
-		/*
-		 * Someone else has indeed extended the relation recently.
-		 * Try to fit our tuple into the new last page.
-		 */
-		lastblock = relation->rd_nblocks - 1;
-		if (lastblock >= minblocknum)
-		{
-			buffer = ReleaseAndReadBuffer(buffer, relation, lastblock, false);
-			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-			pageHeader = (Page) BufferGetPage(buffer);
-			if (len <= PageGetFreeSpace(pageHeader))
-			{
-				/* OK, we don't need to extend again. */
-				if (!relation->rd_myxactonly)
-					UnlockPage(relation, 0, ExclusiveLock);
-				return buffer;
-			}
-			/*
-			 * Doesn't fit, so we'll have to extend the relation (again).
-			 */
-			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
-			/* buffer release will happen below... */
-		}
-	}
+	/*
+	 * Release the file-extension lock; it's now OK for someone else
+	 * to extend the relation some more.
+	 */
+	if (!relation->rd_myxactonly)
+		UnlockPage(relation, 0, ExclusiveLock);

 	/*
-	 * Extend the relation by one page and update rd_nblocks for next time.
-	 *
-	 * Note: at this point minblocknum is ignored; we won't extend by more
-	 * than one block...
+	 * We can be certain that locking the otherBuffer first is OK,
+	 * since it must have a lower page number.
 	 */
-	lastblock = relation->rd_nblocks;
-	buffer = ReleaseAndReadBuffer(buffer, relation, lastblock, true);
-	relation->rd_nblocks = lastblock + 1;
+	if (otherBuffer != InvalidBuffer)
+		LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);

 	/*
 	 * We need to initialize the empty new page.
@ -192,19 +266,22 @@ RelationGetBufferForTuple(Relation relation, Size len,
 	Assert(PageIsNew((PageHeader) pageHeader));
 	PageInit(pageHeader, BufferGetPageSize(buffer), 0);

-	/*
-	 * Release the file-extension lock; it's now OK for someone else
-	 * to extend the relation some more.
-	 */
-	if (!relation->rd_myxactonly)
-		UnlockPage(relation, 0, ExclusiveLock);
-
 	if (len > PageGetFreeSpace(pageHeader))
 	{
 		/* We should not get here given the test at the top */
-		elog(STOP, "Tuple is too big: size %lu",
-			 (unsigned long) len);
+		elog(STOP, "Tuple is too big: size %lu", (unsigned long) len);
 	}

+	/*
+	 * Remember the new page as our target for future insertions.
+	 *
+	 * XXX should we enter the new page into the free space map immediately,
+	 * or just keep it for this backend's exclusive use in the short run
+	 * (until VACUUM sees it)?  Seems to depend on whether you expect the
+	 * current backend to make more insertions or not, which is probably a
+	 * good bet most of the time.  So for now, don't add it to FSM yet.
+	 */
+	relation->rd_targblock = BufferGetBlockNumber(buffer);
+
 	return buffer;
 }
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@ -6,7 +6,7 @@
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Header: /cvsroot/pgsql/src/backend/access/transam/xlogutils.c,v 1.15 2001/03/22 03:59:18 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/xlogutils.c,v 1.16 2001/06/29 21:08:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -171,6 +171,7 @@ XLogOpenLogRelation(void)
 	sprintf(RelationGetPhysicalRelationName(logRelation), "pg_log");
 	logRelation->rd_node.tblNode = InvalidOid;
 	logRelation->rd_node.relNode = RelOid_pg_log;
+	logRelation->rd_targblock = InvalidBlockNumber;
 	logRelation->rd_fd = -1;
 	logRelation->rd_fd = smgropen(DEFAULT_SMGR, logRelation, false);
 	if (logRelation->rd_fd < 0)
@ -384,6 +385,7 @@ XLogOpenRelation(bool redo, RmgrId rmid, RelFileNode rnode)

 		hentry->rdesc = res;

+		res->reldata.rd_targblock = InvalidBlockNumber;
 		res->reldata.rd_fd = -1;
 		res->reldata.rd_fd = smgropen(DEFAULT_SMGR, &(res->reldata),
 									  true /* allow failure */ );
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/catalog/heap.c,v 1.169 2001/06/27 23:31:38 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/catalog/heap.c,v 1.170 2001/06/29 21:08:24 tgl Exp $
 *
 *
 * INTERFACE ROUTINES
@ -39,7 +39,6 @@
 #include "catalog/pg_attrdef.h"
 #include "catalog/pg_inherits.h"
 #include "catalog/pg_index.h"
-#include "catalog/pg_proc.h"
 #include "catalog/pg_relcheck.h"
 #include "catalog/pg_statistic.h"
 #include "catalog/pg_type.h"
@ -50,8 +49,6 @@
 #include "optimizer/planmain.h"
 #include "optimizer/prep.h"
 #include "optimizer/var.h"
-#include "nodes/makefuncs.h"
-#include "parser/parse_clause.h"
 #include "parser/parse_expr.h"
 #include "parser/parse_relation.h"
 #include "parser/parse_target.h"
@ -59,7 +56,6 @@
 #include "rewrite/rewriteRemove.h"
 #include "storage/smgr.h"
 #include "utils/builtins.h"
-#include "utils/catcache.h"
 #include "utils/fmgroids.h"
 #include "utils/lsyscache.h"
 #include "utils/relcache.h"
@ -197,25 +193,18 @@ heap_create(char *relname,
 	static unsigned int uniqueId = 0;

 	Oid			relid;
-	Relation	rel;
-	bool		nailme = false;
-	int			natts = tupDesc->natts;
-	int			i;
-	MemoryContext oldcxt;
 	Oid			tblNode = MyDatabaseId;
+	bool		nailme = false;
+	Relation	rel;

 	/*
 	 * sanity checks
 	 */
-	AssertArg(natts > 0);
-
 	if (relname && !allow_system_table_mods &&
 		IsSystemRelationName(relname) && IsNormalProcessingMode())
-	{
 		elog(ERROR, "Illegal class name '%s'"
 			 "\n\tThe 'pg_' name prefix is reserved for system catalogs",
 			 relname);
-	}

 	/*
 	 * Real ugly stuff to assign the proper relid in the relation
@ -276,70 +265,20 @@ heap_create(char *relname,

 	if (istemp)
 	{
-
 		/*
 		 * replace relname of caller with a unique name for a temp
 		 * relation
 		 */
 		snprintf(relname, NAMEDATALEN, "%s_%d_%u",
-				PG_TEMP_REL_PREFIX, (int) MyProcPid, uniqueId++);
+				 PG_TEMP_REL_PREFIX, (int) MyProcPid, uniqueId++);
 	}

 	/*
-	 * switch to the cache context to create the relcache entry.
-	 */
-	if (!CacheMemoryContext)
-		CreateCacheMemoryContext();
-
-	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
-
-	/*
-	 * allocate a new relation descriptor.
+	 * build the relcache entry.
 	 */
-	rel = (Relation) palloc(sizeof(RelationData));
-	MemSet((char *) rel, 0, sizeof(RelationData));
-	rel->rd_fd = -1;			/* physical file is not open */
-
-	RelationSetReferenceCount(rel, 1);
-
-	/*
-	 * create a new tuple descriptor from the one passed in
-	 */
-	rel->rd_att = CreateTupleDescCopyConstr(tupDesc);
-
-	/*
-	 * nail the reldesc if this is a bootstrap create reln and we may need
-	 * it in the cache later on in the bootstrap process so we don't ever
-	 * want it kicked out.	e.g. pg_attribute!!!
-	 */
-	if (nailme)
-		rel->rd_isnailed = true;
-
-	/*
-	 * initialize the fields of our new relation descriptor
-	 */
-	rel->rd_rel = (Form_pg_class) palloc(sizeof *rel->rd_rel);
-	MemSet((char *) rel->rd_rel, 0, sizeof *rel->rd_rel);
-	strcpy(RelationGetPhysicalRelationName(rel), relname);
-	rel->rd_rel->relkind = RELKIND_UNCATALOGED;
-	rel->rd_rel->relnatts = natts;
-	rel->rd_rel->reltype = InvalidOid;
-	if (tupDesc->constr)
-		rel->rd_rel->relchecks = tupDesc->constr->num_check;
-
-	for (i = 0; i < natts; i++)
-		rel->rd_att->attrs[i]->attrelid = relid;
-
-	RelationGetRelid(rel) = relid;
-
-	rel->rd_node.tblNode = tblNode;
-	rel->rd_node.relNode = relid;
-	rel->rd_rel->relfilenode = relid;
-
-	/*
-	 * done building relcache entry.
-	 */
-	MemoryContextSwitchTo(oldcxt);
+	rel = RelationBuildLocalRelation(relname, tupDesc,
+									 relid, tblNode,
+									 nailme);

 	/*
 	 * have the storage manager create the relation.
@ -347,8 +286,6 @@ heap_create(char *relname,
 	if (storage_create)
 		heap_storage_create(rel);

-	RelationRegisterRelation(rel);
-
 	return rel;
 }

--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/sequence.c,v 1.60 2001/06/23 00:07:34 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/sequence.c,v 1.61 2001/06/29 21:08:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -168,21 +168,26 @@ DefineSequence(CreateSeqStmt *seq)
 	DefineRelation(stmt, RELKIND_SEQUENCE);

 	rel = heap_openr(seq->seqname, AccessExclusiveLock);
-
 	tupDesc = RelationGetDescr(rel);

-	Assert(RelationGetNumberOfBlocks(rel) == 0);
+	/* Initialize first page of relation with special magic number */
+
 	buf = ReadBuffer(rel, P_NEW);

 	if (!BufferIsValid(buf))
 		elog(ERROR, "DefineSequence: ReadBuffer failed");

+	Assert(BufferGetBlockNumber(buf) == 0);
+
 	page = (PageHeader) BufferGetPage(buf);

 	PageInit((Page) page, BufferGetPageSize(buf), sizeof(sequence_magic));
 	sm = (sequence_magic *) PageGetSpecialPointer(page);
 	sm->magic = SEQ_MAGIC;

+	/* hack: ensure heap_insert will insert on the just-created page */
+	rel->rd_targblock = 0;
+
 	/* Now - form & insert sequence tuple */
 	tuple = heap_formtuple(tupDesc, value, null);
 	heap_insert(rel, tuple);
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.113 2001/06/22 19:16:22 wieck Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.114 2001/06/29 21:08:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -91,7 +91,7 @@ extern void AbortBufferIO(void);
 #define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY))

 static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum,
-								 bool isExtend, bool bufferLockHeld);
+								 bool bufferLockHeld);
 static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
 							   bool *foundPtr);
 static int	ReleaseBufferWithBufferLock(Buffer buffer);
@ -103,7 +103,9 @@ void		PrintBufferDescs(void);
 * ReadBuffer -- returns a buffer containing the requested
 *		block of the requested relation.  If the blknum
 *		requested is P_NEW, extend the relation file and
- *		allocate a new block.
+ *		allocate a new block.  (Caller is responsible for
+ *		ensuring that only one backend tries to extend a
+ *		relation at the same time!)
 *
 * Returns: the buffer number for the buffer containing
 *		the block read, or NULL on an error.  If successful,
@ -111,6 +113,8 @@ void		PrintBufferDescs(void);
 *
 * Assume when this function is called, that reln has been
 *		opened already.
+ *
+ * Note: a side effect of a P_NEW call is to update reln->rd_nblocks.
 */

 #undef ReadBuffer				/* conflicts with macro when BUFMGR_DEBUG
@ -122,28 +126,26 @@ void		PrintBufferDescs(void);
 Buffer
 ReadBuffer(Relation reln, BlockNumber blockNum)
 {
-	return ReadBufferInternal(reln, blockNum, false, false);
+	return ReadBufferInternal(reln, blockNum, false);
 }

 /*
 * ReadBufferInternal -- internal version of ReadBuffer with more options
 *
- * isExtend: if true, assume that we are extending the file and the caller
- * is passing the current EOF block number (ie, caller already called
- * smgrnblocks()).
- *
 * bufferLockHeld: if true, caller already acquired the bufmgr spinlock.
 * (This is assumed never to be true if dealing with a local buffer!)
 */
 static Buffer
 ReadBufferInternal(Relation reln, BlockNumber blockNum,
-				   bool isExtend, bool bufferLockHeld)
+				   bool bufferLockHeld)
 {
 	BufferDesc *bufHdr;
 	int			status;
 	bool		found;
+	bool		isExtend;
 	bool		isLocalBuf;

+	isExtend = (blockNum == P_NEW);
 	isLocalBuf = reln->rd_myxactonly;

 	if (isLocalBuf)
@ -151,11 +153,10 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
 		ReadLocalBufferCount++;
 		pgstat_count_buffer_read(&reln->pgstat_info, reln);
 		/* Substitute proper block number if caller asked for P_NEW */
-		if (blockNum == P_NEW)
+		if (isExtend)
 		{
 			blockNum = reln->rd_nblocks;
 			reln->rd_nblocks++;
-			isExtend = true;
 		}
 		bufHdr = LocalBufferAlloc(reln, blockNum, &found);
 		if (found)
@ -169,10 +170,11 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
 		ReadBufferCount++;
 		pgstat_count_buffer_read(&reln->pgstat_info, reln);
 		/* Substitute proper block number if caller asked for P_NEW */
-		if (blockNum == P_NEW)
+		if (isExtend)
 		{
-			blockNum = smgrnblocks(DEFAULT_SMGR, reln);
-			isExtend = true;
+			/* must be sure we have accurate file length! */
+			blockNum = reln->rd_nblocks = smgrnblocks(DEFAULT_SMGR, reln);
+			reln->rd_nblocks++;
 		}
 		/*
 		 * lookup the buffer.  IO_IN_PROGRESS is set if the requested
@ -649,26 +651,24 @@ WriteNoReleaseBuffer(Buffer buffer)
 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
 *		to save a spinlock release/acquire.
 *
- * An additional frammish of this routine is that the caller may perform
- * file extension (as if blockNum = P_NEW) by passing the actual current
- * EOF block number as blockNum and setting isExtend true.  This hack
- * allows us to avoid calling smgrnblocks() again when the caller has
- * already done it.
- *
- * Note: it is OK to pass buffer = InvalidBuffer, indicating that no old
- * buffer actually needs to be released.  This case is the same as ReadBuffer
- * except for the isExtend option.
- *
 * Also, if the passed buffer is valid and already contains the desired block
 * number, we simply return it without ever acquiring the spinlock at all.
 * Since the passed buffer must be pinned, it's OK to examine its block
 * number without getting the lock first.
+ *
+ * Note: it is OK to pass buffer = InvalidBuffer, indicating that no old
+ * buffer actually needs to be released.  This case is the same as ReadBuffer,
+ * but can save some tests in the caller.
+ *
+ * Also note: while it will work to call this routine with blockNum == P_NEW,
+ * it's best to avoid doing so, since that would result in calling
+ * smgrnblocks() while holding the bufmgr spinlock, hence some loss of
+ * concurrency.
 */
 Buffer
 ReleaseAndReadBuffer(Buffer buffer,
 					 Relation relation,
-					 BlockNumber blockNum,
-					 bool isExtend)
+					 BlockNumber blockNum)
 {
 	BufferDesc *bufHdr;

@ -703,14 +703,12 @@ ReleaseAndReadBuffer(Buffer buffer,
 					AddBufferToFreelist(bufHdr);
 					bufHdr->flags |= BM_FREE;
 				}
-				return ReadBufferInternal(relation, blockNum,
-										  isExtend, true);
+				return ReadBufferInternal(relation, blockNum, true);
 			}
 		}
 	}

-	return ReadBufferInternal(relation, blockNum,
-							  isExtend, false);
+	return ReadBufferInternal(relation, blockNum, false);
 }

 /*
@ -1738,14 +1736,13 @@ ReleaseAndReadBuffer_Debug(char *file,
 						   int line,
 						   Buffer buffer,
 						   Relation relation,
-						   BlockNumber blockNum,
-						   bool isExtend)
+						   BlockNumber blockNum)
 {
 	bool		bufferValid;
 	Buffer		b;

 	bufferValid = BufferIsValid(buffer);
-	b = ReleaseAndReadBuffer(buffer, relation, blockNum, isExtend);
+	b = ReleaseAndReadBuffer(buffer, relation, blockNum);
 	if (ShowPinTrace && bufferValid && BufferIsLocal(buffer)
 		&& is_userbuffer(buffer))
 	{
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@ -8,7 +8,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/freespace/freespace.c,v 1.1 2001/06/27 23:31:39 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/freespace/freespace.c,v 1.2 2001/06/29 21:08:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -161,10 +161,42 @@ FreeSpaceShmemSize(void)
 	return size;
 }

+BlockNumber
+GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded)
+{
+	return InvalidBlockNumber;	/* stub */
+}
+
+void
+RecordFreeSpace(RelFileNode *rel, BlockNumber page, Size spaceAvail)
+{
+	/* stub */
+}
+
+BlockNumber
+RecordAndGetPageWithFreeSpace(RelFileNode *rel,
+							  BlockNumber oldPage,
+							  Size oldSpaceAvail,
+							  Size spaceNeeded)
+{
+	return InvalidBlockNumber;	/* stub */
+}
+
+void
+MultiRecordFreeSpace(RelFileNode *rel,
+					 BlockNumber minPage,
+					 BlockNumber maxPage,
+					 int nPages,
+					 BlockNumber *pages,
+					 Size *spaceAvail)
+{
+	/* stub */
+}

 void
 FreeSpaceMapForgetRel(RelFileNode *rel)
 {
+	/* stub */
 }


@ -178,6 +210,7 @@ FreeSpaceMapForgetRel(RelFileNode *rel)
 void
 DumpFreeSpace(void)
 {
+	/* stub */
 }

 #endif	 /* FREESPACE_DEBUG */
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@ -11,7 +11,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.50 2001/06/27 23:31:39 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.51 2001/06/29 21:08:24 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -437,7 +437,20 @@ smgrblindmarkdirty(int16 which,
 BlockNumber
 smgrnblocks(int16 which, Relation reln)
 {
-	return (*(smgrsw[which].smgr_nblocks)) (reln);
+	BlockNumber		nblocks;
+
+	nblocks = (*(smgrsw[which].smgr_nblocks)) (reln);
+	/*
+	 * NOTE: if a relation ever did grow to 2^32-1 blocks, this code would
+	 * fail --- but that's a good thing, because it would stop us from
+	 * extending the rel another block and having a block whose number
+	 * actually is InvalidBlockNumber.
+	 */
+	if (nblocks == InvalidBlockNumber)
+		elog(ERROR, "cannot count blocks for %s: %m",
+			 RelationGetRelationName(reln));
+
+	return nblocks;
 }

 /*
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v 1.140 2001/06/27 23:31:39 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v 1.141 2001/06/29 21:08:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -1928,33 +1928,111 @@ RelationCacheAbortWalker(Relation *relationPtr, Datum dummy)
 }

 /*
- *		RelationRegisterRelation -
- *		   register the Relation descriptor of a newly created relation
- *		   with the relation descriptor Cache.
+ *		RelationBuildLocalRelation
+ *			Build a relcache entry for an about-to-be-created relation,
+ *			and enter it into the relcache.
+ *
+ *		relname is copied with strcpy() into the new entry's physical
+ *		relation name; tupDesc is duplicated with CreateTupleDescCopyConstr()
+ *		while CacheMemoryContext is current; relid is stored as the relation
+ *		OID, as rd_node.relNode, as relfilenode, and as attrelid of every
+ *		attribute; dbid becomes rd_node.tblNode (InvalidOid marks the rel
+ *		as shared); nailit sets rd_isnailed.
+ *
+ *		The returned entry has its reference count set to 1, has been
+ *		inserted with RelationCacheInsert(), is flagged rd_myxactonly, and
+ *		has been pushed onto newlyCreatedRelns.
 */
-void
-RelationRegisterRelation(Relation relation)
+Relation
+RelationBuildLocalRelation(const char *relname,
+						   TupleDesc tupDesc,
+						   Oid relid, Oid dbid,
+						   bool nailit)
 {
+	Relation	rel;
 	MemoryContext oldcxt;
+	int			natts = tupDesc->natts;
+	int			i;

-	RelationInitLockInfo(relation);
+	AssertArg(natts > 0);
+
+	/*
+	 * switch to the cache context to create the relcache entry.
+	 */
+	if (!CacheMemoryContext)
+		CreateCacheMemoryContext();

 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);

-	RelationCacheInsert(relation);
+	/*
+	 * allocate a new relation descriptor.
+	 */
+	rel = (Relation) palloc(sizeof(RelationData));
+	MemSet((char *) rel, 0, sizeof(RelationData));
+	rel->rd_targblock = InvalidBlockNumber;
+
+	/* make sure relation is marked as having no open file yet */
+	rel->rd_fd = -1;
+
+	RelationSetReferenceCount(rel, 1);
+
+	/*
+	 * nail the reldesc if this is a bootstrap create reln and we may need
+	 * it in the cache later on in the bootstrap process so we don't ever
+	 * want it kicked out.	e.g. pg_attribute!!!
+	 */
+	if (nailit)
+		rel->rd_isnailed = true;
+
+	/*
+	 * create a new tuple descriptor from the one passed in
+	 * (we do this to copy it into the cache context)
+	 */
+	rel->rd_att = CreateTupleDescCopyConstr(tupDesc);
+
+	/*
+	 * initialize relation tuple form (caller may add/override data later)
+	 *
+	 * NOTE(review): the strcpy of relname below is unbounded; it assumes
+	 * the caller has already limited relname to what the destination name
+	 * field can hold --- confirm.
+	 */
+	rel->rd_rel = (Form_pg_class) palloc(CLASS_TUPLE_SIZE);
+	MemSet((char *) rel->rd_rel, 0, CLASS_TUPLE_SIZE);
+
+	strcpy(RelationGetPhysicalRelationName(rel), relname);
+
+	rel->rd_rel->relkind = RELKIND_UNCATALOGED;
+	rel->rd_rel->relnatts = natts;
+	rel->rd_rel->reltype = InvalidOid;
+	if (tupDesc->constr)
+		rel->rd_rel->relchecks = tupDesc->constr->num_check;
+
+	/*
+	 * Insert relation OID and database/tablespace ID into the right places.
+	 * XXX currently we assume physical tblspace/relnode are same as logical
+	 * dbid/reloid.  Probably should pass an extra pair of parameters.
+	 */
+	rel->rd_rel->relisshared = (dbid == InvalidOid);
+
+	RelationGetRelid(rel) = relid;
+
+	for (i = 0; i < natts; i++)
+		rel->rd_att->attrs[i]->attrelid = relid;
+
+	RelationInitLockInfo(rel);	/* see lmgr.c */
+
+	rel->rd_node.tblNode = dbid;
+	rel->rd_node.relNode = relid;
+	rel->rd_rel->relfilenode = relid;
+
+	/*
+	 * Okay to insert into the relcache hash tables.
+	 */
+	RelationCacheInsert(rel);

 	/*
 	 * we've just created the relation. It is invisible to anyone else
 	 * before the transaction is committed. Setting rd_myxactonly allows
 	 * us to use the local buffer manager for select/insert/etc before the
 	 * end of transaction. (We also need to keep track of relations
-	 * created during a transaction and does the necessary clean up at the
+	 * created during a transaction and do the necessary clean up at the
 	 * end of the transaction.)				- ay 3/95
 	 */
-	relation->rd_myxactonly = TRUE;
-	newlyCreatedRelns = lcons(relation, newlyCreatedRelns);
+	rel->rd_myxactonly = true;
+	newlyCreatedRelns = lcons(rel, newlyCreatedRelns);

+	/*
+	 * done building relcache entry.
+	 */
 	MemoryContextSwitchTo(oldcxt);
+
+	return rel;
 }

 /*
@ -1972,14 +2050,18 @@ RelationPurgeLocalRelation(bool xactCommitted)
 		List	   *l = newlyCreatedRelns;
 		Relation	reln = lfirst(l);

+		newlyCreatedRelns = lnext(newlyCreatedRelns);
+		pfree(l);
+
 		Assert(reln != NULL && reln->rd_myxactonly);

 		reln->rd_myxactonly = false;	/* mark it not on list anymore */

-		newlyCreatedRelns = lnext(newlyCreatedRelns);
-		pfree(l);
-
-		/* XXX is this step still needed?  If so, why? */
+		/*
+		 * XXX while we clearly must throw out new Relation entries at
+		 * xact abort, it's not clear why we need to do it at commit.
+		 * Could this be improved?
+		 */
 		if (!IsBootstrapProcessingMode())
 			RelationClearRelation(reln, false);
 	}
--- a/src/include/access/hio.h
+++ b/src/include/access/hio.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: hio.h,v 1.18 2001/05/16 22:35:12 tgl Exp $
+ * $Id: hio.h,v 1.19 2001/06/29 21:08:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -19,6 +19,6 @@
 extern void RelationPutHeapTuple(Relation relation, Buffer buffer,
 					 HeapTuple tuple);
 extern Buffer RelationGetBufferForTuple(Relation relation, Size len,
-										BlockNumber minblocknum);
+										Buffer otherBuffer);

 #endif	 /* HIO_H */
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: bufmgr.h,v 1.52 2001/06/09 18:16:59 tgl Exp $
+ * $Id: bufmgr.h,v 1.53 2001/06/29 21:08:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -155,10 +155,11 @@ extern long *LocalRefCount;
 * prototypes for functions in bufmgr.c
 */
 extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
+extern int	ReleaseBuffer(Buffer buffer);
 extern int	WriteBuffer(Buffer buffer);
 extern int	WriteNoReleaseBuffer(Buffer buffer);
 extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
-								   BlockNumber blockNum, bool isExtend);
+								   BlockNumber blockNum);
 extern int	FlushBuffer(Buffer buffer, bool sync, bool release);

 extern void InitBufferPool(void);
@ -176,7 +177,6 @@ extern void DropRelFileNodeBuffers(RelFileNode rnode);
 extern void DropBuffers(Oid dbid);
 extern void PrintPinnedBufs(void);
 extern int	BufferShmemSize(void);
-extern int	ReleaseBuffer(Buffer buffer);
 extern RelFileNode BufferGetFileNode(Buffer buffer);

 extern void SetBufferCommitInfoNeedsSave(Buffer buffer);
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: relcache.h,v 1.24 2001/01/24 19:43:29 momjian Exp $
+ * $Id: relcache.h,v 1.25 2001/06/29 21:08:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -27,7 +27,6 @@ extern Relation RelationNodeCacheGetRelation(RelFileNode rnode);
 extern Relation RelationIdCacheGetRelation(Oid relationId);

 extern void RelationClose(Relation relation);
-extern void RelationForgetRelation(Oid rid);

 /*
 * Routines to compute/retrieve additional cached information
@ -40,14 +39,23 @@ extern List *RelationGetIndexList(Relation relation);
 extern void RelationCacheInitialize(void);
 extern void RelationCacheInitializePhase2(void);

+/*
+ * Routine to create a relcache entry for an about-to-be-created relation
+ */
+extern Relation RelationBuildLocalRelation(const char *relname,
+										   TupleDesc tupDesc,
+										   Oid relid, Oid dbid,
+										   bool nailit);
+
 /*
 * Routines for flushing/rebuilding relcache entries in various scenarios
 */
+extern void RelationForgetRelation(Oid rid);
+
 extern void RelationIdInvalidateRelationCacheByRelationId(Oid relationId);

 extern void RelationCacheInvalidate(void);

-extern void RelationRegisterRelation(Relation relation);
 extern void RelationPurgeLocalRelation(bool xactComitted);

 extern void RelationCacheAbort(void);