Buffer manager modifications to keep a local buffer-dirtied bit as well as a shared dirtybit for each shared buffer.

The shared dirtybit still controls writing the buffer, but the local bit controls whether we need to fsync the buffer's file. This arrangement fixes a bug that allowed some required fsyncs to be missed, and should improve performance as well. For more info see my post of same date on pghackers.
26 years ago · 1f6d8b90b8
parent 9c38a8d296
commit 1f6d8b90b8
13 changed files with 951 additions and 431 deletions
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.62 2000/03/17 02:36:05 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.63 2000/04/09 04:43:16 tgl Exp $
 *
 * NOTES
 *		Transaction aborts can now occur two ways:
@ -642,7 +642,7 @@ RecordTransactionCommit()
 	{
 		FlushBufferPool();
 		if (leak)
-			ResetBufferPool();
+			ResetBufferPool(true);

 		/*
 		 *	have the transaction access methods record the status
@ -658,7 +658,7 @@ RecordTransactionCommit()
 	}

 	if (leak)
-		ResetBufferPool();
+		ResetBufferPool(true);
 }


@ -759,7 +759,10 @@ RecordTransactionAbort()
 	if (SharedBufferChanged && !TransactionIdDidCommit(xid))
 		TransactionIdAbort(xid);

-	ResetBufferPool();
+	/*
+	 * Tell bufmgr and smgr to release resources.
+	 */
+	ResetBufferPool(false);		/* false -> is abort */
 }

 /* --------------------------------
--- a/src/backend/catalog/catalog.c
+++ b/src/backend/catalog/catalog.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/catalog/catalog.c,v 1.30 2000/01/26 05:56:10 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/catalog/catalog.c,v 1.31 2000/04/09 04:43:15 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -23,25 +23,87 @@
 #include "utils/syscache.h"

 /*
- * relpath				- path to the relation
- *		Perhaps this should be in-line code in relopen().
+ * relpath				- construct path to a relation's file
+ *
+ * Note that this only works with relations that are visible to the current
+ * backend, ie, either in the current database or shared system relations.
+ *
+ * Result is a palloc'd string.
 */
 char *
 relpath(const char *relname)
 {
 	char	   *path;
-	size_t		bufsize = 0;

 	if (IsSharedSystemRelationName(relname))
 	{
-		bufsize = strlen(DataDir) + sizeof(NameData) + 2;
+		/* Shared system relations live in DataDir */
+		size_t		bufsize = strlen(DataDir) + sizeof(NameData) + 2;
+
 		path = (char *) palloc(bufsize);
-		snprintf(path, bufsize, "%s/%s", DataDir, relname);
+		snprintf(path, bufsize, "%s%c%s", DataDir, SEP_CHAR, relname);
 		return path;
 	}
+	/*
+	 * If it is in the current database, assume it is in current working
+	 * directory.  NB: this does not work during bootstrap!
+	 */
 	return pstrdup(relname);
 }

+/*
+ * relpath_blind			- construct path to a relation's file
+ *
+ * Construct the path using only the info available to smgrblindwrt,
+ * namely the names and OIDs of the database and relation.  (Shared system
+ * relations are identified with dbid = 0.)  Note that we may have to
+ * access a relation belonging to a different database!
+ *
+ * Result is a palloc'd string.
+ */
+
+char *
+relpath_blind(const char *dbname, const char *relname,
+			  Oid dbid, Oid relid)
+{
+	char	   *path;
+
+	if (dbid == (Oid) 0)
+	{
+		/* Shared system relations live in DataDir */
+		path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2);
+		sprintf(path, "%s%c%s", DataDir, SEP_CHAR, relname);
+	}
+	else if (dbid == MyDatabaseId)
+	{
+		/* XXX why is this inconsistent with relpath() ? */
+		path = (char *) palloc(strlen(DatabasePath) + sizeof(NameData) + 2);
+		sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relname);
+	}
+	else
+	{
+		/* this is work around only !!! */
+		char		dbpathtmp[MAXPGPATH];
+		Oid			id;
+		char	   *dbpath;
+
+		GetRawDatabaseInfo(dbname, &id, dbpathtmp);
+
+		if (id != dbid)
+			elog(FATAL, "relpath_blind: oid of db %s is not %u",
+				 dbname, dbid);
+		dbpath = ExpandDatabasePath(dbpathtmp);
+		if (dbpath == NULL)
+			elog(FATAL, "relpath_blind: can't expand path for db %s",
+				 dbname);
+		path = (char *) palloc(strlen(dbpath) + sizeof(NameData) + 2);
+		sprintf(path, "%s%c%s", dbpath, SEP_CHAR, relname);
+		pfree(dbpath);
+	}
+	return path;
+}
+
+
 /*
 * IsSystemRelationName
 *		True iff name is the name of a system catalog relation.
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.32 2000/01/26 05:56:50 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.33 2000/04/09 04:43:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -65,9 +65,11 @@ long	   *NWaitIOBackendP;
 extern IpcSemaphoreId WaitIOSemId;

 long	   *PrivateRefCount;	/* also used in freelist.c */
-bits8	   *BufferLocks;		/* */
-long	   *CommitInfoNeedsSave;/* to write buffers where we have filled
-								 * in t_infomask */
+bits8	   *BufferLocks;		/* flag bits showing locks I have set */
+BufferTag  *BufferTagLastDirtied; /* tag buffer had when last dirtied by me */
+BufferBlindId *BufferBlindLastDirtied; /* and its BlindId too */
+bool	   *BufferDirtiedByMe;	/* T if buf has been dirtied in cur xact */
+

 /*
 * Data Structures:
@ -247,7 +249,9 @@ InitBufferPool(IPCKey key)
 #endif
 	PrivateRefCount = (long *) calloc(NBuffers, sizeof(long));
 	BufferLocks = (bits8 *) calloc(NBuffers, sizeof(bits8));
-	CommitInfoNeedsSave = (long *) calloc(NBuffers, sizeof(long));
+	BufferTagLastDirtied = (BufferTag *) calloc(NBuffers, sizeof(BufferTag));
+	BufferBlindLastDirtied = (BufferBlindId *) calloc(NBuffers, sizeof(BufferBlindId));
+	BufferDirtiedByMe = (bool *) calloc(NBuffers, sizeof(bool));
 }

 /* -----------------------------------------------------
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.77 2000/03/31 02:43:31 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.78 2000/04/09 04:43:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -94,8 +94,10 @@ static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum,
 						 bool bufferLockHeld);
 static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
 			bool *foundPtr, bool bufferLockHeld);
+static void SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr);
+static void ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr);
 static void BufferSync(void);
-static int	BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld);
+static int	BufferReplace(BufferDesc *bufHdr);
 void		PrintBufferDescs(void);

 /* ---------------------------------------------------
@ -176,7 +178,7 @@ is_userbuffer(Buffer buffer)
 {
 	BufferDesc *buf = &BufferDescriptors[buffer - 1];

-	if (IsSystemRelationName(buf->sb_relname))
+	if (IsSystemRelationName(buf->blind.relname))
 		return false;
 	return true;
 }
@ -199,7 +201,7 @@ ReadBuffer_Debug(char *file,

 		fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \
 refcount = %ld, file: %s, line: %d\n",
-				buffer, buf->sb_relname, buf->tag.blockNum,
+				buffer, buf->blind.relname, buf->tag.blockNum,
 				PrivateRefCount[buffer - 1], file, line);
 	}
 	return buffer;
@ -390,7 +392,6 @@ BufferAlloc(Relation reln,
 			 * If there's no IO for the buffer and the buffer
 			 * is BROKEN,it should be read again. So start a
 			 * new buffer IO here. 
-
 			 *
 			 * wierd race condition:
 			 *
@ -465,33 +466,24 @@ BufferAlloc(Relation reln,
 			 * in WaitIO until we're done.
 			 */
 			inProgress = TRUE;
-#ifdef HAS_TEST_AND_SET

 			/*
 			 * All code paths that acquire this lock pin the buffer first;
 			 * since no one had it pinned (it just came off the free
 			 * list), no one else can have this lock.
 			 */
-#endif	 /* HAS_TEST_AND_SET */
 			StartBufferIO(buf, false);

 			/*
 			 * Write the buffer out, being careful to release BufMgrLock
 			 * before starting the I/O.
-			 *
-			 * This #ifndef is here because a few extra semops REALLY kill
-			 * you on machines that don't have spinlocks.  If you don't
-			 * operate with much concurrency, well...
 			 */
-			smok = BufferReplace(buf, true);
-#ifndef OPTIMIZE_SINGLE
-			SpinAcquire(BufMgrLock);
-#endif	 /* OPTIMIZE_SINGLE */
+			smok = BufferReplace(buf);

 			if (smok == FALSE)
 			{
 				elog(NOTICE, "BufferAlloc: cannot write block %u for %s/%s",
-					 buf->tag.blockNum, buf->sb_dbname, buf->sb_relname);
+					 buf->tag.blockNum, buf->blind.dbname, buf->blind.relname);
 				inProgress = FALSE;
 				buf->flags |= BM_IO_ERROR;
 				buf->flags &= ~BM_IO_IN_PROGRESS;
@ -516,7 +508,7 @@ BufferAlloc(Relation reln,
 				if (buf->flags & BM_JUST_DIRTIED)
 				{
 					elog(FATAL, "BufferAlloc: content of block %u (%s) changed while flushing",
-						 buf->tag.blockNum, buf->sb_relname);
+						 buf->tag.blockNum, buf->blind.relname);
 				}
 				else
 					buf->flags &= ~BM_DIRTY;
@ -562,6 +554,7 @@ BufferAlloc(Relation reln,
 				 */
 				if (buf != NULL)
 				{
+					buf->flags &= ~BM_IO_IN_PROGRESS;
 					TerminateBufferIO(buf);
 					/* give up the buffer since we don't need it any more */
 					PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
@ -572,7 +565,6 @@ BufferAlloc(Relation reln,
 						AddBufferToFreelist(buf);
 						buf->flags |= BM_FREE;
 					}
-					buf->flags &= ~BM_IO_IN_PROGRESS;
 				}

 				PinBuffer(buf2);
@ -619,8 +611,8 @@ BufferAlloc(Relation reln,
 	}

 	/* record the database name and relation name for this buffer */
-	strcpy(buf->sb_relname, RelationGetPhysicalRelationName(reln));
-	strcpy(buf->sb_dbname, DatabaseName);
+	strcpy(buf->blind.dbname, DatabaseName);
+	strcpy(buf->blind.relname, RelationGetPhysicalRelationName(reln));

 	INIT_BUFFERTAG(&(buf->tag), reln, blockNum);
 	if (!BufTableInsert(buf))
@ -683,9 +675,9 @@ WriteBuffer(Buffer buffer)
 	SpinAcquire(BufMgrLock);
 	Assert(bufHdr->refcount > 0);
 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+	SetBufferDirtiedByMe(buffer, bufHdr);
 	UnpinBuffer(bufHdr);
 	SpinRelease(BufMgrLock);
-	CommitInfoNeedsSave[buffer - 1] = 0;

 	return TRUE;
 }
@ -702,7 +694,7 @@ WriteBuffer_Debug(char *file, int line, Buffer buffer)
 		buf = &BufferDescriptors[buffer - 1];
 		fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \
 refcount = %ld, file: %s, line: %d\n",
-				buffer, buf->sb_relname, buf->tag.blockNum,
+				buffer, buf->blind.relname, buf->tag.blockNum,
 				PrivateRefCount[buffer - 1], file, line);
 	}
 }
@ -767,8 +759,9 @@ DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest)
 *
 * 'buffer' is known to be dirty/pinned, so there should not be a
 * problem reading the BufferDesc members without the BufMgrLock
- * (nobody should be able to change tags, flags, etc. out from under
- * us).  Unpin if 'release' is TRUE.
+ * (nobody should be able to change tags out from under us).
+ *
+ * Unpin if 'release' is TRUE.
 */
 int
 FlushBuffer(Buffer buffer, bool release)
@ -784,6 +777,8 @@ FlushBuffer(Buffer buffer, bool release)
 	if (BAD_BUFFER_ID(buffer))
 		return STATUS_ERROR;

+	Assert(PrivateRefCount[buffer - 1] > 0); /* else caller didn't pin */
+
 	bufHdr = &BufferDescriptors[buffer - 1];
 	bufdb = bufHdr->tag.relId.dbId;

@ -809,7 +804,7 @@ FlushBuffer(Buffer buffer, bool release)
 	if (status == SM_FAIL)
 	{
 		elog(ERROR, "FlushBuffer: cannot flush block %u of the relation %s",
-			 bufHdr->tag.blockNum, bufHdr->sb_relname);
+			 bufHdr->tag.blockNum, bufHdr->blind.relname);
 		return STATUS_ERROR;
 	}
 	BufferFlushCount++;
@ -820,19 +815,21 @@ FlushBuffer(Buffer buffer, bool release)

 	/*
 	 * If this buffer was marked by someone as DIRTY while we were
-	 * flushing it out we must not clear DIRTY flag - vadim 01/17/97
+	 * flushing it out we must not clear shared DIRTY flag - vadim 01/17/97
+	 *
+	 * ... but we can clear BufferDirtiedByMe anyway - tgl 3/31/00
 	 */
 	if (bufHdr->flags & BM_JUST_DIRTIED)
 	{
 		elog(NOTICE, "FlushBuffer: content of block %u (%s) changed while flushing",
-			 bufHdr->tag.blockNum, bufHdr->sb_relname);
+			 bufHdr->tag.blockNum, bufHdr->blind.relname);
 	}
 	else
 		bufHdr->flags &= ~BM_DIRTY;
+	ClearBufferDirtiedByMe(buffer, bufHdr);
 	if (release)
 		UnpinBuffer(bufHdr);
 	SpinRelease(BufMgrLock);
-	CommitInfoNeedsSave[buffer - 1] = 0;

 	return STATUS_OK;
 }
@ -857,9 +854,10 @@ WriteNoReleaseBuffer(Buffer buffer)
 	SharedBufferChanged = true;

 	SpinAcquire(BufMgrLock);
+	Assert(bufHdr->refcount > 0);
 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+	SetBufferDirtiedByMe(buffer, bufHdr);
 	SpinRelease(BufMgrLock);
-	CommitInfoNeedsSave[buffer - 1] = 0;

 	return STATUS_OK;
 }
@ -901,11 +899,6 @@ ReleaseAndReadBuffer(Buffer buffer,
 					AddBufferToFreelist(bufHdr);
 					bufHdr->flags |= BM_FREE;
 				}
-				if (CommitInfoNeedsSave[buffer - 1])
-				{
-					bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-					CommitInfoNeedsSave[buffer - 1] = 0;
-				}
 				retbuf = ReadBufferWithBufferLock(relation, blockNum, true);
 				return retbuf;
 			}
@ -915,13 +908,120 @@ ReleaseAndReadBuffer(Buffer buffer,
 	return ReadBuffer(relation, blockNum);
 }

+/*
+ * SetBufferDirtiedByMe -- mark a shared buffer as being dirtied by this xact
+ *
+ * This flag essentially remembers that we need to write and fsync this buffer
+ * before we can commit the transaction.  The write might end up getting done
+ * by another backend, but we must do the fsync ourselves (else we could
+ * commit before the data actually reaches disk).  We do not issue fsync
+ * instantly upon write; the storage manager keeps track of which files need
+ * to be fsync'd before commit can occur.  A key aspect of this data structure
+ * is that we will be able to notify the storage manager that an fsync is
+ * needed even after another backend has done the physical write and replaced
+ * the buffer contents with something else!
+ *
+ * NB: we must be holding the bufmgr lock at entry, and the buffer must be
+ * pinned so that no other backend can take it away from us.
+ */
+static void
+SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr)
+{
+	BufferTag  *tagLastDirtied = & BufferTagLastDirtied[buffer - 1];
+	Relation	reln;
+	int			status;
+
+	/*
+	 * If the flag is already set, check to see whether the buffertag is
+	 * the same.  If not, some other backend already wrote the buffer data
+	 * that we dirtied.  We must tell the storage manager to make an fsync
+	 * pending on that file before we can overwrite the old tag value.
+	 */
+	if (BufferDirtiedByMe[buffer - 1])
+	{
+		if (bufHdr->tag.relId.dbId == tagLastDirtied->relId.dbId &&
+			bufHdr->tag.relId.relId == tagLastDirtied->relId.relId &&
+			bufHdr->tag.blockNum == tagLastDirtied->blockNum)
+			return;				/* Same tag already dirtied, so no work */
+
+#ifndef OPTIMIZE_SINGLE
+		SpinRelease(BufMgrLock);
+#endif	 /* OPTIMIZE_SINGLE */
+
+		reln = RelationIdCacheGetRelation(tagLastDirtied->relId.relId);
+
+		if (reln == (Relation) NULL)
+		{
+			status = smgrblindmarkdirty(DEFAULT_SMGR,
+										BufferBlindLastDirtied[buffer - 1].dbname,
+										BufferBlindLastDirtied[buffer - 1].relname,
+										tagLastDirtied->relId.dbId,
+										tagLastDirtied->relId.relId,
+										tagLastDirtied->blockNum);
+		}
+		else
+		{
+			status = smgrmarkdirty(DEFAULT_SMGR, reln,
+								   tagLastDirtied->blockNum);
+			/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+			RelationDecrementReferenceCount(reln);
+		}
+		if (status == SM_FAIL)
+		{
+			elog(ERROR, "SetBufferDirtiedByMe: cannot mark %u for %s",
+				 tagLastDirtied->blockNum,
+				 BufferBlindLastDirtied[buffer - 1].relname);
+		}
+
+#ifndef OPTIMIZE_SINGLE
+		SpinAcquire(BufMgrLock);
+#endif	 /* OPTIMIZE_SINGLE */
+
+	}
+
+	*tagLastDirtied = bufHdr->tag;
+	BufferBlindLastDirtied[buffer - 1] = bufHdr->blind;
+	BufferDirtiedByMe[buffer - 1] = true;
+}
+
+/*
+ * ClearBufferDirtiedByMe -- mark a shared buffer as no longer needing fsync
+ *
+ * If we write out a buffer ourselves, then the storage manager will set its
+ * needs-fsync flag for that file automatically, and so we can clear our own
+ * flag that says it needs to be done later.
+ *
+ * NB: we must be holding the bufmgr lock at entry.
+ */
+static void
+ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr)
+{
+	BufferTag  *tagLastDirtied = & BufferTagLastDirtied[buffer - 1];
+
+	/*
+	 * Do *not* clear the flag if it refers to some other buffertag than
+	 * the data we just wrote.  This is unlikely, but possible if some
+	 * other backend replaced the buffer contents since we set our flag.
+	 */
+	if (bufHdr->tag.relId.dbId == tagLastDirtied->relId.dbId &&
+		bufHdr->tag.relId.relId == tagLastDirtied->relId.relId &&
+		bufHdr->tag.blockNum == tagLastDirtied->blockNum)
+	{
+		BufferDirtiedByMe[buffer - 1] = false;
+	}
+}
+
 /*
 * BufferSync -- Flush all dirty buffers in the pool.
 *
- *		This is called at transaction commit time.	It does the wrong thing,
- *		right now.	We should flush only our own changes to stable storage,
- *		and we should obey the lock protocol on the buffer manager metadata
- *		as we do it.  Also, we need to be sure that no other transaction is
+ *		This is called at transaction commit time.  We find all buffers
+ *		that have been dirtied by the current xact and flush them to disk.
+ *		We do *not* flush dirty buffers that have been dirtied by other xacts.
+ *		(This is a substantial change from pre-7.0 behavior.)
+ *
+ *	OLD COMMENTS (do these still apply?)
+ *
+ *		Also, we need to be sure that no other transaction is
 *		modifying the page as we flush it.	This is only a problem for objects
 *		that use a non-two-phase locking protocol, like btree indices.	For
 *		those objects, we would like to set a write lock for the duration of
@ -936,21 +1036,49 @@ static void
 BufferSync()
 {
 	int			i;
-	Oid			bufdb;
-	Oid			bufrel;
-	Relation	reln;
 	BufferDesc *bufHdr;
 	int			status;
+	Relation	reln;
+	bool		didwrite;

-	SpinAcquire(BufMgrLock);
 	for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
 	{
+		/* Ignore buffers that were not dirtied by me */
+		if (! BufferDirtiedByMe[i])
+			continue;
+
+		SpinAcquire(BufMgrLock);
+
+		/*
+		 * We only need to write if the buffer is still dirty and still
+		 * contains the same disk page that it contained when we dirtied it.
+		 * Otherwise, someone else has already written our changes for us,
+		 * and we need only fsync.
+		 *
+		 * (NOTE: it's still possible to do an unnecessary write, if other
+		 * xacts have written and then re-dirtied the page since our last
+		 * change to it.  But that should be pretty uncommon, and there's
+		 * no easy way to detect it anyway.)
+		 */
+		reln = NULL;
+		didwrite = false;
 		if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
 		{
+			Oid			bufdb;
+			Oid			bufrel;
+
 			bufdb = bufHdr->tag.relId.dbId;
 			bufrel = bufHdr->tag.relId.relId;
-			if (bufdb == MyDatabaseId || bufdb == (Oid) 0)
+			if (bufdb == BufferTagLastDirtied[i].relId.dbId &&
+				bufrel == BufferTagLastDirtied[i].relId.relId &&
+				bufHdr->tag.blockNum == BufferTagLastDirtied[i].blockNum)
 			{
+				/*
+				 * Try to find relation for buf.  This could fail, if the
+				 * rel has been flushed from the relcache since we dirtied
+				 * the page.  That should be uncommon, so paying the extra
+				 * cost of a blind write when it happens seems OK.
+				 */
 				reln = RelationIdCacheGetRelation(bufrel);

 				/*
@ -970,14 +1098,11 @@ BufferSync()
 					if (bufHdr->flags & BM_IO_ERROR)
 					{
 						elog(ERROR, "BufferSync: write error %u for %s",
-							 bufHdr->tag.blockNum, bufHdr->sb_relname);
+							 bufHdr->tag.blockNum, bufHdr->blind.relname);
 					}
-					/* drop refcnt from RelationIdCacheGetRelation */
-					if (reln != (Relation) NULL)
-						RelationDecrementReferenceCount(reln);
-					continue;
 				}
-
+				else
+				{
 					/*
 					 * To check if block content changed while flushing (see
 					 * below). - vadim 01/17/97
@ -987,19 +1112,20 @@ BufferSync()
 					StartBufferIO(bufHdr, false); /* output IO start */

 					/*
-				 * If we didn't have the reldesc in our local cache, flush
+					 * If we didn't have the reldesc in our local cache, write
 					 * this page out using the 'blind write' storage manager
 					 * routine.  If we did find it, use the standard
 					 * interface.
 					 */
-
 #ifndef OPTIMIZE_SINGLE
 					SpinRelease(BufMgrLock);
 #endif	 /* OPTIMIZE_SINGLE */
 					if (reln == (Relation) NULL)
 					{
-					status = smgrblindwrt(DEFAULT_SMGR, bufHdr->sb_dbname,
-									   bufHdr->sb_relname, bufdb, bufrel,
+						status = smgrblindwrt(DEFAULT_SMGR,
+											  bufHdr->blind.dbname,
+											  bufHdr->blind.relname,
+											  bufdb, bufrel,
 											  bufHdr->tag.blockNum,
 											  (char *) MAKE_PTR(bufHdr->data));
 					}
@ -1018,26 +1144,68 @@ BufferSync()
 					{
 						bufHdr->flags |= BM_IO_ERROR;
 						elog(ERROR, "BufferSync: cannot write %u for %s",
-						 bufHdr->tag.blockNum, bufHdr->sb_relname);
+							 bufHdr->tag.blockNum, bufHdr->blind.relname);
 					}
 					bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */
 					TerminateBufferIO(bufHdr);	/* Sync IO finished */
 					BufferFlushCount++;
+					didwrite = true;

 					/*
 					 * If this buffer was marked by someone as DIRTY while we
 					 * were flushing it out we must not clear DIRTY flag -
 					 * vadim 01/17/97
+					 *
+					 * but it is OK to clear BufferDirtiedByMe - tgl 3/31/00
 					 */
 					if (!(bufHdr->flags & BM_JUST_DIRTIED))
 						bufHdr->flags &= ~BM_DIRTY;
-				/* drop refcnt from RelationIdCacheGetRelation */
+				}
+
+				/* drop refcnt obtained by RelationIdCacheGetRelation */
 				if (reln != (Relation) NULL)
 					RelationDecrementReferenceCount(reln);
 			}
 		}
+
+		/*
+		 * If we did not write the buffer (because someone else did),
+		 * we must still fsync the file containing it, to ensure that the
+		 * write is down to disk before we commit.
+		 */
+		if (! didwrite)
+		{
+#ifndef OPTIMIZE_SINGLE
+			SpinRelease(BufMgrLock);
+#endif	 /* OPTIMIZE_SINGLE */
+
+			reln = RelationIdCacheGetRelation(BufferTagLastDirtied[i].relId.relId);
+			if (reln == (Relation) NULL)
+			{
+				status = smgrblindmarkdirty(DEFAULT_SMGR,
+											BufferBlindLastDirtied[i].dbname,
+											BufferBlindLastDirtied[i].relname,
+											BufferTagLastDirtied[i].relId.dbId,
+											BufferTagLastDirtied[i].relId.relId,
+											BufferTagLastDirtied[i].blockNum);
 			}
+			else
+			{
+				status = smgrmarkdirty(DEFAULT_SMGR, reln,
+									   BufferTagLastDirtied[i].blockNum);
+				/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+				RelationDecrementReferenceCount(reln);
+
+			}
+#ifndef OPTIMIZE_SINGLE
+			SpinAcquire(BufMgrLock);
+#endif	 /* OPTIMIZE_SINGLE */
+		}
+
+		BufferDirtiedByMe[i] = false;
+
 		SpinRelease(BufMgrLock);
+	}

 	LocalBufferSync();
 }
@ -1166,13 +1334,19 @@ ResetBufferUsage()
 /* ----------------------------------------------
 *		ResetBufferPool
 *
- *		this routine is supposed to be called when a transaction aborts.
+ *		This routine is supposed to be called when a transaction aborts.
 *		it will release all the buffer pins held by the transaction.
+ *		Currently, we also call it during commit if BufferPoolCheckLeak
+ *		detected a problem --- in that case, isCommit is TRUE, and we
+ *		only clean up buffer pin counts.
+ *
+ * During abort, we also forget any pending fsync requests.  Dirtied buffers
+ * will still get written, eventually, but there will be no fsync for them.
 *
 * ----------------------------------------------
 */
 void
-ResetBufferPool()
+ResetBufferPool(bool isCommit)
 {
 	int			i;

@ -1193,10 +1367,15 @@ ResetBufferPool()
 			SpinRelease(BufMgrLock);
 		}
 		PrivateRefCount[i] = 0;
-		CommitInfoNeedsSave[i] = 0;
+
+		if (! isCommit)
+			BufferDirtiedByMe[i] = false;
 	}

 	ResetLocalBufferPool();
+
+	if (! isCommit)
+		smgrabort();
 }

 /* -----------------------------------------------
@ -1222,7 +1401,7 @@ BufferPoolCheckLeak()
 				 "Buffer Leak: [%03d] (freeNext=%ld, freePrev=%ld, \
 relname=%s, blockNum=%d, flags=0x%x, refcount=%d %ld)",
 				 i - 1, buf->freeNext, buf->freePrev,
-				 buf->sb_relname, buf->tag.blockNum, buf->flags,
+				 buf->blind.relname, buf->tag.blockNum, buf->flags,
 				 buf->refcount, PrivateRefCount[i - 1]);
 			result = 1;
 		}
@ -1306,25 +1485,25 @@ BufferGetRelation(Buffer buffer)
 /*
 * BufferReplace
 *
- * Flush the buffer corresponding to 'bufHdr'
+ * Write out the buffer corresponding to 'bufHdr'
+ *
+ * This routine used to flush the data to disk (ie, force immediate fsync)
+ * but that's no longer necessary because BufferSync is smarter than before.
 *
+ * BufMgrLock must be held at entry, and the buffer must be pinned.
 */
 static int
-BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
+BufferReplace(BufferDesc *bufHdr)
 {
 	Relation	reln;
 	Oid			bufdb,
 				bufrel;
 	int			status;

-	if (!bufferLockHeld)
-		SpinAcquire(BufMgrLock);
-
 	/*
 	 * first try to find the reldesc in the cache, if no luck, don't
 	 * bother to build the reldesc from scratch, just do a blind write.
 	 */
-
 	bufdb = bufHdr->tag.relId.dbId;
 	bufrel = bufHdr->tag.relId.relId;

@ -1336,22 +1515,27 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
 	/* To check if block content changed while flushing. - vadim 01/17/97 */
 	bufHdr->flags &= ~BM_JUST_DIRTIED;

+#ifndef OPTIMIZE_SINGLE
 	SpinRelease(BufMgrLock);
+#endif	 /* OPTIMIZE_SINGLE */

 	if (reln != (Relation) NULL)
 	{
-		status = smgrflush(DEFAULT_SMGR, reln, bufHdr->tag.blockNum,
+		status = smgrwrite(DEFAULT_SMGR, reln, bufHdr->tag.blockNum,
 						   (char *) MAKE_PTR(bufHdr->data));
 	}
 	else
 	{
-		/* blind write always flushes */
-		status = smgrblindwrt(DEFAULT_SMGR, bufHdr->sb_dbname,
-							  bufHdr->sb_relname, bufdb, bufrel,
+		status = smgrblindwrt(DEFAULT_SMGR, bufHdr->blind.dbname,
+							  bufHdr->blind.relname, bufdb, bufrel,
 							  bufHdr->tag.blockNum,
 							  (char *) MAKE_PTR(bufHdr->data));
 	}

+#ifndef OPTIMIZE_SINGLE
+	SpinAcquire(BufMgrLock);
+#endif	 /* OPTIMIZE_SINGLE */
+
 	/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
 	if (reln != (Relation) NULL)
 		RelationDecrementReferenceCount(reln);
@ -1359,6 +1543,11 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
 	if (status == SM_FAIL)
 		return FALSE;

+	/* If we had marked this buffer as needing to be fsync'd, we can forget
+	 * about that, because it's now the storage manager's responsibility.
+	 */
+	ClearBufferDirtiedByMe(BufferDescriptorGetBuffer(bufHdr), bufHdr);
+
 	BufferFlushCount++;

 	return TRUE;
@ -1440,7 +1629,7 @@ ReleaseRelationBuffers(Relation rel)
 			}
 			/* Now we can do what we came for */
 			buf->flags &= ~ ( BM_DIRTY | BM_JUST_DIRTIED);
-			CommitInfoNeedsSave[i - 1] = 0;
+			ClearBufferDirtiedByMe(i, buf);
 			/*
 			 * Release any refcount we may have.
 			 *
@ -1502,6 +1691,7 @@ DropBuffers(Oid dbid)
 			}
 			/* Now we can do what we came for */
 			buf->flags &= ~ ( BM_DIRTY | BM_JUST_DIRTIED);
+			ClearBufferDirtiedByMe(i, buf);
 			/*
 			 * The thing should be free, if caller has checked that
 			 * no backends are running in that database.
@ -1533,7 +1723,7 @@ PrintBufferDescs()
 			elog(DEBUG, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \
 blockNum=%d, flags=0x%x, refcount=%d %ld)",
 				 i, buf->freeNext, buf->freePrev,
-				 buf->sb_relname, buf->tag.blockNum, buf->flags,
+				 buf->blind.relname, buf->tag.blockNum, buf->flags,
 				 buf->refcount, PrivateRefCount[i]);
 		}
 		SpinRelease(BufMgrLock);
@ -1544,7 +1734,7 @@ blockNum=%d, flags=0x%x, refcount=%d %ld)",
 		for (i = 0; i < NBuffers; ++i, ++buf)
 		{
 			printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n",
-				   i, buf->sb_relname, buf->tag.blockNum,
+				   i, buf->blind.relname, buf->tag.blockNum,
 				   buf->flags, buf->refcount, PrivateRefCount[i]);
 		}
 	}
@ -1562,7 +1752,7 @@ PrintPinnedBufs()
 		if (PrivateRefCount[i] > 0)
 			elog(NOTICE, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \
 blockNum=%d, flags=0x%x, refcount=%d %ld)\n",
-				 i, buf->freeNext, buf->freePrev, buf->sb_relname,
+				 i, buf->freeNext, buf->freePrev, buf->blind.relname,
 				 buf->tag.blockNum, buf->flags,
 				 buf->refcount, PrivateRefCount[i]);
 	}
@ -1601,33 +1791,42 @@ BufferPoolBlowaway()
 *		FlushRelationBuffers
 *
 *		This function removes from the buffer pool all pages of a relation
- *		that have blocknumber >= specified block.  If doFlush is true,
- *		dirty buffers are written out --- otherwise it's an error for any
- *		of the buffers to be dirty.
+ *		that have blocknumber >= specified block.  Pages that are dirty are
+ *		written out first.  If expectDirty is false, a notice is emitted
+ *		warning of dirty buffers, but we proceed anyway.  An error code is
+ *		returned if we fail to dump a dirty buffer or if we find one of
+ *		the target pages is pinned into the cache.
 *
 *		This is used by VACUUM before truncating the relation to the given
- *		number of blocks.  For VACUUM, we pass doFlush = false since it would
- *		mean a bug in VACUUM if any of the unwanted pages were still dirty.
- *		(TRUNCATE TABLE also uses it in the same way.)
+ *		number of blocks.  For VACUUM, we pass expectDirty = false since it
+ *		could mean a bug in VACUUM if any of the unwanted pages were still
+ *		dirty.  (TRUNCATE TABLE also uses it in the same way.)
 *
- *		This is also used by RENAME TABLE (with block = 0 and doFlush = true)
+ *		This is also used by RENAME TABLE (with block=0 and expectDirty=true)
 *		to clear out the buffer cache before renaming the physical files of
 *		a relation.  Without that, some other backend might try to do a
- *		blind write of a buffer page (relying on the sb_relname of the buffer)
+ *		blind write of a buffer page (relying on the BlindId of the buffer)
 *		and fail because it's not got the right filename anymore.
 *
 *		In both cases, the caller should be holding AccessExclusiveLock on
 *		the target relation to ensure that no other backend is busy reading
- *		more blocks of the relation...
+ *		more blocks of the relation.
 *
- *		Returns: 0 - Ok, -1 - DIRTY, -2 - PINNED
+ *		Formerly, we considered it an error condition if we found unexpectedly
+ *		dirty buffers.  However, since BufferSync no longer forces out all
+ *		dirty buffers at every xact commit, it's possible for dirty buffers
+ *		to still be present in the cache due to failure of an earlier
+ *		transaction.  So, downgrade the error to a mere notice.  Maybe we
+ *		shouldn't even emit a notice...
+ *
+ *		Returns: 0 - Ok, -1 - FAILED TO WRITE DIRTY BUFFER, -2 - PINNED
 *
 *		XXX currently it sequentially searches the buffer pool, should be
 *		changed to more clever ways of searching.
 * --------------------------------------------------------------------
 */
 int
-FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush)
+FlushRelationBuffers(Relation rel, BlockNumber block, bool expectDirty)
 {
 	int			i;
 	BufferDesc *buf;
@ -1642,19 +1841,13 @@ FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush)
 			{
 				if (buf->flags & BM_DIRTY)
 				{
-					if (doFlush)
-					{
-						if (FlushBuffer(-i-1, false) != STATUS_OK)
-						{
-							elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it",
+					if (! expectDirty)
+						elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty",
 							 RelationGetRelationName(rel),
 							 block, buf->tag.blockNum);
-							return -1;
-						}
-					}
-					else
+					if (FlushBuffer(-i-1, false) != STATUS_OK)
 					{
-						elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty",
+						elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it",
 							 RelationGetRelationName(rel),
 							 block, buf->tag.blockNum);
 						return -1;
@ -1676,39 +1869,42 @@ FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush)
 	SpinAcquire(BufMgrLock);
 	for (i = 0; i < NBuffers; i++)
 	{
+	recheck:
 		buf = &BufferDescriptors[i];
-		if (buf->tag.relId.dbId == MyDatabaseId &&
-			buf->tag.relId.relId == RelationGetRelid(rel) &&
+		if (buf->tag.relId.relId == RelationGetRelid(rel) &&
+			(buf->tag.relId.dbId == MyDatabaseId ||
+			 buf->tag.relId.dbId == (Oid) NULL) &&
 			buf->tag.blockNum >= block)
 		{
 			if (buf->flags & BM_DIRTY)
 			{
-				if (doFlush)
-				{
+				PinBuffer(buf);
 				SpinRelease(BufMgrLock);
-					if (FlushBuffer(i+1, false) != STATUS_OK)
+				if (! expectDirty)
+					elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d)",
+						 RelationGetRelationName(rel), block,
+						 buf->tag.blockNum,
+						 PrivateRefCount[i], buf->refcount);
+				if (FlushBuffer(i+1, true) != STATUS_OK)
 				{
 					elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it",
-							 buf->sb_relname, block, buf->tag.blockNum,
+						 RelationGetRelationName(rel), block,
+						 buf->tag.blockNum,
 						 PrivateRefCount[i], buf->refcount);
 					return -1;
 				}
 				SpinAcquire(BufMgrLock);
-				}
-				else
-				{
-					SpinRelease(BufMgrLock);
-					elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d)",
-						 buf->sb_relname, block, buf->tag.blockNum,
-						 PrivateRefCount[i], buf->refcount);
-					return -1;
-				}
+				/* Buffer could already be reassigned, so must recheck
+				 * whether it still belongs to rel before freeing it!
+				 */
+				goto recheck;
 			}
 			if (!(buf->flags & BM_FREE))
 			{
 				SpinRelease(BufMgrLock);
 				elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d)",
-					 buf->sb_relname, block, buf->tag.blockNum,
+					 RelationGetRelationName(rel), block,
+					 buf->tag.blockNum,
 					 PrivateRefCount[i], buf->refcount);
 				return -2;
 			}
@ -1755,11 +1951,6 @@ ReleaseBuffer(Buffer buffer)
 			AddBufferToFreelist(bufHdr);
 			bufHdr->flags |= BM_FREE;
 		}
-		if (CommitInfoNeedsSave[buffer - 1])
-		{
-			bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-			CommitInfoNeedsSave[buffer - 1] = 0;
-		}
 		SpinRelease(BufMgrLock);
 	}

@ -1777,7 +1968,7 @@ IncrBufferRefCount_Debug(char *file, int line, Buffer buffer)

 		fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \
 refcount = %ld, file: %s, line: %d\n",
-				buffer, buf->sb_relname, buf->tag.blockNum,
+				buffer, buf->blind.relname, buf->tag.blockNum,
 				PrivateRefCount[buffer - 1], file, line);
 	}
 }
@ -1795,7 +1986,7 @@ ReleaseBuffer_Debug(char *file, int line, Buffer buffer)

 		fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \
 refcount = %ld, file: %s, line: %d\n",
-				buffer, buf->sb_relname, buf->tag.blockNum,
+				buffer, buf->blind.relname, buf->tag.blockNum,
 				PrivateRefCount[buffer - 1], file, line);
 	}
 }
@ -1822,7 +2013,7 @@ ReleaseAndReadBuffer_Debug(char *file,

 		fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
 refcount = %ld, file: %s, line: %d\n",
-				buffer, buf->sb_relname, buf->tag.blockNum,
+				buffer, buf->blind.relname, buf->tag.blockNum,
 				PrivateRefCount[buffer - 1], file, line);
 	}
 	if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer))
@ -1831,7 +2022,7 @@ refcount = %ld, file: %s, line: %d\n",

 		fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
 refcount = %ld, file: %s, line: %d\n",
-				b, buf->sb_relname, buf->tag.blockNum,
+				b, buf->blind.relname, buf->tag.blockNum,
 				PrivateRefCount[b - 1], file, line);
 	}
 	return b;
@ -1983,11 +2174,43 @@ _bm_die(Oid dbId, Oid relId, int blkNo, int bufNo,

 #endif	 /* BMTRACE */

+/*
+ * SetBufferCommitInfoNeedsSave
+ *
+ *	Mark a buffer dirty when we have updated tuple commit-status bits in it.
+ *
+ * This is similar to WriteNoReleaseBuffer, except that we do not set
+ * SharedBufferChanged or BufferDirtiedByMe, because we have not made a
+ * critical change that has to be flushed to disk before xact commit --- the
+ * status-bit update could be redone by someone else just as easily.  The
+ * buffer will be marked dirty, but it will not be written to disk until
+ * there is another reason to write it.
+ *
+ * This routine might get called many times on the same page, if we are making
+ * the first scan after commit of an xact that added/deleted many tuples.
+ * So, be as quick as we can if the buffer is already dirty.
+ */
 void
 SetBufferCommitInfoNeedsSave(Buffer buffer)
 {
-	if (!BufferIsLocal(buffer))
-		CommitInfoNeedsSave[buffer - 1]++;
+	BufferDesc *bufHdr;
+
+	if (BufferIsLocal(buffer))
+		return;
+
+	if (BAD_BUFFER_ID(buffer))
+		return;
+
+	bufHdr = &BufferDescriptors[buffer - 1];
+
+	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
+		(BM_DIRTY | BM_JUST_DIRTIED))
+	{
+		SpinAcquire(BufMgrLock);
+		Assert(bufHdr->refcount > 0);
+		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+		SpinRelease(BufMgrLock);
+	}
 }

 void
@ -2175,7 +2398,16 @@ static void StartBufferIO(BufferDesc *buf, bool forInput)
 	Assert(!(buf->flags & BM_IO_IN_PROGRESS));
 	buf->flags |= BM_IO_IN_PROGRESS;
 #ifdef	HAS_TEST_AND_SET
-	Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)))
+	/*
+	 * There used to be
+	 *
+	 * Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)));
+	 *
+	 * here, but that's wrong because of the way WaitIO works: someone else
+	 * waiting for the I/O to complete will succeed in grabbing the lock for
+	 * a few instructions, and if we context-swap back to here the Assert
+	 * could fail.  Tiny window for failure, but I've seen it happen -- tgl
+	 */
 	S_LOCK(&(buf->io_in_progress_lock));
 #endif /* HAS_TEST_AND_SET */
 	InProgressBuf = buf;
@ -2217,7 +2449,7 @@ static void ContinueBufferIO(BufferDesc *buf, bool forInput)
 	IsForInput = forInput;
 }

-extern void	InitBufferIO(void)
+void InitBufferIO(void)
 {
 	InProgressBuf = (BufferDesc *)0;
 }
@ -2229,7 +2461,7 @@ extern void	InitBufferIO(void)
 *	set in case of output,this routine would kill all 
 *	backends and reset postmaster.
 */
-extern void	AbortBufferIO(void)
+void AbortBufferIO(void)
 {
 	BufferDesc *buf = InProgressBuf;
 	if (buf)
@ -2252,8 +2484,8 @@ extern void	AbortBufferIO(void)
 			buf->flags |= BM_DIRTY;
 		}
 		buf->flags |= BM_IO_ERROR;
-		TerminateBufferIO(buf);
 		buf->flags &= ~BM_IO_IN_PROGRESS;
+		TerminateBufferIO(buf);
 		SpinRelease(BufMgrLock);
 	}
 }
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@ -9,7 +9,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.20 2000/01/26 05:56:52 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.21 2000/04/09 04:43:19 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -122,7 +122,7 @@ PinBuffer_Debug(char *file, int line, BufferDesc *buf)

 		fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \
 refcount = %ld, file: %s, line: %d\n",
-				buffer, buf->sb_relname, buf->tag.blockNum,
+				buffer, buf->blind.relname, buf->tag.blockNum,
 				PrivateRefCount[buffer - 1], file, line);
 	}
 }
@ -168,7 +168,7 @@ UnpinBuffer_Debug(char *file, int line, BufferDesc *buf)

 		fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \
 refcount = %ld, file: %s, line: %d\n",
-				buffer, buf->sb_relname, buf->tag.blockNum,
+				buffer, buf->blind.relname, buf->tag.blockNum,
 				PrivateRefCount[buffer - 1], file, line);
 	}
 }
@ -304,7 +304,7 @@ PrintBufferFreeList()
 		int			i = (buf - BufferDescriptors);

 		printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n",
-			   i, buf->sb_relname, buf->tag.blockNum,
+			   i, buf->blind.relname, buf->tag.blockNum,
 			   buf->flags, buf->refcount, PrivateRefCount[i],
 			   buf->freeNext, buf->freePrev);

--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.54 2000/03/17 02:36:19 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.55 2000/04/09 04:43:19 tgl Exp $
 *
 * NOTES:
 *
@ -293,7 +293,7 @@ LruDelete(File file)
 	vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
 	Assert(vfdP->seekPos != -1);

-	/* if we have written to the file, sync it */
+	/* if we have written to the file, sync it before closing */
 	if (vfdP->fdstate & FD_DIRTY)
 	{
 		returnValue = pg_fsync(vfdP->fd);
@ -381,9 +381,6 @@ tryAgain:
 			returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
 			Assert(returnValue != -1);
 		}
-
-		/* Update state as appropriate for re-open (needed?) */
-		vfdP->fdstate &= ~FD_DIRTY;
 	}

 	/*
@ -804,7 +801,7 @@ FileWrite(File file, char *buffer, int amount)
 	if (returnCode > 0)
 		VfdCache[file].seekPos += returnCode;

-	/* record the write */
+	/* mark the file as needing fsync */
 	VfdCache[file].fdstate |= FD_DIRTY;

 	return returnCode;
@ -873,6 +870,35 @@ FileTruncate(File file, long offset)
 	return returnCode;
 }

+/*
+ * FileSync --- if a file is marked as dirty, fsync it.
+ *
+ * The FD_DIRTY bit is slightly misnamed: it doesn't mean that we need to
+ * write the file, but that we *have* written it and need to execute an
+ * fsync() to ensure the changes are down on disk before we mark the current
+ * transaction committed.
+ *
+ * FD_DIRTY is set by FileWrite or by an explicit FileMarkDirty() call.
+ * It is cleared after successfully fsync'ing the file.  FileClose() will
+ * fsync a dirty File that is about to be closed, since there will be no
+ * other place to remember the need to fsync after the VFD is gone.
+ *
+ * Note that the DIRTY bit is logically associated with the actual disk file,
+ * not with any particular kernel FD we might have open for it.  We assume
+ * that fsync will force out any dirty buffers for that file, whether or not
+ * they were written through the FD being used for the fsync call --- they
+ * might even have been written by some other backend!
+ *
+ * Note also that LruDelete currently fsyncs a dirty file that it is about
+ * to close the kernel file descriptor for.  The idea there is to avoid
+ * having to re-open the kernel descriptor later.  But it's not real clear
+ * that this is a performance win; we could end up fsyncing the same file
+ * multiple times in a transaction, which would probably cost more time
+ * than is saved by avoiding an open() call.  This should be studied.
+ *
+ * This routine used to think it could skip the fsync if the file is
+ * physically closed, but that is now WRONG; see comments for FileMarkDirty.
+ */
 int
 FileSync(File file)
 {
@ -880,23 +906,66 @@ FileSync(File file)

 	Assert(FileIsValid(file));

-	/*
-	 * If the file isn't open, then we don't need to sync it; we always
-	 * sync files when we close them.  Also, if we haven't done any writes
-	 * that we haven't already synced, we can ignore the request.
-	 */
-
-	if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY))
+	if (!(VfdCache[file].fdstate & FD_DIRTY))
+	{
+		/* Need not sync if file is not dirty. */
+		returnCode = 0;
+	}
+	else if (disableFsync)
+	{
+		/* Don't force the file open if pg_fsync isn't gonna sync it. */
 		returnCode = 0;
+		VfdCache[file].fdstate &= ~FD_DIRTY;
+	}
 	else 
 	{
+		/* We don't use FileAccess() because we don't want to force the
+		 * file to the front of the LRU ring; we aren't expecting to
+		 * access it again soon.
+		 */
+		if (FileIsNotOpen(file))
+		{
+			returnCode = LruInsert(file);
+			if (returnCode != 0)
+				return returnCode;
+		}
 		returnCode = pg_fsync(VfdCache[file].fd);
+		if (returnCode == 0)
 			VfdCache[file].fdstate &= ~FD_DIRTY;
 	}

 	return returnCode;
 }

+/*
+ * FileMarkDirty --- mark a file as needing fsync at transaction commit.
+ *
+ * Since FileWrite marks the file dirty, this routine is not needed in
+ * normal use.  It is called when the buffer manager detects that some other
+ * backend has written out a shared buffer that this backend dirtied (but
+ * didn't write) in the current xact.  In that scenario, we need to fsync
+ * the file before we can commit.  We cannot assume that the other backend
+ * has fsync'd the file yet; we need to do our own fsync to ensure that
+ * (a) the disk page is written and (b) this backend's commit is delayed
+ * until the write is complete.
+ *
+ * Note we are assuming that an fsync issued by this backend will write
+ * kernel disk buffers that were dirtied by another backend.  Furthermore,
+ * it doesn't matter whether we currently have the file physically open;
+ * we must fsync even if we have to re-open the file to do it.
+ *
+ * This routine performs no I/O itself: it only sets FD_DIRTY in the file's
+ * VfdCache entry.  The fsync is issued by a later FileSync() call, which
+ * clears the bit again once the fsync has succeeded.
+ *
+ * "file" must be a valid virtual file descriptor (checked by Assert only).
+ */
+void
+FileMarkDirty(File file)
+{
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(DEBUG, "FileMarkDirty: %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	/* remember that this file needs an fsync; see FileSync() */
+	VfdCache[file].fdstate |= FD_DIRTY;
+}
+
+
 /*
 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
 * rather than plain fopen().  This lets fd.c deal with freeing FDs if
@ -992,6 +1061,12 @@ closeAllVfds()
 * exit (it doesn't particularly care which).  All still-open temporary-file
 * VFDs are closed, which also causes the underlying files to be deleted.
 * Furthermore, all "allocated" stdio files are closed.
+ *
+ * This routine is not involved in fsync'ing non-temporary files at xact
+ * commit; that is done by FileSync under control of the buffer manager.
+ * During a commit, that is done *before* control gets here.  If we still
+ * have any needs-fsync bits set when we get here, we assume this is abort
+ * and clear them.
 */
 void
 AtEOXact_Files(void)
@ -1006,6 +1081,8 @@ AtEOXact_Files(void)
 			if ((VfdCache[i].fdstate & FD_TEMPORARY) &&
 				VfdCache[i].fileName != NULL)
 				FileClose(i);
+			else
+				VfdCache[i].fdstate &= ~FD_DIRTY;
 		}
 	}

--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.64 2000/02/07 02:38:18 inoue Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.65 2000/04/09 04:43:20 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -48,7 +48,12 @@
 typedef struct _MdfdVec
 {
 	int			mdfd_vfd;		/* fd number in vfd pool */
-	uint16		mdfd_flags;		/* clean, dirty, free */
+	int			mdfd_flags;		/* free, temporary */
+
+/* these are the assigned bits in mdfd_flags: */
+#define MDFD_FREE		(1 << 0)/* unused entry */
+#define MDFD_TEMP		(1 << 1)/* close this entry at transaction end */
+
 	int			mdfd_lstbcnt;	/* most recent block count */
 	int			mdfd_nextFree;	/* next free vector */
 #ifndef LET_OS_MANAGE_FILESIZE
@ -62,13 +67,13 @@ static int	Md_Free = -1;		/* head of freelist of unused fdvec entries */
 static int	CurFd = 0;			/* first never-used fdvec index */
 static MemoryContext MdCxt;		/* context for all my allocations */

-#define MDFD_DIRTY		(uint16) 0x01
-#define MDFD_FREE		(uint16) 0x02
-
 /* routines declared here */
+static void mdclose_fd(int fd);
 static int _mdfd_getrelnfd(Relation reln);
 static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
 static MdfdVec *_mdfd_getseg(Relation reln, int blkno);
+static MdfdVec *_mdfd_blind_getseg(char *dbname, char *relname,
+								   Oid dbid, Oid relid, int blkno);
 static int	_fdvec_alloc(void);
 static void _fdvec_free(int);
 static BlockNumber _mdnblocks(File file, Size blcksz);
@ -186,6 +191,8 @@ mdcreate(Relation reln)
 #endif
 	Md_fdvec[vfd].mdfd_lstbcnt = 0;

+	pfree(path);
+
 	return vfd;
 }

@ -290,9 +297,6 @@ mdextend(Relation reln, char *buffer)
 		return SM_FAIL;
 	}

-	/* remember that we did a write, so we can sync at xact commit */
-	v->mdfd_flags |= MDFD_DIRTY;
-
 	/* try to keep the last block count current, though it's just a hint */
 #ifndef LET_OS_MANAGE_FILESIZE
 	if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
@ -367,6 +371,8 @@ mdopen(Relation reln)
 #endif
 #endif

+	pfree(path);
+
 	return vfd;
 }

@ -382,13 +388,24 @@ int
 mdclose(Relation reln)
 {
 	int			fd;
-	MdfdVec    *v;
-	MemoryContext oldcxt;

 	fd = RelationGetFile(reln);
 	if (fd < 0)
 		return SM_SUCCESS;		/* already closed, so no work */

+	mdclose_fd(fd);
+
+	reln->rd_fd = -1;
+
+	return SM_SUCCESS;
+}
+
+static void
+mdclose_fd(int fd)
+{
+	MdfdVec    *v;
+	MemoryContext oldcxt;
+
 	oldcxt = MemoryContextSwitchTo(MdCxt);
 #ifndef LET_OS_MANAGE_FILESIZE
 	for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
@ -398,17 +415,14 @@ mdclose(Relation reln)
 		/* if not closed already */
 		if (v->mdfd_vfd >= 0)
 		{
-
 			/*
 			 * We sync the file descriptor so that we don't need to reopen
-			 * it at transaction commit to force changes to disk.
+			 * it at transaction commit to force changes to disk.  (This
+			 * is not really optional, because we are about to forget that
+			 * the file even exists...)
 			 */
-
 			FileSync(v->mdfd_vfd);
 			FileClose(v->mdfd_vfd);
-
-			/* mark this file descriptor as clean in our private table */
-			v->mdfd_flags &= ~MDFD_DIRTY;
 		}
 		/* Now free vector */
 		v = v->mdfd_chain;
@ -423,28 +437,20 @@ mdclose(Relation reln)
 	{
 		if (v->mdfd_vfd >= 0)
 		{
-
 			/*
 			 * We sync the file descriptor so that we don't need to reopen
-			 * it at transaction commit to force changes to disk.
+			 * it at transaction commit to force changes to disk.  (This
+			 * is not really optional, because we are about to forget that
+			 * the file even exists...)
 			 */
-
 			FileSync(v->mdfd_vfd);
 			FileClose(v->mdfd_vfd);
-
-			/* mark this file descriptor as clean in our private table */
-			v->mdfd_flags &= ~MDFD_DIRTY;
 		}
 	}
 #endif
 	MemoryContextSwitchTo(oldcxt);

 	_fdvec_free(fd);
-
-	/* be sure to mark relation closed */
-	reln->rd_fd = -1;
-
-	return SM_SUCCESS;
 }

 /*
@ -521,8 +527,6 @@ mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
 	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
 		status = SM_FAIL;

-	v->mdfd_flags |= MDFD_DIRTY;
-
 	return status;
 }

@ -560,14 +564,6 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer)
 		|| FileSync(v->mdfd_vfd) < 0)
 		status = SM_FAIL;

-	/*
-	 * By here, the block is written and changes have been forced to
-	 * stable storage.	Mark the descriptor as clean until the next write,
-	 * so we don't sync it again unnecessarily at transaction commit.
-	 */
-
-	v->mdfd_flags &= ~MDFD_DIRTY;
-
 	return status;
 }

@ -575,139 +571,87 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer)
 *	mdblindwrt() -- Write a block to disk blind.
 *
 *		We have to be able to do this using only the name and OID of
- *		the database and relation in which the block belongs.  This
- *		is a synchronous write.
+ *		the database and relation in which the block belongs.  Otherwise
+ *		this is just like mdwrite().
 */
 int
-mdblindwrt(char *dbstr,
-		   char *relstr,
+mdblindwrt(char *dbname,
+		   char *relname,
 		   Oid dbid,
 		   Oid relid,
 		   BlockNumber blkno,
 		   char *buffer)
 {
-	int			fd;
-	int			segno;
-	long		seekpos;
 	int			status;
-	char	   *path;
-
-#ifndef LET_OS_MANAGE_FILESIZE
-	int			nchars;
+	long		seekpos;
+	MdfdVec    *v;

-	/* be sure we have enough space for the '.segno', if any */
-	segno = blkno / RELSEG_SIZE;
-	if (segno > 0)
-		nchars = 10;
-	else
-		nchars = 0;
+	v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno);

-	/* construct the path to the file and open it */
-	/* system table? then put in system area... */
-	if (dbid == (Oid) 0)
-	{
-		path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
-		if (segno == 0)
-			sprintf(path, "%s/%s", DataDir, relstr);
-		else
-			sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
-	}
-	/* user table? then put in user database area... */
-	else if (dbid == MyDatabaseId)
-	{
-		path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars);
-		if (segno == 0)
-			sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
-		else
-			sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno);
-	}
-	else
-/* this is work arround only !!! */
-	{
-		char		dbpath[MAXPGPATH];
-		Oid			id;
-		char	   *tmpPath;
-
-		GetRawDatabaseInfo(dbstr, &id, dbpath);
-
-		if (id != dbid)
-			elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
-		tmpPath = ExpandDatabasePath(dbpath);
-		if (tmpPath == NULL)
-			elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
-		path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars);
-		if (segno == 0)
-			sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
-		else
-			sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno);
-		pfree(tmpPath);
-	}
-#else
-	/* construct the path to the file and open it */
-	/* system table? then put in system area... */
-	if (dbid == (Oid) 0)
-	{
-		path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2);
-		sprintf(path, "%s/%s", DataDir, relstr);
-	}
-	/* user table? then put in user database area... */
-	else if (dbid == MyDatabaseId)
-	{
-		path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2);
-		sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
-	}
-	else
-/* this is work arround only !!! */
-	{
-		char		dbpath[MAXPGPATH];
-		Oid			id;
-		char	   *tmpPath;
-
-		GetRawDatabaseInfo(dbstr, &id, dbpath);
-
-		if (id != dbid)
-			elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
-		tmpPath = ExpandDatabasePath(dbpath);
-		if (tmpPath == NULL)
-			elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
-		path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2);
-		sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
-		pfree(tmpPath);
-	}
-#endif
-
-#ifndef __CYGWIN32__
-	if ((fd = open(path, O_RDWR, 0600)) < 0)
-#else
-	if ((fd = open(path, O_RDWR | O_BINARY, 0600)) < 0)
-#endif
+	if (v == NULL)
 		return SM_FAIL;

-	/* seek to the right spot */
 #ifndef LET_OS_MANAGE_FILESIZE
 	seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
+#ifdef DIAGNOSTIC
+	if (seekpos >= BLCKSZ * RELSEG_SIZE)
+		elog(FATAL, "seekpos too big!");
+#endif
 #else
 	seekpos = (long) (BLCKSZ * (blkno));
 #endif

-	if (lseek(fd, seekpos, SEEK_SET) != seekpos)
-	{
-		close(fd);
+	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 		return SM_FAIL;
-	}

 	status = SM_SUCCESS;
-
-	/* write and sync the block */
-	if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
+	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
 		status = SM_FAIL;

-	if (close(fd) < 0)
-		status = SM_FAIL;
+	return status;
+}

-	pfree(path);
+/*
+ *	mdmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
+ *
+ *		Returns SM_SUCCESS or SM_FAIL.
+ */
+int
+mdmarkdirty(Relation reln, BlockNumber blkno)
+{
+	MdfdVec    *v;

-	return status;
+	v = _mdfd_getseg(reln, blkno);
+
+	FileMarkDirty(v->mdfd_vfd);
+
+	return SM_SUCCESS;
+}
+
+/*
+ *	mdblindmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
+ *
+ *		We have to be able to do this using only the name and OID of
+ *		the database and relation in which the block belongs.  Otherwise
+ *		this is just like mdmarkdirty().
+ *
+ *		Returns SM_SUCCESS, or SM_FAIL if _mdfd_blind_getseg() cannot
+ *		supply a segment entry for the block.
+ */
+int
+mdblindmarkdirty(char *dbname,
+				 char *relname,
+				 Oid dbid,
+				 Oid relid,
+				 BlockNumber blkno)
+{
+	MdfdVec    *v;
+
+	v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno);
+
+	if (v == NULL)
+		return SM_FAIL;
+
+	/*
+	 * FileMarkDirty just sets the needs-fsync flag on the segment's file;
+	 * the fsync itself is issued later through FileSync().
+	 */
+	FileMarkDirty(v->mdfd_vfd);
+
+	return SM_SUCCESS;
 }

 /*
@ -873,19 +817,26 @@ mdcommit()

 	for (i = 0; i < CurFd; i++)
 	{
+		v = &Md_fdvec[i];
+		if (v->mdfd_flags & MDFD_FREE)
+			continue;
+		if (v->mdfd_flags & MDFD_TEMP)
+		{
+			/* Sync and close the file */
+			mdclose_fd(i);
+		}
+		else
+		{
+			/* Sync, but keep the file entry */
+
 #ifndef LET_OS_MANAGE_FILESIZE
-		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
+			for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain)
 #else
-		v = &Md_fdvec[i];
 			if (v != (MdfdVec *) NULL)
 #endif
-		{
-			if (v->mdfd_flags & MDFD_DIRTY)
 			{
 				if (FileSync(v->mdfd_vfd) < 0)
 					return SM_FAIL;
-
-				v->mdfd_flags &= ~MDFD_DIRTY;
 			}
 		}
 	}
@ -908,13 +859,14 @@ mdabort()

 	for (i = 0; i < CurFd; i++)
 	{
-#ifndef LET_OS_MANAGE_FILESIZE
-		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
-			v->mdfd_flags &= ~MDFD_DIRTY;
-#else
 		v = &Md_fdvec[i];
-		v->mdfd_flags &= ~MDFD_DIRTY;
-#endif
+		if (v->mdfd_flags & MDFD_FREE)
+			continue;
+		if (v->mdfd_flags & MDFD_TEMP)
+		{
+			/* Close the file */
+			mdclose_fd(i);
+		}
 	}

 	return SM_SUCCESS;
@ -995,7 +947,6 @@ _fdvec_free(int fdvec)
 	Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
 	Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
 	Md_Free = fdvec;
-
 }

 static MdfdVec *
@ -1004,19 +955,17 @@ _mdfd_openseg(Relation reln, int segno, int oflags)
 	MemoryContext oldcxt;
 	MdfdVec    *v;
 	int			fd;
-	bool		dofree;
 	char	   *path,
 			   *fullpath;

 	/* be sure we have enough space for the '.segno', if any */
 	path = relpath(RelationGetPhysicalRelationName(reln));

-	dofree = false;
 	if (segno > 0)
 	{
-		dofree = true;
 		fullpath = (char *) palloc(strlen(path) + 12);
 		sprintf(fullpath, "%s.%d", path, segno);
+		pfree(path);
 	}
 	else
 		fullpath = path;
@ -1028,7 +977,6 @@ _mdfd_openseg(Relation reln, int segno, int oflags)
 	fd = FileNameOpenFile(fullpath, O_RDWR | O_BINARY | oflags, 0600);
 #endif

-	if (dofree)
 	pfree(fullpath);

 	if (fd < 0)
@ -1109,6 +1057,104 @@ _mdfd_getseg(Relation reln, int blkno)
 	return v;
 }

+/* Find the segment of the relation holding the specified block.
+ * This is the same as _mdfd_getseg() except that we must work
+ * "blind" with no Relation struct.
+ *
+ * The file is located from the database/relation names and OIDs via
+ * relpath_blind().  Returns the MdfdVec entry for the segment containing
+ * blkno, or NULL if a needed file cannot be opened or no fdvec slot can
+ * be allocated.
+ *
+ * NOTE: we have no easy way to tell whether a FD already exists for the
+ * target relation, so we always make a new one.  This should probably
+ * be improved somehow, but I doubt it's a significant performance issue
+ * under normal circumstances.  The FD is marked to be closed at end of xact
+ * so that we don't accumulate a lot of dead FDs.
+ *
+ * Every failure exit frees the path string.  If no fdvec slot could be
+ * allocated, the file opened for the first segment is closed as well,
+ * since nothing else would remember it.
+ */
+
+static MdfdVec *
+_mdfd_blind_getseg(char *dbname, char *relname, Oid dbid, Oid relid,
+				   int blkno)
+{
+	MdfdVec    *v;
+	char	   *path;
+	int			fd;
+	int			vfd;
+#ifndef LET_OS_MANAGE_FILESIZE
+	int			segno;
+	int			targsegno;
+#endif
+
+	/* construct the path to the file and open it */
+	path = relpath_blind(dbname, relname, dbid, relid);
+
+#ifndef __CYGWIN32__
+	fd = FileNameOpenFile(path, O_RDWR, 0600);
+#else
+	fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);
+#endif
+
+	if (fd < 0)
+	{
+		pfree(path);
+		return NULL;
+	}
+
+	vfd = _fdvec_alloc();
+	if (vfd < 0)
+	{
+		/* no fdvec entry will ever own this file, so close it right here */
+		FileClose(fd);
+		pfree(path);
+		return NULL;
+	}
+
+	Md_fdvec[vfd].mdfd_vfd = fd;
+	Md_fdvec[vfd].mdfd_flags = MDFD_TEMP;
+	Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
+#ifndef LET_OS_MANAGE_FILESIZE
+	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
+
+#ifdef DIAGNOSTIC
+	if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
+		elog(FATAL, "segment too big on relopen!");
+#endif
+
+	/* open each later segment up to the one that holds blkno */
+	targsegno = blkno / RELSEG_SIZE;
+	for (v = &Md_fdvec[vfd], segno = 1; segno <= targsegno; segno++)
+	{
+		char	   *segpath;
+		MdfdVec    *newv;
+		MemoryContext oldcxt;
+
+		segpath = (char *) palloc(strlen(path) + 12);
+		sprintf(segpath, "%s.%d", path, segno);
+
+#ifndef __CYGWIN32__
+		fd = FileNameOpenFile(segpath, O_RDWR | O_CREAT, 0600);
+#else
+		fd = FileNameOpenFile(segpath, O_RDWR | O_BINARY | O_CREAT, 0600);
+#endif
+
+		pfree(segpath);
+
+		if (fd < 0)
+		{
+			/*
+			 * Segments opened so far remain chained to the MDFD_TEMP
+			 * entry allocated above; only the path string needs freeing.
+			 */
+			pfree(path);
+			return (MdfdVec *) NULL;
+		}
+
+		/* allocate an mdfdvec entry for it */
+		oldcxt = MemoryContextSwitchTo(MdCxt);
+		newv = (MdfdVec *) palloc(sizeof(MdfdVec));
+		MemoryContextSwitchTo(oldcxt);
+
+		/* fill the entry */
+		newv->mdfd_vfd = fd;
+		newv->mdfd_flags = MDFD_TEMP;
+		newv->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
+		newv->mdfd_chain = (MdfdVec *) NULL;
+#ifdef DIAGNOSTIC
+		if (newv->mdfd_lstbcnt > RELSEG_SIZE)
+			elog(FATAL, "segment too big on open!");
+#endif
+		v->mdfd_chain = newv;
+		v = newv;
+	}
+#else
+	v = &Md_fdvec[vfd];
+#endif
+
+	pfree(path);
+
+	return v;
+}
+
 static BlockNumber
 _mdnblocks(File file, Size blcksz)
 {
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@ -11,7 +11,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.32 2000/01/26 05:57:05 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.33 2000/04/09 04:43:20 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -23,21 +23,30 @@ static void smgrshutdown(int dummy);

 typedef struct f_smgr
 {
-	int			(*smgr_init) ();/* may be NULL */
-	int			(*smgr_shutdown) ();	/* may be NULL */
-	int			(*smgr_create) ();
-	int			(*smgr_unlink) ();
-	int			(*smgr_extend) ();
-	int			(*smgr_open) ();
-	int			(*smgr_close) ();
-	int			(*smgr_read) ();
-	int			(*smgr_write) ();
-	int			(*smgr_flush) ();
-	int			(*smgr_blindwrt) ();
-	int			(*smgr_nblocks) ();
-	int			(*smgr_truncate) ();
-	int			(*smgr_commit) ();		/* may be NULL */
-	int			(*smgr_abort) ();		/* may be NULL */
+	int			(*smgr_init) (void); /* may be NULL */
+	int			(*smgr_shutdown) (void); /* may be NULL */
+	int			(*smgr_create) (Relation reln);
+	int			(*smgr_unlink) (Relation reln);
+	int			(*smgr_extend) (Relation reln, char *buffer);
+	int			(*smgr_open) (Relation reln);
+	int			(*smgr_close) (Relation reln);
+	int			(*smgr_read) (Relation reln, BlockNumber blocknum,
+							  char *buffer);
+	int			(*smgr_write) (Relation reln, BlockNumber blocknum,
+							   char *buffer);
+	int			(*smgr_flush) (Relation reln, BlockNumber blocknum,
+							   char *buffer);
+	int			(*smgr_blindwrt) (char *dbname, char *relname,
+								  Oid dbid, Oid relid,
+								  BlockNumber blkno, char *buffer);
+	int			(*smgr_markdirty) (Relation reln, BlockNumber blkno);
+	int			(*smgr_blindmarkdirty) (char *dbname, char *relname,
+										Oid dbid, Oid relid,
+										BlockNumber blkno);
+	int			(*smgr_nblocks) (Relation reln);
+	int			(*smgr_truncate) (Relation reln, int nblocks);
+	int			(*smgr_commit) (void); /* may be NULL */
+	int			(*smgr_abort) (void); /* may be NULL */
 } f_smgr;

 /*
@ -49,14 +58,14 @@ static f_smgr smgrsw[] = {

 	/* magnetic disk */
 	{mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
-		mdread, mdwrite, mdflush, mdblindwrt, mdnblocks, mdtruncate,
-	mdcommit, mdabort},
+	 mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty,
+	 mdnblocks, mdtruncate, mdcommit, mdabort},

 #ifdef STABLE_MEMORY_STORAGE
 	/* main memory */
 	{mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
-		mmread, mmwrite, mmflush, mmblindwrt, mmnblocks, NULL,
-	mmcommit, mmabort},
+	 mmread, mmwrite, mmflush, mmblindwrt, mmmarkdirty, mmblindmarkdirty,
+	 mmnblocks, NULL, mmcommit, mmabort},

 #endif
 };
@ -299,6 +308,7 @@ smgrblindwrt(int16 which,
 	char	   *relstr;
 	int			status;

+	/* strdup here is probably redundant */
 	dbstr = pstrdup(dbname);
 	relstr = pstrdup(relname);

@ -315,6 +325,67 @@ smgrblindwrt(int16 which,
 	return status;
 }

+/*
+ *	smgrmarkdirty() -- Mark a page dirty (needs fsync).
+ *
+ *		Mark the specified page as needing to be fsync'd before commit.
+ *		Ordinarily, the storage manager will do this implicitly during
+ *		smgrwrite().  However, the buffer manager may discover that some
+ *		other backend has written a buffer that we dirtied in the current
+ *		transaction.  In that case, we still need to fsync the file to be
+ *		sure the page is down to disk before we commit.
+ *
+ *		Returns the storage manager's status; SM_FAIL is reported via elog.
+ */
+int
+smgrmarkdirty(int16 which,
+			  Relation reln,
+			  BlockNumber blkno)
+{
+	int			status = (*(smgrsw[which].smgr_markdirty)) (reln, blkno);
+
+	if (status == SM_FAIL)
+		elog(ERROR, "cannot mark block %u of %s",
+			 blkno, RelationGetRelationName(reln));
+
+	return status;
+}
+
+/*
+ *	smgrblindmarkdirty() -- Mark a page dirty, "blind".
+ *
+ *		Just like smgrmarkdirty, except we don't have a reldesc: the page is
+ *		identified by dbname/relname, dbid/relid and blkno.  SM_FAIL is
+ *		reported via elog; otherwise the storage manager's status is returned.
+ */
+int
+smgrblindmarkdirty(int16 which,
+				   char *dbname,
+				   char *relname,
+				   Oid dbid,
+				   Oid relid,
+				   BlockNumber blkno)
+{
+	char	   *dbstr;
+	char	   *relstr;
+	int			status;
+
+	/* pstrdup here is probably redundant */
+	dbstr = pstrdup(dbname);
+	relstr = pstrdup(relname);
+
+	status = (*(smgrsw[which].smgr_blindmarkdirty)) (dbstr, relstr,
+													 dbid, relid, blkno);
+
+	if (status == SM_FAIL)
+		elog(ERROR, "cannot mark block %u of %s [%s] blind",
+			 blkno, relstr, dbstr);
+
+	pfree(dbstr);
+	pfree(relstr);
+
+	return status;
+}
+
 /*
 *	smgrnblocks() -- Calculate the number of POSTGRES blocks in the
 *					 supplied relation.
@ -378,7 +449,6 @@ smgrcommit()
 	return SM_SUCCESS;
 }

-#ifdef NOT_USED
 int
 smgrabort()
 {
@ -396,8 +466,6 @@ smgrabort()
 	return SM_SUCCESS;
 }

-#endif
-
 #ifdef NOT_USED
 bool
 smgriswo(int16 smgrno)
--- a/src/include/catalog/catalog.h
+++ b/src/include/catalog/catalog.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: catalog.h,v 1.10 2000/01/26 05:57:56 momjian Exp $
+ * $Id: catalog.h,v 1.11 2000/04/09 04:43:14 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -17,6 +17,8 @@
 #include "access/tupdesc.h"

 extern char *relpath(const char *relname);
+extern char *relpath_blind(const char *dbname, const char *relname,
+						   Oid dbid, Oid relid);
 extern bool IsSystemRelationName(const char *relname);
 extern bool IsSharedSystemRelationName(const char *relname);
 extern Oid	newoid(void);
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: buf_internals.h,v 1.35 2000/01/26 05:58:32 momjian Exp $
+ * $Id: buf_internals.h,v 1.36 2000/04/09 04:43:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -61,6 +61,16 @@ typedef struct buftag
 	(a)->relId = (xx_reln)->rd_lockInfo.lockRelId \
 )

+/* If we have to write a buffer "blind" (without a relcache entry),
+ * the BufferTag is not enough information.  BufferBlindId carries the
+ * additional information needed: the database and relation names.
+ */
+typedef struct bufblindid
+{
+	char		dbname[NAMEDATALEN]; /* name of db in which buf belongs */
+	char		relname[NAMEDATALEN]; /* name of reln the buf belongs to */
+} BufferBlindId;
+
 #define BAD_BUFFER_ID(bid) ((bid) < 1 || (bid) > NBuffers)
 #define INVALID_DESCRIPTOR (-3)

@ -98,8 +108,7 @@ typedef struct sbufdesc
 	bool		ri_lock;		/* read-intent lock */
 	bool		w_lock;			/* context exclusively locked */

-	char		sb_dbname[NAMEDATALEN]; /* name of db in which buf belongs */
-	char		sb_relname[NAMEDATALEN];		/* name of reln */
+	BufferBlindId blind;		/* extra info to support blind write */
 } BufferDesc;

 /*
@ -164,7 +173,9 @@ extern BufferDesc *BufferDescriptors;
 extern BufferBlock BufferBlocks;
 extern long *PrivateRefCount;
 extern bits8 *BufferLocks;
-extern long *CommitInfoNeedsSave;
+extern BufferTag *BufferTagLastDirtied;
+extern BufferBlindId *BufferBlindLastDirtied;
+extern bool *BufferDirtiedByMe;
 extern SPINLOCK BufMgrLock;

 /* localbuf.c */
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: bufmgr.h,v 1.35 2000/03/31 02:43:30 tgl Exp $
+ * $Id: bufmgr.h,v 1.36 2000/04/09 04:43:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -164,7 +164,7 @@ extern int	FlushBuffer(Buffer buffer, bool release);
 extern void InitBufferPool(IPCKey key);
 extern void PrintBufferUsage(FILE *statfp);
 extern void ResetBufferUsage(void);
-extern void ResetBufferPool(void);
+extern void ResetBufferPool(bool isCommit);
 extern int	BufferPoolCheckLeak(void);
 extern void FlushBufferPool(void);
 extern BlockNumber BufferGetBlockNumber(Buffer buffer);
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: fd.h,v 1.19 2000/01/26 05:58:32 momjian Exp $
+ * $Id: fd.h,v 1.20 2000/04/09 04:43:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -15,7 +15,7 @@
 /*
 * calls:
 *
- *	File {Close, Read, Write, Seek, Tell, Sync}
+ *	File {Close, Read, Write, Seek, Tell, MarkDirty, Sync}
 *	{File Name Open, Allocate, Free} File
 *
 * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
@ -58,6 +58,7 @@ extern int	FileWrite(File file, char *buffer, int amount);
 extern long FileSeek(File file, long offset, int whence);
 extern int	FileTruncate(File file, long offset);
 extern int	FileSync(File file);
+extern void FileMarkDirty(File file);

 /* Operations that allow use of regular stdio --- USE WITH CAUTION */
 extern FILE *AllocateFile(char *name, char *mode);
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: smgr.h,v 1.17 2000/01/26 05:58:33 momjian Exp $
+ * $Id: smgr.h,v 1.18 2000/04/09 04:43:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -35,11 +35,17 @@ extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum,
 					  char *buffer);
 extern int	smgrflush(int16 which, Relation reln, BlockNumber blocknum,
 					  char *buffer);
-extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid,
-			 Oid relid, BlockNumber blkno, char *buffer);
+extern int	smgrblindwrt(int16 which, char *dbname, char *relname,
+						 Oid dbid, Oid relid,
+						 BlockNumber blkno, char *buffer);
+extern int	smgrmarkdirty(int16 which, Relation reln, BlockNumber blkno);
+extern int	smgrblindmarkdirty(int16 which, char *dbname, char *relname,
+							   Oid dbid, Oid relid,
+							   BlockNumber blkno);
 extern int	smgrnblocks(int16 which, Relation reln);
 extern int	smgrtruncate(int16 which, Relation reln, int nblocks);
 extern int	smgrcommit(void);
+extern int	smgrabort(void);



@ -55,8 +61,11 @@ extern int	mdclose(Relation reln);
 extern int	mdread(Relation reln, BlockNumber blocknum, char *buffer);
 extern int	mdwrite(Relation reln, BlockNumber blocknum, char *buffer);
 extern int	mdflush(Relation reln, BlockNumber blocknum, char *buffer);
-extern int mdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
+extern int	mdblindwrt(char *dbname, char *relname, Oid dbid, Oid relid,
 					   BlockNumber blkno, char *buffer);
+extern int	mdmarkdirty(Relation reln, BlockNumber blkno);
+extern int	mdblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid,
+							 BlockNumber blkno);
 extern int	mdnblocks(Relation reln);
 extern int	mdtruncate(Relation reln, int nblocks);
 extern int	mdcommit(void);
@ -66,7 +75,6 @@ extern int	mdabort(void);
 extern SPINLOCK MMCacheLock;

 extern int	mminit(void);
-extern int	mmshutdown(void);
 extern int	mmcreate(Relation reln);
 extern int	mmunlink(Relation reln);
 extern int	mmextend(Relation reln, char *buffer);
@ -75,11 +83,17 @@ extern int	mmclose(Relation reln);
 extern int	mmread(Relation reln, BlockNumber blocknum, char *buffer);
 extern int	mmwrite(Relation reln, BlockNumber blocknum, char *buffer);
 extern int	mmflush(Relation reln, BlockNumber blocknum, char *buffer);
-extern int mmblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
+extern int	mmblindwrt(char *dbname, char *relname, Oid dbid, Oid relid,
 					   BlockNumber blkno, char *buffer);
+extern int	mmmarkdirty(Relation reln, BlockNumber blkno);
+extern int	mmblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid,
+							 BlockNumber blkno);
 extern int	mmnblocks(Relation reln);
+extern int	mmtruncate(Relation reln, int nblocks);
 extern int	mmcommit(void);
 extern int	mmabort(void);
+
+extern int	mmshutdown(void);
 extern int	MMShmemSize(void);

 /* smgrtype.c */