Revise bgwriter fsync-request mechanism to improve robustness when a table

is deleted. A backend about to unlink a file now sends a "revoke fsync" request to the bgwriter to make it clean out pending fsync requests. There is still a race condition where the bgwriter may try to fsync after the unlink has happened, but we can resolve that by rechecking the fsync request queue to see if a revoke request arrived meanwhile. This eliminates the former kluge of "just assuming" that an ENOENT failure is okay, and lets us handle the fact that on Windows it might be EACCES too without introducing any questionable assumptions. After an idea of mine improved by Magnus. The HEAD patch doesn't apply cleanly to 8.2, but I'll see about a back-port later. In the meantime this could do with some testing on Windows; I've been able to force it through the code path via ENOENT, but that doesn't prove that it actually fixes the Windows problem ...
19 years ago · 6d660587f6
parent 7f58ed1a10
commit 6d660587f6
2 changed files with 213 additions and 98 deletions
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@ -2,7 +2,7 @@
 *
 * bgwriter.c
 *
- * The background writer (bgwriter) is new in Postgres 8.0.  It attempts
+ * The background writer (bgwriter) is new as of Postgres 8.0.  It attempts
 * to keep regular backends from having to write out dirty shared buffers
 * (which they would only do when needing to free a shared buffer to read in
 * another page).  In the best scenario all writes from shared buffers will
@ -37,7 +37,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.34 2007/01/05 22:19:36 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.35 2007/01/17 00:17:20 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -103,8 +103,8 @@
 typedef struct
 {
 	RelFileNode rnode;
-	BlockNumber segno;
-	/* might add a request-type field later */
+	BlockNumber segno;			/* InvalidBlockNumber means "revoke" */
+	/* might add a real request-type field later; not needed yet */
 } BgWriterRequest;

 typedef struct
@ -695,6 +695,11 @@ RequestCheckpoint(bool waitforit, bool warnontime)
 * ForwardFsyncRequest
 *		Forward a file-fsync request from a backend to the bgwriter
 *
+ * segno specifies which segment (not block!) of the relation needs to be
+ * fsync'd.  If segno == InvalidBlockNumber, the meaning is to revoke any
+ * pending fsync requests for the entire relation (this message is sent
+ * when the relation is about to be deleted).
+ *
 * Whenever a backend is compelled to write directly to a relation
 * (which should be seldom, if the bgwriter is getting its job done),
 * the backend calls this routine to pass over knowledge that the relation
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.125 2007/01/05 22:19:38 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.126 2007/01/17 00:17:21 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -31,6 +31,19 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB		10

+/*
+ * On Windows, we have to interpret EACCES as possibly meaning the same as
+ * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
+ * that's what you get.  Ugh.  This code is designed so that we don't
+ * actually believe these cases are okay without further evidence (namely,
+ * a pending fsync request getting revoked ... see mdsync).
+ */
+#ifndef WIN32
+#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT)
+#else
+#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT || (err) == EACCES)
+#endif
+
 /*
 *	The magnetic disk storage manager keeps track of open file
 *	descriptors in its own descriptor pool.  This is done to make it
@ -93,9 +106,8 @@ static MemoryContext MdCxt;		/* context for all md.c allocations */
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
- * table remembers the pending operations.	We use a hash table not because
- * we want to look up individual operations, but simply as a convenient way
- * of eliminating duplicate requests.
+ * table remembers the pending operations.	We use a hash table mostly as
+ * a convenient way of eliminating duplicate requests.
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the bgwriter.)
@ -104,6 +116,12 @@ typedef struct
 {
 	RelFileNode rnode;			/* the targeted relation */
 	BlockNumber segno;			/* which segment */
+} PendingOperationTag;
+
+typedef struct
+{
+	PendingOperationTag tag;	/* hash table key (must be first!) */
+	int			failures;		/* number of failed attempts to fsync */
 } PendingOperationEntry;

 static HTAB *pendingOpsTable = NULL;
@ -153,7 +171,7 @@ mdinit(void)
 		HASHCTL		hash_ctl;

 		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-		hash_ctl.keysize = sizeof(PendingOperationEntry);
+		hash_ctl.keysize = sizeof(PendingOperationTag);
 		hash_ctl.entrysize = sizeof(PendingOperationEntry);
 		hash_ctl.hash = tag_hash;
 		hash_ctl.hcxt = MdCxt;
@ -236,6 +254,35 @@ mdunlink(RelFileNode rnode, bool isRedo)
 {
 	char	   *path;

+	/*
+	 * We have to clean out any pending fsync requests for the doomed relation,
+	 * else the next mdsync() will fail.
+	 */
+	if (pendingOpsTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(rnode, InvalidBlockNumber);
+	}
+	else if (IsUnderPostmaster)
+	{
+		/*
+		 * Notify the bgwriter about it.  If we fail to queue the revoke
+		 * message, we have to sleep and try again ... ugly, but hopefully
+		 * won't happen often.
+		 *
+		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
+		 * an error would leave the no-longer-used file still present on
+		 * disk, which would be bad, so I'm inclined to assume that the
+		 * bgwriter will always empty the queue soon.
+		 */
+		while (!ForwardFsyncRequest(rnode, InvalidBlockNumber))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+		/*
+		 * Note we don't wait for the bgwriter to actually absorb the
+		 * revoke message; see mdsync() for the implications.
+		 */
+	}
+
 	path = relpath(rnode);

 	/* Delete the first segment, or only segment if not doing segmenting */
@ -414,7 +461,8 @@ mdopen(SMgrRelation reln, ExtensionBehavior behavior)
 		if (fd < 0)
 		{
 			pfree(path);
-			if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
+			if (behavior == EXTENSION_RETURN_NULL &&
+				FILE_POSSIBLY_DELETED(errno))
 				return NULL;
 			ereport(ERROR,
 					(errcode_for_file_access(),
@ -834,94 +882,126 @@ mdimmedsync(SMgrRelation reln)
 void
 mdsync(void)
 {
-	HASH_SEQ_STATUS hstat;
-	PendingOperationEntry *entry;
-	int			absorb_counter;
+	bool		need_retry;

 	if (!pendingOpsTable)
 		elog(ERROR, "cannot sync without a pendingOpsTable");

 	/*
-	 * If we are in the bgwriter, the sync had better include all fsync
-	 * requests that were queued by backends before the checkpoint REDO point
-	 * was determined.	We go that a little better by accepting all requests
-	 * queued up to the point where we start fsync'ing.
+	 * The fsync table could contain requests to fsync relations that have
+	 * been deleted (unlinked) by the time we get to them.  Rather than
+	 * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
+	 * what we will do is retry the whole process after absorbing fsync
+	 * request messages again.  Since mdunlink() queues a "revoke" message
+	 * before actually unlinking, the fsync request is guaranteed to be gone
+	 * the second time if it really was this case.
 	 */
-	AbsorbFsyncRequests();
+	do {
+		HASH_SEQ_STATUS hstat;
+		PendingOperationEntry *entry;
+		int			absorb_counter;
+
+		need_retry = false;

-	absorb_counter = FSYNCS_PER_ABSORB;
-	hash_seq_init(&hstat, pendingOpsTable);
-	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-	{
 		/*
-		 * If fsync is off then we don't have to bother opening the file at
-		 * all.  (We delay checking until this point so that changing fsync on
-		 * the fly behaves sensibly.)
+		 * If we are in the bgwriter, the sync had better include all fsync
+		 * requests that were queued by backends before the checkpoint REDO
+		 * point was determined. We go that a little better by accepting all
+		 * requests queued up to the point where we start fsync'ing.
 		 */
-		if (enableFsync)
-		{
-			SMgrRelation reln;
-			MdfdVec    *seg;
+		AbsorbFsyncRequests();

+		absorb_counter = FSYNCS_PER_ABSORB;
+		hash_seq_init(&hstat, pendingOpsTable);
+		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+		{
 			/*
-			 * If in bgwriter, absorb pending requests every so often to
-			 * prevent overflow of the fsync request queue.  The hashtable
-			 * code does not specify whether entries added by this will be
-			 * visited by our search, but we don't really care: it's OK if we
-			 * do, and OK if we don't.
+			 * If fsync is off then we don't have to bother opening the file
+			 * at all.  (We delay checking until this point so that changing
+			 * fsync on the fly behaves sensibly.)
 			 */
-			if (--absorb_counter <= 0)
+			if (enableFsync)
 			{
-				AbsorbFsyncRequests();
-				absorb_counter = FSYNCS_PER_ABSORB;
-			}
-
-			/*
-			 * Find or create an smgr hash entry for this relation. This may
-			 * seem a bit unclean -- md calling smgr?  But it's really the
-			 * best solution.  It ensures that the open file reference isn't
-			 * permanently leaked if we get an error here. (You may say "but
-			 * an unreferenced SMgrRelation is still a leak!" Not really,
-			 * because the only case in which a checkpoint is done by a
-			 * process that isn't about to shut down is in the bgwriter, and
-			 * it will periodically do smgrcloseall().	This fact justifies
-			 * our not closing the reln in the success path either, which is a
-			 * good thing since in non-bgwriter cases we couldn't safely do
-			 * that.)  Furthermore, in many cases the relation will have been
-			 * dirtied through this same smgr relation, and so we can save a
-			 * file open/close cycle.
-			 */
-			reln = smgropen(entry->rnode);
+				SMgrRelation reln;
+				MdfdVec    *seg;
+
+				/*
+				 * If in bgwriter, we want to absorb pending requests every so
+				 * often to prevent overflow of the fsync request queue.  This
+				 * could result in deleting the current entry out from under
+				 * our hashtable scan, so the procedure is to fall out of the
+				 * scan and start over from the top of the function.
+				 */
+				if (--absorb_counter <= 0)
+				{
+					need_retry = true;
+					break;
+				}

-			/*
-			 * It is possible that the relation has been dropped or truncated
-			 * since the fsync request was entered.  Therefore, we have to
-			 * allow file-not-found errors.  This applies both during
-			 * _mdfd_getseg() and during FileSync, since fd.c might have
-			 * closed the file behind our back.
-			 */
-			seg = _mdfd_getseg(reln,
-							   entry->segno * ((BlockNumber) RELSEG_SIZE),
-							   false, EXTENSION_RETURN_NULL);
-			if (seg)
-			{
-				if (FileSync(seg->mdfd_vfd) < 0 &&
-					errno != ENOENT)
-					ereport(ERROR,
-							(errcode_for_file_access(),
-							 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
-									entry->segno,
-									entry->rnode.spcNode,
-									entry->rnode.dbNode,
-									entry->rnode.relNode)));
+				/*
+				 * Find or create an smgr hash entry for this relation. This
+				 * may seem a bit unclean -- md calling smgr?  But it's really
+				 * the best solution.  It ensures that the open file reference
+				 * isn't permanently leaked if we get an error here. (You may
+				 * say "but an unreferenced SMgrRelation is still a leak!" Not
+				 * really, because the only case in which a checkpoint is done
+				 * by a process that isn't about to shut down is in the
+				 * bgwriter, and it will periodically do smgrcloseall(). This
+				 * fact justifies our not closing the reln in the success path
+				 * either, which is a good thing since in non-bgwriter cases
+				 * we couldn't safely do that.)  Furthermore, in many cases
+				 * the relation will have been dirtied through this same smgr
+				 * relation, and so we can save a file open/close cycle.
+				 */
+				reln = smgropen(entry->tag.rnode);
+
+				/*
+				 * It is possible that the relation has been dropped or
+				 * truncated since the fsync request was entered.  Therefore,
+				 * allow ENOENT, but only if we didn't fail once already on
+				 * this file.  This applies both during _mdfd_getseg() and
+				 * during FileSync, since fd.c might have closed the file
+				 * behind our back.
+				 */
+				seg = _mdfd_getseg(reln,
+								   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
+								   false, EXTENSION_RETURN_NULL);
+				if (seg == NULL ||
+					FileSync(seg->mdfd_vfd) < 0)
+				{
+					/*
+					 * XXX is there any point in allowing more than one try?
+					 * Don't see one at the moment, but easy to change the
+					 * test here if so.
+					 */
+					if (!FILE_POSSIBLY_DELETED(errno) ||
+						++(entry->failures) > 1)
+						ereport(ERROR,
+								(errcode_for_file_access(),
+								 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+										entry->tag.segno,
+										entry->tag.rnode.spcNode,
+										entry->tag.rnode.dbNode,
+										entry->tag.rnode.relNode)));
+					else
+						ereport(DEBUG1,
+								(errcode_for_file_access(),
+								 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
+										entry->tag.segno,
+										entry->tag.rnode.spcNode,
+										entry->tag.rnode.dbNode,
+										entry->tag.rnode.relNode)));
+					need_retry = true;
+					continue;	/* don't delete the hashtable entry */
+				}
 			}
-		}

-		/* Okay, delete this entry */
-		if (hash_search(pendingOpsTable, entry,
-						HASH_REMOVE, NULL) == NULL)
-			elog(ERROR, "pendingOpsTable corrupted");
-	}
+			/* Okay, delete this entry */
+			if (hash_search(pendingOpsTable, &entry->tag,
+							HASH_REMOVE, NULL) == NULL)
+				elog(ERROR, "pendingOpsTable corrupted");
+		}
+	} while (need_retry);
 }

 /*
@ -938,14 +1018,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 {
 	if (pendingOpsTable)
 	{
-		PendingOperationEntry entry;
-
-		/* ensure any pad bytes in the struct are zeroed */
-		MemSet(&entry, 0, sizeof(entry));
-		entry.rnode = reln->smgr_rnode;
-		entry.segno = seg->mdfd_segno;
-
-		(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+		/* push it into local pending-ops table */
+		RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
 	}
 	else
 	{
@ -968,20 +1042,55 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 *
 * We stuff the fsync request into the local hash table for execution
 * during the bgwriter's next checkpoint.
+ *
+ * segno == InvalidBlockNumber is a "revoke" request: remove any pending
+ * fsync requests for the whole relation.
 */
 void
 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 {
-	PendingOperationEntry entry;
-
 	Assert(pendingOpsTable);

-	/* ensure any pad bytes in the struct are zeroed */
-	MemSet(&entry, 0, sizeof(entry));
-	entry.rnode = rnode;
-	entry.segno = segno;
+	if (segno != InvalidBlockNumber)
+	{
+		/* Enter a request to fsync this segment */
+		PendingOperationTag key;
+		PendingOperationEntry *entry;
+		bool		found;
+
+		/* ensure any pad bytes in the hash key are zeroed */
+		MemSet(&key, 0, sizeof(key));
+		key.rnode = rnode;
+		key.segno = segno;
+
+		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
+													  &key,
+													  HASH_ENTER,
+													  &found);
+		if (!found)				/* new entry, so initialize it */
+			entry->failures = 0;
+	}
+	else
+	{
+		/*
+		 * Remove any pending requests for the entire relation.  (This is a
+		 * tad slow but it doesn't seem worth rethinking the table structure.)
+		 */
+		HASH_SEQ_STATUS hstat;
+		PendingOperationEntry *entry;

-	(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+		hash_seq_init(&hstat, pendingOpsTable);
+		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			if (RelFileNodeEquals(entry->tag.rnode, rnode))
+			{
+				/* Okay, delete this entry */
+				if (hash_search(pendingOpsTable, &entry->tag,
+								HASH_REMOVE, NULL) == NULL)
+					elog(ERROR, "pendingOpsTable corrupted");
+			}
+		}
+	}
 }

 /*
@ -1102,7 +1211,8 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
 			}
 			if (v->mdfd_chain == NULL)
 			{
-				if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
+				if (behavior == EXTENSION_RETURN_NULL &&
+					FILE_POSSIBLY_DELETED(errno))
 					return NULL;
 				ereport(ERROR,
 						(errcode_for_file_access(),