Allow Hot Standby to begin from a shutdown checkpoint.

Patch by Simon Riggs & me
15 years ago · 361bd1662e
parent ea9c103237
commit 361bd1662e
3 changed files with 224 additions and 62 deletions
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *		$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.59 2010/02/26 02:00:34 momjian Exp $
+ *		$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.60 2010/04/13 14:17:46 heikki Exp $
 *
 * NOTES
 *		Each global transaction is associated with a global transaction
@ -1718,6 +1718,89 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
 	return result;
 }

+/*
+ * StandbyRecoverPreparedTransactions
+ *
+ * Scan the pg_twophase directory and set up all the required information to
+ * allow standby queries to treat prepared transactions as still active.
+ * This is never called at the end of recovery - we use
+ * RecoverPreparedTransactions() at that point.
+ *
+ * Currently we simply call SubTransSetParent() for any subxids of prepared
+ * transactions. If overwriteOK is true, it's OK if some XIDs have already
+ * been marked in pg_subtrans.
+ *
+ * State files that are stale or fail validation are removed with a WARNING.
+ */
+void
+StandbyRecoverPreparedTransactions(bool overwriteOK)
+{
+	DIR		   *cldir;
+	struct dirent *clde;
+
+	cldir = AllocateDir(TWOPHASE_DIR);
+	while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
+	{
+		if (strlen(clde->d_name) == 8 &&
+			strspn(clde->d_name, "0123456789ABCDEF") == 8)
+		{
+			TransactionId xid;
+			char	   *buf;
+			TwoPhaseFileHeader *hdr;
+			TransactionId *subxids;
+			int			i;
+
+			xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
+
+			/* Already processed? */
+			if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+			{
+				ereport(WARNING,
+						(errmsg("removing stale two-phase state file \"%s\"",
+								clde->d_name)));
+				RemoveTwoPhaseFile(xid, true);
+				continue;
+			}
+
+			/* Read and validate file */
+			buf = ReadTwoPhaseFile(xid, true);
+			if (buf == NULL)
+			{
+				ereport(WARNING,
+					  (errmsg("removing corrupt two-phase state file \"%s\"",
+							  clde->d_name)));
+				RemoveTwoPhaseFile(xid, true);
+				continue;
+			}
+
+			/* Deconstruct header */
+			hdr = (TwoPhaseFileHeader *) buf;
+			if (!TransactionIdEquals(hdr->xid, xid))
+			{
+				ereport(WARNING,
+					  (errmsg("removing corrupt two-phase state file \"%s\"",
+							  clde->d_name)));
+				RemoveTwoPhaseFile(xid, true);
+				pfree(buf);
+				continue;
+			}
+
+			/* Subxids follow the header; all should follow the main XID */
+			subxids = (TransactionId *)
+				(buf + MAXALIGN(sizeof(TwoPhaseFileHeader)));
+			for (i = 0; i < hdr->nsubxacts; i++)
+			{
+				Assert(TransactionIdFollows(subxids[i], xid));
+				SubTransSetParent(xid, subxids[i], overwriteOK);
+			}
+
+			/* Done with this state file; don't leak its image */
+			pfree(buf);
+		}
+	}
+	FreeDir(cldir);
+}
+
 /*
 * RecoverPreparedTransactions
 *
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.393 2010/04/12 10:40:42 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.394 2010/04/13 14:17:46 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -496,6 +496,7 @@ static TimeLineID lastPageTLI = 0;
 static XLogRecPtr minRecoveryPoint;		/* local copy of
 										 * ControlFile->minRecoveryPoint */
 static bool updateMinRecoveryPoint = true;
+static bool reachedMinRecoveryPoint = false;

 static bool InRedo = false;

@ -551,6 +552,7 @@ static void ValidateXLOGDirectoryStructure(void);
 static void CleanupBackupHistory(void);
 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
+static void CheckRecoveryConsistency(void);
 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 static List *readTimeLineHistory(TimeLineID targetTLI);
@ -5591,7 +5593,6 @@ StartupXLOG(void)
 	uint32		freespace;
 	TransactionId oldestActiveXID;
 	bool		bgwriterLaunched = false;
-	bool		backendsAllowed = false;

 	/*
 	 * Read control file and check XLOG status looks valid.
@ -5838,6 +5839,8 @@ StartupXLOG(void)
 	if (InRecovery)
 	{
 		int			rmid;
+		/* use volatile pointer to prevent code rearrangement */
+		volatile XLogCtlData *xlogctl = XLogCtl;

 		/*
 		 * Update pg_control to show that we are recovering and to show the
@ -5930,6 +5933,33 @@ StartupXLOG(void)
 			StartupMultiXact();

 			ProcArrayInitRecoveryInfo(oldestActiveXID);
+
+			/*
+			 * If we're beginning at a shutdown checkpoint, we know that
+			 * nothing was running on the master at this point. So fake-up
+			 * an empty running-xacts record and use that here and now.
+			 * Recover additional standby state for prepared transactions.
+			 */
+			if (wasShutdown)
+			{
+				RunningTransactionsData running;
+
+				/*
+				 * Construct a RunningTransactions snapshot representing a shut
+				 * down server, with only prepared transactions still alive.
+				 * We're never overflowed at this point because all subxids
+				 * are listed with their parent prepared transactions.
+				 */
+				running.xcnt = nxids;
+				running.subxid_overflow = false;
+				running.nextXid = checkPoint.nextXid;
+				running.oldestRunningXid = oldestActiveXID;
+				running.xids = xids;
+
+				ProcArrayApplyRecoveryInfo(&running);
+
+				StandbyRecoverPreparedTransactions(false);
+			}
 		}

 		/* Initialize resource managers */
@ -5939,6 +5969,46 @@ StartupXLOG(void)
 				RmgrTable[rmid].rm_startup();
 		}

+		/*
+		 * Initialize shared replayEndRecPtr and recoveryLastRecPtr.
+		 *
+		 * This is slightly confusing if we're starting from an online
+		 * checkpoint; we've just read and replayed the checkpoint record,
+		 * but we're going to start replay from its redo pointer, which
+		 * precedes the location of the checkpoint record itself. So even
+		 * though the last record we've replayed is indeed ReadRecPtr, we
+		 * haven't replayed all the preceding records yet. That's OK for
+		 * the current use of these variables.
+		 */
+		SpinLockAcquire(&xlogctl->info_lck);
+		xlogctl->replayEndRecPtr = ReadRecPtr;
+		xlogctl->recoveryLastRecPtr = ReadRecPtr;
+		SpinLockRelease(&xlogctl->info_lck);
+
+		/*
+		 * Let postmaster know we've started redo now, so that it can
+		 * launch bgwriter to perform restartpoints.  We don't bother
+		 * during crash recovery as restartpoints can only be performed
+		 * during archive recovery.  And we'd like to keep crash recovery
+		 * simple, to avoid introducing bugs that could prevent you from
+		 * recovering after crash.
+		 *
+		 * After this point, we can no longer assume that we're the only
+		 * process in addition to postmaster!  Also, fsync requests are
+		 * subsequently to be handled by the bgwriter, not locally.
+		 */
+		if (InArchiveRecovery && IsUnderPostmaster)
+		{
+			SetForwardFsyncRequests();
+			SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
+			bgwriterLaunched = true;
+		}
+
+		/*
+		 * Allow read-only connections immediately if we're consistent already.
+		 */
+		CheckRecoveryConsistency();
+
 		/*
 		 * Find the first record that logically follows the checkpoint --- it
 		 * might physically precede it, though.
@ -5958,43 +6028,14 @@ StartupXLOG(void)
 		{
 			bool		recoveryContinue = true;
 			bool		recoveryApply = true;
-			bool		reachedMinRecoveryPoint = false;
 			ErrorContextCallback errcontext;

-			/* use volatile pointer to prevent code rearrangement */
-			volatile XLogCtlData *xlogctl = XLogCtl;
-
-			/* initialize shared replayEndRecPtr and recoveryLastRecPtr */
-			SpinLockAcquire(&xlogctl->info_lck);
-			xlogctl->replayEndRecPtr = ReadRecPtr;
-			xlogctl->recoveryLastRecPtr = ReadRecPtr;
-			SpinLockRelease(&xlogctl->info_lck);
-
 			InRedo = true;

 			ereport(LOG,
 					(errmsg("redo starts at %X/%X",
 							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));

-			/*
-			 * Let postmaster know we've started redo now, so that it can
-			 * launch bgwriter to perform restartpoints.  We don't bother
-			 * during crash recovery as restartpoints can only be performed
-			 * during archive recovery.  And we'd like to keep crash recovery
-			 * simple, to avoid introducing bugs that could you from
-			 * recovering after crash.
-			 *
-			 * After this point, we can no longer assume that we're the only
-			 * process in addition to postmaster!  Also, fsync requests are
-			 * subsequently to be handled by the bgwriter, not locally.
-			 */
-			if (InArchiveRecovery && IsUnderPostmaster)
-			{
-				SetForwardFsyncRequests();
-				SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
-				bgwriterLaunched = true;
-			}
-
 			/*
 			 * main redo apply loop
 			 */
@ -6024,32 +6065,8 @@ StartupXLOG(void)
 				/* Handle interrupt signals of startup process */
 				HandleStartupProcInterrupts();

-				/*
-				 * Have we passed our safe starting point?
-				 */
-				if (!reachedMinRecoveryPoint &&
-					XLByteLE(minRecoveryPoint, EndRecPtr) &&
-					XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
-				{
-					reachedMinRecoveryPoint = true;
-					ereport(LOG,
-						(errmsg("consistent recovery state reached at %X/%X",
-								EndRecPtr.xlogid, EndRecPtr.xrecoff)));
-				}
-
-				/*
-				 * Have we got a valid starting snapshot that will allow
-				 * queries to be run? If so, we can tell postmaster that the
-				 * database is consistent now, enabling connections.
-				 */
-				if (standbyState == STANDBY_SNAPSHOT_READY &&
-					!backendsAllowed &&
-					reachedMinRecoveryPoint &&
-					IsUnderPostmaster)
-				{
-					backendsAllowed = true;
-					SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
-				}
+				/* Allow read-only connections if we're consistent now */
+				CheckRecoveryConsistency();

 				/*
 				 * Have we reached our recovery target?
@ -6398,6 +6415,44 @@ StartupXLOG(void)
 	}
 }

+/*
+ * Check whether recovery has become consistent and, if it has and a valid
+ * standby snapshot is available, tell postmaster that read-only connections
+ * may be accepted. The signal is sent at most once per process.
+ */
+static void
+CheckRecoveryConsistency(void)
+{
+	static bool		backendsAllowed = false;
+
+	/*
+	 * Note the first time replay passes the safe starting point: EndRecPtr
+	 * is at or beyond minRecoveryPoint and backupStartPoint is invalid.
+	 */
+	if (!reachedMinRecoveryPoint)
+	{
+		if (XLByteLE(minRecoveryPoint, EndRecPtr) &&
+			XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
+		{
+			reachedMinRecoveryPoint = true;
+			ereport(LOG,
+					(errmsg("consistent recovery state reached at %X/%X",
+							EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+		}
+	}
+
+	/* Nothing more to do if already signalled or not yet consistent */
+	if (backendsAllowed || !reachedMinRecoveryPoint)
+		return;
+
+	/* Enable connections once a valid starting snapshot is available */
+	if (standbyState == STANDBY_SNAPSHOT_READY && IsUnderPostmaster)
+	{
+		backendsAllowed = true;
+		SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+	}
+}
+
 /*
 * Is the system still in recovery?
 *
@ -7657,13 +7712,36 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 		if (standbyState != STANDBY_DISABLED)
 			CheckRequiredParameterValues(checkPoint);

+		/*
+		 * If we see a shutdown checkpoint, we know that nothing was
+		 * running on the master at this point. So fake-up an empty
+		 * running-xacts record and use that here and now. Recover
+		 * additional standby state for prepared transactions.
+		 */
 		if (standbyState >= STANDBY_INITIALIZED)
 		{
+			TransactionId *xids;
+			int			nxids;
+			TransactionId oldestActiveXID;
+			RunningTransactionsData running;
+
+			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
+
 			/*
-			 * Remove stale transactions, if any.
+			 * Construct a RunningTransactions snapshot representing a shut
+			 * down server, with only prepared transactions still alive.
+			 * We're never overflowed at this point because all subxids
+			 * are listed with their parent prepared transactions.
 			 */
-			ExpireOldKnownAssignedTransactionIds(checkPoint.nextXid);
-			StandbyReleaseOldLocks(checkPoint.nextXid);
+			running.xcnt = nxids;
+			running.subxid_overflow = false;
+			running.nextXid = checkPoint.nextXid;
+			running.oldestRunningXid = oldestActiveXID;
+			running.xids = xids;
+
+			ProcArrayApplyRecoveryInfo(&running);
+
+			StandbyRecoverPreparedTransactions(true);
 		}

 		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
--- a/src/include/access/twophase.h
+++ b/src/include/access/twophase.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.14 2010/01/02 16:58:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.15 2010/04/13 14:17:46 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -44,6 +44,7 @@ extern bool StandbyTransactionIdIsPrepared(TransactionId xid);

 extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p,
 							int *nxids_p);
+extern void StandbyRecoverPreparedTransactions(bool overwriteOK);
 extern void RecoverPreparedTransactions(void);

 extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);