Fast promote mode skips checkpoint at end of recovery.

pg_ctl promote -m fast will skip the checkpoint at end of recovery so that we can achieve very fast failover when the apply delay is low. Write a new WAL record, XLOG_END_OF_RECOVERY, to allow us to switch timeline correctly for downstream log readers. If we skip the synchronous end-of-recovery checkpoint, we request a normal spread checkpoint so that the window of re-recovery is low. Simon Riggs and Kyotaro Horiguchi, with input from Fujii Masao. Reviewed by Heikki Linnakangas.
13 years ago · fd4ced5230
parent ee22c55f5a
commit fd4ced5230
5 changed files with 195 additions and 32 deletions
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@ -18,6 +18,7 @@
 #include "access/xlog_internal.h"
 #include "catalog/pg_control.h"
 #include "utils/guc.h"
+#include "utils/timestamp.h"

 /*
 * GUC support
@ -119,6 +120,15 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
 		memcpy(&fpw, rec, sizeof(bool));
 		appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false");
 	}
+	else if (info == XLOG_END_OF_RECOVERY)
+	{
+		xl_end_of_recovery xlrec;
+
+		memcpy(&xlrec, rec, sizeof(xl_end_of_recovery));
+		appendStringInfo(buf, "end_of_recovery: tli %u; time %s",
+						 xlrec.ThisTimeLineID,
+						 timestamptz_to_str(xlrec.end_time));
+	}
 	else
 		appendStringInfo(buf, "UNKNOWN");
 }
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@ -66,6 +66,7 @@
 #define RECOVERY_COMMAND_FILE	"recovery.conf"
 #define RECOVERY_COMMAND_DONE	"recovery.done"
 #define PROMOTE_SIGNAL_FILE "promote"
+#define FAST_PROMOTE_SIGNAL_FILE "fast_promote"


 /* User-settable parameters */
@ -210,6 +211,9 @@ bool StandbyMode = false;
 static char *PrimaryConnInfo = NULL;
 static char *TriggerFile = NULL;

+/* whether request for fast promotion has been made yet */
+static bool fast_promote = false;
+
 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
 static TransactionId recoveryStopXid;
 static TimestampTz recoveryStopTime;
@ -611,6 +615,7 @@ static void CheckRequiredParameterValues(void);
 static void XLogReportParameters(void);
 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI);
 static void LocalSetXLogInsertAllowed(void);
+static void CreateEndOfRecoveryRecord(void);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);

@ -642,7 +647,7 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 		   int emode, bool fetching_ckpt);
 static void CheckRecoveryConsistency(void);
 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
-					 XLogRecPtr RecPtr, int whichChkpt);
+					 XLogRecPtr RecPtr, int whichChkpt, bool report);
 static bool rescanLatestTimeLine(void);
 static void WriteControlFile(void);
 static void ReadControlFile(void);
@ -4848,7 +4853,7 @@ StartupXLOG(void)
 		 * When a backup_label file is present, we want to roll forward from
 		 * the checkpoint it identifies, rather than using pg_control.
 		 */
-		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0);
+		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
 		if (record != NULL)
 		{
 			memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
@ -4890,7 +4895,7 @@ StartupXLOG(void)
 		 */
 		checkPointLoc = ControlFile->checkPoint;
 		RedoStartLSN = ControlFile->checkPointCopy.redo;
-		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1);
+		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
 		if (record != NULL)
 		{
 			ereport(DEBUG1,
@ -4909,7 +4914,7 @@ StartupXLOG(void)
 		else
 		{
 			checkPointLoc = ControlFile->prevCheckPoint;
-			record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2);
+			record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
 			if (record != NULL)
 			{
 				ereport(LOG,
@ -5393,22 +5398,33 @@ StartupXLOG(void)
 				}

 				/*
-				 * Before replaying this record, check if it is a shutdown
-				 * checkpoint record that causes the current timeline to
-				 * change. The checkpoint record is already considered to be
-				 * part of the new timeline, so we update ThisTimeLineID
-				 * before replaying it. That's important so that replayEndTLI,
-				 * which is recorded as the minimum recovery point's TLI if
+				 * Before replaying this record, check if this record
+				 * causes the current timeline to change. The record is
+				 * already considered to be part of the new timeline,
+				 * so we update ThisTimeLineID before replaying it.
+				 * That's important so that replayEndTLI, which is
+				 * recorded as the minimum recovery point's TLI if
 				 * recovery stops after this record, is set correctly.
 				 */
-				if (record->xl_rmid == RM_XLOG_ID &&
-					(record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN)
+				if (record->xl_rmid == RM_XLOG_ID)
 				{
-					CheckPoint	checkPoint;
-					TimeLineID	newTLI;
+					TimeLineID	newTLI = ThisTimeLineID;
+					uint8		info = record->xl_info & ~XLR_INFO_MASK;
+
+					if (info == XLOG_CHECKPOINT_SHUTDOWN)
+					{
+						CheckPoint	checkPoint;
+
+						memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+						newTLI = checkPoint.ThisTimeLineID;
+					}
+					else if (info == XLOG_END_OF_RECOVERY)
+					{
+						xl_end_of_recovery	xlrec;

-					memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
-					newTLI = checkPoint.ThisTimeLineID;
+						memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+						newTLI = xlrec.ThisTimeLineID;
+					}

 					if (newTLI != ThisTimeLineID)
 					{
@ -5729,9 +5745,36 @@ StartupXLOG(void)
 		 * allows some extra error checking in xlog_redo.
 		 */
 		if (bgwriterLaunched)
-			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
-							  CHECKPOINT_IMMEDIATE |
-							  CHECKPOINT_WAIT);
+		{
+			bool	checkpoint_wait = true;
+
+			/*
+			 * If we've been explicitly promoted with the fast option,
+			 * end recovery without a checkpoint if possible.
+			 */
+			if (fast_promote)
+			{
+				checkPointLoc = ControlFile->prevCheckPoint;
+				record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, false);
+				if (record != NULL)
+				{
+					checkpoint_wait = false;
+					CreateEndOfRecoveryRecord();
+				}
+			}
+
+			/*
+			 * In most cases we will wait for a full checkpoint to complete.
+			 *
+			 * If not, issue a normal, non-immediate checkpoint but don't wait.
+			 */
+			if (checkpoint_wait)
+				RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
+									CHECKPOINT_IMMEDIATE |
+									CHECKPOINT_WAIT);
+			else
+				RequestCheckpoint(0);	/* No flags */
+		}
 		else
 			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);

@ -6060,12 +6103,15 @@ LocalSetXLogInsertAllowed(void)
 */
 static XLogRecord *
 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
-					 int whichChkpt)
+					 int whichChkpt, bool report)
 {
 	XLogRecord *record;

 	if (!XRecOffIsValid(RecPtr))
 	{
+		if (!report)
+			return NULL;
+
 		switch (whichChkpt)
 		{
 			case 1:
@ -6088,6 +6134,9 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,

 	if (record == NULL)
 	{
+		if (!report)
+			return NULL;
+
 		switch (whichChkpt)
 		{
 			case 1:
@ -6882,6 +6931,44 @@ CreateCheckPoint(int flags)
 	LWLockRelease(CheckpointLock);
 }

+/*
+ * Write an XLOG_END_OF_RECOVERY record to mark the end of recovery in WAL
+ * without running a full checkpoint.  The record carries the current time
+ * and ThisTimeLineID so that WAL readers can switch timeline at this point.
+ *
+ * A restartpoint is likely to be in progress as we do this, and we are
+ * unwilling to wait for it, so avoid taking the CheckpointLock anywhere
+ * here.  CreateRestartPoint() allows for recovery ending before it completes.
+ */
+static void
+CreateEndOfRecoveryRecord(void)
+{
+	xl_end_of_recovery	xlrec;
+	XLogRecData			rdata;
+
+	/* sanity check */
+	if (!RecoveryInProgress())
+		elog(ERROR, "can only be used to end recovery");
+
+	xlrec.end_time = GetCurrentTimestamp();	/* TimestampTz, not time_t */
+	xlrec.ThisTimeLineID = ThisTimeLineID;
+
+	LocalSetXLogInsertAllowed();
+
+	START_CRIT_SECTION();
+
+	rdata.data = (char *) &xlrec;
+	rdata.len = sizeof(xl_end_of_recovery);
+	rdata.buffer = InvalidBuffer;
+	rdata.next = NULL;
+
+	(void) XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
+
+	END_CRIT_SECTION();
+
+	LocalXLogInsertAllowed = -1;		/* return to "check" state */
+}
+
 /*
 * Flush all data in shared memory to disk, and fsync
 *
@ -7613,6 +7700,27 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)

 		RecoveryRestartPoint(&checkPoint);
 	}
+	else if (info == XLOG_END_OF_RECOVERY)
+	{
+		xl_end_of_recovery xlrec;
+
+		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+
+		/*
+		 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
+		 * but this case is rarer and harder to test, so the benefit doesn't
+		 * outweigh the potential extra cost of maintenance.
+		 */
+
+		/*
+		 * We should've already switched to the new TLI before replaying this
+		 * record.
+		 */
+		if (xlrec.ThisTimeLineID != ThisTimeLineID)
+			ereport(PANIC,
+					(errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
+							xlrec.ThisTimeLineID, ThisTimeLineID)));
+	}
 	else if (info == XLOG_NOOP)
 	{
 		/* nothing to do here */
@ -9405,8 +9513,39 @@ CheckForStandbyTrigger(void)

 	if (IsPromoteTriggered())
 	{
-		ereport(LOG,
+		/*
+		 * In 9.1 and 9.2 the postmaster unlinked the promote file
+		 * inside the signal handler. We now leave the file in place
+		 * and let the Startup process do the unlink. This allows
+		 * Startup to know whether we're doing fast or normal
+		 * promotion. Fast promotion takes precedence.
+		 */
+		if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+		{
+			unlink(FAST_PROMOTE_SIGNAL_FILE);
+			unlink(PROMOTE_SIGNAL_FILE);
+			fast_promote = true;
+		}
+		else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+		{
+			unlink(PROMOTE_SIGNAL_FILE);
+			fast_promote = false;
+		}
+
+		/*
+		 * We only look for fast promote via the pg_ctl promote option.
+		 * It would be possible to extend trigger file support for the
+		 * fast promotion option but that wouldn't be backwards compatible
+		 * anyway and we're looking to focus further work on the promote
+		 * option as the right way to signal end of recovery.
+		 */
+		if (fast_promote)
+			ereport(LOG,
+				(errmsg("received fast promote request")));
+		else
+			ereport(LOG,
 				(errmsg("received promote request")));
+
 		ResetPromoteTriggered();
 		triggered = true;
 		return true;
@ -9435,15 +9574,10 @@ CheckPromoteSignal(void)
 {
 	struct stat stat_buf;

-	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
-	{
-		/*
-		 * Since we are in a signal handler, it's not safe to elog. We
-		 * silently ignore any error from unlink.
-		 */
-		unlink(PROMOTE_SIGNAL_FILE);
+	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
+		stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
 		return true;
-	}
+
 	return false;
 }

--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@ -1136,6 +1136,15 @@ do_promote(void)
 		exit(1);
 	}

+	/*
+	 * Use two different kinds of promotion file so we can understand
+	 * the difference between smart and fast promotion.
+	 */
+	if (shutdown_mode >= FAST_MODE)
+		snprintf(promote_file, MAXPGPATH, "%s/fast_promote", pg_data);
+	else
+		snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
+
 	if ((prmfile = fopen(promote_file, "w")) == NULL)
 	{
 		write_stderr(_("%s: could not create promote signal file \"%s\": %s\n"),
@ -1799,7 +1808,7 @@ do_help(void)
 			 "                 [-o \"OPTIONS\"]\n"), progname);
 	printf(_("  %s reload  [-D DATADIR] [-s]\n"), progname);
 	printf(_("  %s status  [-D DATADIR]\n"), progname);
-	printf(_("  %s promote [-D DATADIR] [-s]\n"), progname);
+	printf(_("  %s promote [-D DATADIR] [-s] [-m PROMOTION-MODE]\n"), progname);
 	printf(_("  %s kill    SIGNALNAME PID\n"), progname);
 #if defined(WIN32) || defined(__CYGWIN__)
 	printf(_("  %s register   [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
@ -1828,7 +1837,7 @@ do_help(void)
 	printf(_("  -o OPTIONS             command line options to pass to postgres\n"
 	 "                         (PostgreSQL server executable) or initdb\n"));
 	printf(_("  -p PATH-TO-POSTGRES    normally not necessary\n"));
-	printf(_("\nOptions for stop or restart:\n"));
+	printf(_("\nOptions for stop, restart or promote:\n"));
 	printf(_("  -m, --mode=MODE        MODE can be \"smart\", \"fast\", or \"immediate\"\n"));

 	printf(_("\nShutdown modes are:\n"));
@ -1836,6 +1845,10 @@ do_help(void)
 	printf(_("  fast        quit directly, with proper shutdown\n"));
 	printf(_("  immediate   quit without complete shutdown; will lead to recovery on restart\n"));

+	printf(_("\nPromotion modes are:\n"));
+	printf(_("  smart       promote after performing a checkpoint\n"));
+	printf(_("  fast        promote quickly without waiting for checkpoint completion\n"));
+
 	printf(_("\nAllowed signal names for kill:\n"));
 	printf("  ABRT HUP INT QUIT TERM USR1 USR2\n");

@ -2271,7 +2284,6 @@ main(int argc, char **argv)
 		snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", pg_data);
 		snprintf(backup_file, MAXPGPATH, "%s/backup_label", pg_data);
 		snprintf(recovery_file, MAXPGPATH, "%s/recovery.conf", pg_data);
-		snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
 	}

 	switch (ctl_command)
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@ -217,6 +217,12 @@ typedef struct xl_restore_point
 	char		rp_name[MAXFNAMELEN];
 } xl_restore_point;

+/* Payload of XLOG_END_OF_RECOVERY, written when the end-of-recovery checkpoint is skipped */
+typedef struct xl_end_of_recovery
+{
+	TimestampTz end_time;		/* time at which the record was created */
+	TimeLineID	ThisTimeLineID;	/* timeline this record belongs to; replay switches to it */
+} xl_end_of_recovery;

 /*
 * XLogRecord is defined in xlog.h, but we avoid #including that to keep
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@ -64,6 +64,7 @@ typedef struct CheckPoint
 #define XLOG_PARAMETER_CHANGE			0x60
 #define XLOG_RESTORE_POINT				0x70
 #define XLOG_FPW_CHANGE				0x80
+#define XLOG_END_OF_RECOVERY			0x90


 /*