Restart bgworkers immediately after a crash-and-restart cycle.

Just as we would start bgworkers immediately after an initial startup of the server, we should restart them immediately when reinitializing.

Petr Jelinek and Robert Haas
12 years ago · 970d1f76d1
parent 364ddc3e5c
commit 970d1f76d1
3 changed files with 33 additions and 9 deletions
--- a/src/backend/postmaster/bgworker.c
+++ b/src/backend/postmaster/bgworker.c
@@ -394,6 +394,27 @@ BackgroundWorkerStopNotifications(pid_t pid)
 	}
 }

+/*
+ * Reset background worker crash state.
+ *
+ * Clear rw_crashed_at for every entry in BackgroundWorkerList.  We assume
+ * that, after a crash-and-restart cycle, background workers should be
+ * restarted immediately, instead of waiting for bgw_restart_time to elapse.
+ */
+void
+ResetBackgroundWorkerCrashTimes(void)
+{
+	slist_mutable_iter	iter;
+
+	slist_foreach_modify(iter, &BackgroundWorkerList)
+	{
+		RegisteredBgWorker *rw;
+
+		rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur);
+		rw->rw_crashed_at = 0;
+	}
+}
+
 #ifdef EXEC_BACKEND
 /*
 * In EXEC_BACKEND mode, workers use this to retrieve their details from
@@ -478,13 +499,14 @@ bgworker_quickdie(SIGNAL_ARGS)
 	on_exit_reset();

 	/*
-	 * Note we do exit(0) here, not exit(2) like quickdie.  The reason is that
-	 * we don't want to be seen this worker as independently crashed, because
-	 * then postmaster would delay restarting it again afterwards.  If some
-	 * idiot DBA manually sends SIGQUIT to a random bgworker, the "dead man
-	 * switch" will ensure that postmaster sees this as a crash.
+	 * Note we do exit(2) not exit(0).  This is to force the postmaster into a
+	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+	 * backend.  This is necessary precisely because we don't clean up our
+	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
+	 * should ensure the postmaster sees this as a crash, too, but no harm in
+	 * being doubly sure.)
 	 */
-	exit(0);
+	exit(2);
 }

 /*
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2616,7 +2616,7 @@ reaper(SIGNAL_ARGS)
 			if (PgStatPID == 0)
 				PgStatPID = pgstat_start();

-			/* some workers may be scheduled to start now */
+			/* workers may be scheduled to start now */
 			maybe_start_bgworker();

 			/* at this point we are really open for business */
@@ -2860,7 +2860,6 @@ CleanupBackgroundWorker(int pid,
 		{
 			if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
 			{
-				rw->rw_crashed_at = GetCurrentTimestamp();
 				HandleChildCrash(pid, exitstatus, namebuf);
 				return true;
 			}
@@ -2871,7 +2870,6 @@ CleanupBackgroundWorker(int pid,
 				 * Uh-oh, the child failed to clean itself up.  Treat as a
 				 * crash after all.
 				 */
-				rw->rw_crashed_at = GetCurrentTimestamp();
 				HandleChildCrash(pid, exitstatus, namebuf);
 				return true;
 			}
@@ -3546,6 +3544,9 @@ PostmasterStateMachine(void)
 		ereport(LOG,
 				(errmsg("all server processes terminated; reinitializing")));

+		/* allow background workers to immediately restart */
+		ResetBackgroundWorkerCrashTimes();
+
 		shmem_exit(1);
 		reset_shared(PostPortNumber);

--- a/src/include/postmaster/bgworker_internals.h
+++ b/src/include/postmaster/bgworker_internals.h
@@ -43,6 +43,7 @@ extern void BackgroundWorkerStateChange(void);
 extern void ForgetBackgroundWorker(slist_mutable_iter *cur);
 extern void ReportBackgroundWorkerPID(RegisteredBgWorker *);
 extern void BackgroundWorkerStopNotifications(pid_t pid);
+extern void ResetBackgroundWorkerCrashTimes(void);

 /* Function to start a background worker, called from postmaster.c */
 extern void StartBackgroundWorker(void);