Prevent freshly-started backend from ignoring SIGUSR1, per race condition

observed by Inoue. Also, don't call ProcRemove() from postmaster if we have detected a backend crash --- too risky if shared memory is corrupted. It's not needed anyway, considering we are going to reinitialize shared memory and semaphores as soon as the last child is dead.
25 years ago · 39b547f430
parent e4d97cb70e
commit 39b547f430
2 changed files with 74 additions and 63 deletions
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@ -11,7 +11,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/postmaster/postmaster.c,v 1.200 2000/12/18 18:45:04 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/postmaster/postmaster.c,v 1.201 2000/12/20 21:51:52 tgl Exp $
 *
 * NOTES
 *
@ -180,7 +180,7 @@ static time_t	checkpointed = 0;

 static int	Shutdown = NoShutdown;

-static bool FatalError = false;
+static bool FatalError = false;	/* T if recovering from backend crash */

 /*
 * State for assigning random salts and cancel keys.
@ -649,7 +649,7 @@ PostmasterMain(int argc, char *argv[])
 	pqsignal(SIGTERM, pmdie);	/* wait for children and ShutdownDataBase */
 	pqsignal(SIGALRM, SIG_IGN); /* ignored */
 	pqsignal(SIGPIPE, SIG_IGN); /* ignored */
-	pqsignal(SIGUSR1, SIG_IGN); /* ignored */
+	pqsignal(SIGUSR1, pmdie);	/* currently ignored, but see note in pmdie */
 	pqsignal(SIGUSR2, pmdie);	/* send SIGUSR2, don't die */
 	pqsignal(SIGCHLD, reaper);	/* handle child termination */
 	pqsignal(SIGTTIN, SIG_IGN); /* ignored */
@ -1329,6 +1329,18 @@ pmdie(SIGNAL_ARGS)

 	switch (postgres_signal_arg)
 	{
+		case SIGUSR1:
+			/*
+			 * Currently the postmaster ignores SIGUSR1 (maybe it should
+			 * do something useful instead?)  But we must have some handler
+			 * installed for SIGUSR1, not just set it to SIG_IGN.  Else, a
+			 * freshly spawned backend would likewise have it set to SIG_IGN,
+			 * which would mean the backend would ignore any attempt to kill
+			 * it before it had gotten as far as setting up its own handler.
+			 */
+			errno = save_errno;
+			return;
+
 		case SIGUSR2:

 			/*
@ -1511,7 +1523,7 @@ reaper(SIGNAL_ARGS)
 				ExitPostmaster(1);
 			}
 			StartupPID = 0;
-			FatalError = false;
+			FatalError = false;	/* done with recovery */
 			if (Shutdown > NoShutdown)
 			{
 				if (ShutdownPID > 0)
@ -1539,12 +1551,7 @@ reaper(SIGNAL_ARGS)
 		/*
 		 * Wait for all children exit, then reset shmem and StartupDataBase.
 		 */
-		if (DLGetHead(BackendList))
-		{
-			errno = save_errno;
-			return;
-		}
-		if (StartupPID > 0 || ShutdownPID > 0)
+		if (DLGetHead(BackendList) || StartupPID > 0 || ShutdownPID > 0)
 		{
 			errno = save_errno;
 			return;
@ -1595,13 +1602,10 @@ CleanupProc(int pid,
 	Dlelem	   *curr,
 			   *next;
 	Backend    *bp;
-	int			sig;

 	if (DebugLvl)
-	{
 		fprintf(stderr, "%s: CleanupProc: pid %d exited with status %d\n",
 				progname, pid, exitstatus);
-	}

 	/*
 	 * If a backend dies in an ugly way (i.e. exit status not 0) then we
@ -1609,7 +1613,7 @@ CleanupProc(int pid,
 	 * we assume everything is hunky dory and simply remove the backend
 	 * from the active backend list.
 	 */
-	if (!exitstatus)
+	if (exitstatus == 0)
 	{
 		curr = DLGetHead(BackendList);
 		while (curr)
@ -1628,66 +1632,64 @@ CleanupProc(int pid,
 		if (pid == CheckPointPID)
 		{
 			CheckPointPID = 0;
-			checkpointed = time(NULL);
+			if (!FatalError)
+				checkpointed = time(NULL);
 		}
 		else
-			ProcRemove(pid);
+		{
+			/* Why is this done here, and not by the backend itself? */
+			if (!FatalError)
+				ProcRemove(pid);
+		}

 		return;
 	}

 	if (!FatalError)
 	{
+		/* Make log entry unless we did so already */
 		tnow = time(NULL);
 		fprintf(stderr, "Server process (pid %d) exited with status %d at %s"
 				"Terminating any active server processes...\n",
 				pid, exitstatus, ctime(&tnow));
 		fflush(stderr);
 	}
-	FatalError = true;
+
 	curr = DLGetHead(BackendList);
 	while (curr)
 	{
 		next = DLGetSucc(curr);
 		bp = (Backend *) DLE_VAL(curr);
-
-		/*
-		 * SIGUSR1 is the special signal that says exit without proc_exit
-		 * and let the user know what's going on. ProcSemaphoreKill()
-		 * cleans up the backends semaphore.  If SendStop is set (-s on
-		 * command line), then we send a SIGSTOP so that we can core dumps
-		 * from all backends by hand.
-		 */
-		sig = (SendStop) ? SIGSTOP : SIGUSR1;
 		if (bp->pid != pid)
 		{
-			if (DebugLvl)
-				fprintf(stderr, "%s: CleanupProc: sending %s to process %d\n",
-						progname,
-						(sig == SIGUSR1)
-						? "SIGUSR1" : "SIGSTOP",
-						bp->pid);
-			kill(bp->pid, sig);
+			/*
+			 * This backend is still alive.  Unless we did so already,
+			 * tell it to commit hara-kiri.
+			 *
+			 * SIGUSR1 is the special signal that says exit without proc_exit
+			 * and let the user know what's going on. But if SendStop is set
+			 * (-s on command line), then we send SIGSTOP instead, so that we
+			 * can get core dumps from all backends by hand.
+			 */
+			if (!FatalError)
+			{
+				if (DebugLvl)
+					fprintf(stderr, "%s: CleanupProc: sending %s to process %d\n",
+							progname,
+							(SendStop ? "SIGSTOP" : "SIGUSR1"),
+							bp->pid);
+				kill(bp->pid, (SendStop ? SIGSTOP : SIGUSR1));
+			}
 		}
 		else
 		{
-
 			/*
-			 * I don't like that we call ProcRemove() here, assuming that
-			 * shmem may be corrupted! But is there another way to free
-			 * backend semaphores? Actually, I believe that we need not in
-			 * per backend semaphore at all (we use them to wait on lock
-			 * only, couldn't we just sigpause?), so probably we'll remove
-			 * this call from here someday.  -- vadim 04-10-1999
+			 * Found entry for freshly-dead backend, so remove it.
+			 *
+			 * Don't call ProcRemove() here, since shmem may be corrupted!
+			 * We are going to reinitialize shmem and semaphores anyway
+			 * once all the children are dead, so no need for it.
 			 */
-			if (pid == CheckPointPID)
-			{
-				CheckPointPID = 0;
-				checkpointed = 0;
-			}
-			else
-				ProcRemove(pid);
-
 			DLRemove(curr);
 			free(bp);
 			DLFreeElem(curr);
@ -1695,6 +1697,13 @@ CleanupProc(int pid,
 		curr = next;
 	}

+	if (pid == CheckPointPID)
+	{
+		CheckPointPID = 0;
+		checkpointed = 0;
+	}
+
+	FatalError = true;
 }

 /*
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/tcop/postgres.c,v 1.197 2000/12/18 18:45:05 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/tcop/postgres.c,v 1.198 2000/12/20 21:51:52 tgl Exp $
 *
 * NOTES
 *	  this is the "main" module of the postgres backend and
@ -1462,21 +1462,12 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[], const cha
 	Assert(DataDir);

 	/*
-	 * 1. Set BlockSig and UnBlockSig masks. 2. Set up signal handlers. 3.
-	 * Allow only SIGUSR1 signal (we never block it) during
-	 * initialization.
+	 * Set up signal handlers and masks.
 	 *
-	 * Note that postmaster already blocked ALL signals to make us happy.
+	 * Note that postmaster blocked all signals before forking child process,
+	 * so there is no race condition whereby we might receive a signal before
+	 * we have set up the handler.
 	 */
-	pqinitmask();
-
-#ifdef HAVE_SIGPROCMASK
-	sigdelset(&BlockSig, SIGUSR1);
-#else
-	BlockSig &= ~(sigmask(SIGUSR1));
-#endif
-
-	PG_SETMASK(&BlockSig);		/* block everything except SIGUSR1 */

 	pqsignal(SIGHUP, SigHupHandler);	/* set flag to read config file */
 	pqsignal(SIGINT, QueryCancelHandler);		/* cancel current query */
@ -1499,6 +1490,17 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[], const cha
 	pqsignal(SIGTTOU, SIG_DFL);
 	pqsignal(SIGCONT, SIG_DFL);

+	pqinitmask();
+
+	/* We allow SIGUSR1 (quickdie) at all times */
+#ifdef HAVE_SIGPROCMASK
+	sigdelset(&BlockSig, SIGUSR1);
+#else
+	BlockSig &= ~(sigmask(SIGUSR1));
+#endif
+
+	PG_SETMASK(&BlockSig);		/* block everything except SIGUSR1 */
+

 	if (IsUnderPostmaster)
 	{
@ -1649,7 +1651,7 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[], const cha
 	if (!IsUnderPostmaster)
 	{
 		puts("\nPOSTGRES backend interactive interface ");
-		puts("$Revision: 1.197 $ $Date: 2000/12/18 18:45:05 $\n");
+		puts("$Revision: 1.198 $ $Date: 2000/12/20 21:51:52 $\n");
 	}

 	/*