@ -79,10 +79,11 @@
* and also sets stopSignaled = true to handle the race condition when the
* postmaster has not noticed the promotion yet and thus may end up restarting
* the slot sync worker . If stopSignaled is set , the worker will exit in such a
* case . Note that we don ' t need to reset this variable as after promotion the
* slot sync worker won ' t be restarted because the pmState changes to PM_RUN from
* PM_HOT_STANDBY and we don ' t support demoting primary without restarting the
* server . See MaybeStartSlotSyncWorker .
* case . The SQL function pg_sync_replication_slots ( ) will also error out if
* this flag is set . Note that we don ' t need to reset this variable as after
* promotion the slot sync worker won ' t be restarted because the pmState
* changes to PM_RUN from PM_HOT_STANDBY and we don ' t support demoting
* primary without restarting the server . See MaybeStartSlotSyncWorker .
*
* The ' syncing ' flag is needed to prevent concurrent slot syncs to avoid slot
* overwrites .
@ -92,9 +93,6 @@
* is expected ( e . g . , slot sync GUCs change ) , slot sync worker will reset
* last_start_time before exiting , so that postmaster can start the worker
* without waiting for SLOTSYNC_RESTART_INTERVAL_SEC .
*
* All the fields except ' syncing ' are used only by slotsync worker .
* ' syncing ' is used both by worker and SQL function pg_sync_replication_slots .
*/
typedef struct SlotSyncCtxStruct
{
@ -807,20 +805,6 @@ synchronize_slots(WalReceiverConn *wrconn)
" FROM pg_catalog.pg_replication_slots "
" WHERE failover and NOT temporary " ;
SpinLockAcquire ( & SlotSyncCtx - > mutex ) ;
if ( SlotSyncCtx - > syncing )
{
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
ereport ( ERROR ,
errcode ( ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ) ,
errmsg ( " cannot synchronize replication slots concurrently " ) ) ;
}
SlotSyncCtx - > syncing = true ;
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
syncing_slots = true ;
/* The syscache access in walrcv_exec() needs a transaction env. */
if ( ! IsTransactionState ( ) )
{
@ -937,12 +921,6 @@ synchronize_slots(WalReceiverConn *wrconn)
if ( started_tx )
CommitTransactionCommand ( ) ;
SpinLockAcquire ( & SlotSyncCtx - > mutex ) ;
SlotSyncCtx - > syncing = false ;
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
syncing_slots = false ;
return some_slot_updated ;
}
@ -1190,6 +1168,19 @@ ProcessSlotSyncInterrupts(WalReceiverConn *wrconn)
slotsync_reread_config ( ) ;
}
/*
* Connection cleanup function for slotsync worker .
*
* Called on slotsync worker exit .
*/
static void
slotsync_worker_disconnect ( int code , Datum arg )
{
WalReceiverConn * wrconn = ( WalReceiverConn * ) DatumGetPointer ( arg ) ;
walrcv_disconnect ( wrconn ) ;
}
/*
* Cleanup function for slotsync worker .
*
@ -1198,8 +1189,38 @@ ProcessSlotSyncInterrupts(WalReceiverConn *wrconn)
static void
slotsync_worker_onexit ( int code , Datum arg )
{
/*
* We need to do slots cleanup here just like WalSndErrorCleanup ( ) does .
*
* The startup process during promotion invokes ShutDownSlotSync ( ) which
* waits for slot sync to finish and it does that by checking the
* ' syncing ' flag . Thus the slot sync worker must be done with slots '
* release and cleanup to avoid any dangling temporary slots or active
* slots before it marks itself as finished syncing .
*/
/* Make sure active replication slots are released */
if ( MyReplicationSlot ! = NULL )
ReplicationSlotRelease ( ) ;
/* Also cleanup the temporary slots. */
ReplicationSlotCleanup ( false ) ;
SpinLockAcquire ( & SlotSyncCtx - > mutex ) ;
SlotSyncCtx - > pid = InvalidPid ;
/*
* If syncing_slots is true , it indicates that the process errored out
* without resetting the flag . So , we need to clean up shared memory and
* reset the flag here .
*/
if ( syncing_slots )
{
SlotSyncCtx - > syncing = false ;
syncing_slots = false ;
}
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
}
@ -1242,6 +1263,64 @@ wait_for_slot_activity(bool some_slot_updated)
ResetLatch ( MyLatch ) ;
}
/*
* Emit an error if a promotion or a concurrent sync call is in progress .
* Otherwise , advertise that a sync is in progress .
*/
static void
check_and_set_sync_info ( pid_t worker_pid )
{
SpinLockAcquire ( & SlotSyncCtx - > mutex ) ;
/* The worker pid must not be already assigned in SlotSyncCtx */
Assert ( worker_pid = = InvalidPid | | SlotSyncCtx - > pid = = InvalidPid ) ;
/*
* Emit an error if startup process signaled the slot sync machinery to
* stop . See comments atop SlotSyncCtxStruct .
*/
if ( SlotSyncCtx - > stopSignaled )
{
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
ereport ( ERROR ,
errcode ( ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ) ,
errmsg ( " cannot synchronize replication slots when standby promotion is ongoing " ) ) ;
}
if ( SlotSyncCtx - > syncing )
{
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
ereport ( ERROR ,
errcode ( ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ) ,
errmsg ( " cannot synchronize replication slots concurrently " ) ) ;
}
SlotSyncCtx - > syncing = true ;
/*
* Advertise the required PID so that the startup process can kill the
* slot sync worker on promotion .
*/
SlotSyncCtx - > pid = worker_pid ;
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
syncing_slots = true ;
}
/*
* Reset syncing flag .
*/
static void
reset_syncing_flag ( )
{
SpinLockAcquire ( & SlotSyncCtx - > mutex ) ;
SlotSyncCtx - > syncing = false ;
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
syncing_slots = false ;
} ;
/*
* The main loop of our worker process .
*
@ -1278,47 +1357,6 @@ ReplSlotSyncWorkerMain(char *startup_data, size_t startup_data_len)
Assert ( SlotSyncCtx ! = NULL ) ;
SpinLockAcquire ( & SlotSyncCtx - > mutex ) ;
Assert ( SlotSyncCtx - > pid = = InvalidPid ) ;
/*
* Startup process signaled the slot sync worker to stop , so if meanwhile
* postmaster ended up starting the worker again , exit .
*/
if ( SlotSyncCtx - > stopSignaled )
{
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
proc_exit ( 0 ) ;
}
/* Advertise our PID so that the startup process can kill us on promotion */
SlotSyncCtx - > pid = MyProcPid ;
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
ereport ( LOG , errmsg ( " slot sync worker started " ) ) ;
/* Register it as soon as SlotSyncCtx->pid is initialized. */
before_shmem_exit ( slotsync_worker_onexit , ( Datum ) 0 ) ;
/* Setup signal handling */
pqsignal ( SIGHUP , SignalHandlerForConfigReload ) ;
pqsignal ( SIGINT , SignalHandlerForShutdownRequest ) ;
pqsignal ( SIGTERM , die ) ;
pqsignal ( SIGFPE , FloatExceptionHandler ) ;
pqsignal ( SIGUSR1 , procsignal_sigusr1_handler ) ;
pqsignal ( SIGUSR2 , SIG_IGN ) ;
pqsignal ( SIGPIPE , SIG_IGN ) ;
pqsignal ( SIGCHLD , SIG_DFL ) ;
/*
* Establishes SIGALRM handler and initialize timeout module . It is needed
* by InitPostgres to register different timeouts .
*/
InitializeTimeouts ( ) ;
/* Load the libpq-specific functions */
load_file ( " libpqwalreceiver " , false ) ;
/*
* If an exception is encountered , processing resumes here .
*
@ -1350,6 +1388,32 @@ ReplSlotSyncWorkerMain(char *startup_data, size_t startup_data_len)
/* We can now handle ereport(ERROR) */
PG_exception_stack = & local_sigjmp_buf ;
/* Setup signal handling */
pqsignal ( SIGHUP , SignalHandlerForConfigReload ) ;
pqsignal ( SIGINT , SignalHandlerForShutdownRequest ) ;
pqsignal ( SIGTERM , die ) ;
pqsignal ( SIGFPE , FloatExceptionHandler ) ;
pqsignal ( SIGUSR1 , procsignal_sigusr1_handler ) ;
pqsignal ( SIGUSR2 , SIG_IGN ) ;
pqsignal ( SIGPIPE , SIG_IGN ) ;
pqsignal ( SIGCHLD , SIG_DFL ) ;
check_and_set_sync_info ( MyProcPid ) ;
ereport ( LOG , errmsg ( " slot sync worker started " ) ) ;
/* Register it as soon as SlotSyncCtx->pid is initialized. */
before_shmem_exit ( slotsync_worker_onexit , ( Datum ) 0 ) ;
/*
* Establishes SIGALRM handler and initialize timeout module . It is needed
* by InitPostgres to register different timeouts .
*/
InitializeTimeouts ( ) ;
/* Load the libpq-specific functions */
load_file ( " libpqwalreceiver " , false ) ;
/*
* Unblock signals ( they were blocked when the postmaster forked us )
*/
@ -1402,13 +1466,13 @@ ReplSlotSyncWorkerMain(char *startup_data, size_t startup_data_len)
errmsg ( " could not connect to the primary server: %s " , err ) ) ;
/*
* Register the failure callback once we have the connection .
* Register the disconnection callback .
*
* XXX : This can be combined with previous such cleanup registration of
* XXX : This can be combined with previous cleanup registration of
* slotsync_worker_onexit ( ) but that will need the connection to be made
* global and we want to avoid introducing global for this purpose .
*/
before_shmem_exit ( slotsync_failure_callback , PointerGetDatum ( wrconn ) ) ;
before_shmem_exit ( slotsync_worker_disconnect , PointerGetDatum ( wrconn ) ) ;
/*
* Using the specified primary server connection , check that we are not a
@ -1457,8 +1521,8 @@ update_synced_slots_inactive_since(void)
if ( ! StandbyMode )
return ;
/* The slot sync worker mustn't be running by now */
Assert ( SlotSyncCtx - > pid = = InvalidPid ) ;
/* The slot sync worker or SQL function mustn't be running by now */
Assert ( ( SlotSyncCtx - > pid = = InvalidPid ) & & ! SlotSyncCtx - > syncing ) ;
LWLockAcquire ( ReplicationSlotControlLock , LW_SHARED ) ;
@ -1471,6 +1535,9 @@ update_synced_slots_inactive_since(void)
{
Assert ( SlotIsLogical ( s ) ) ;
/* The slot must not be acquired by any process */
Assert ( s - > active_pid = = 0 ) ;
/* Use the same inactive_since time for all the slots. */
if ( now = = 0 )
now = GetCurrentTimestamp ( ) ;
@ -1486,25 +1553,39 @@ update_synced_slots_inactive_since(void)
/*
* Shut down the slot sync worker .
*
* This function sends signal to shutdown slot sync worker , if required . It
* also waits till the slot sync worker has exited or
* pg_sync_replication_slots ( ) has finished .
*/
void
ShutDownSlotSync ( void )
{
pid_t worker_pid ;
SpinLockAcquire ( & SlotSyncCtx - > mutex ) ;
SlotSyncCtx - > stopSignaled = true ;
if ( SlotSyncCtx - > pid = = InvalidPid )
/*
* Return if neither the slot sync worker is running nor the function
* pg_sync_replication_slots ( ) is executing .
*/
if ( ! SlotSyncCtx - > syncing )
{
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
update_synced_slots_inactive_since ( ) ;
return ;
}
worker_pid = SlotSyncCtx - > pid ;
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
kill ( SlotSyncCtx - > pid , SIGINT ) ;
if ( worker_pid ! = InvalidPid )
kill ( worker_pid , SIGINT ) ;
/* Wait for it to die */
/* Wait for slot sync to end */
for ( ; ; )
{
int rc ;
@ -1522,8 +1603,8 @@ ShutDownSlotSync(void)
SpinLockAcquire ( & SlotSyncCtx - > mutex ) ;
/* Is it gone? */
if ( SlotSyncCtx - > pid = = InvalidPid )
/* Ensure that no process is syncing the slots. */
if ( ! SlotSyncCtx - > syncing )
break ;
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
@ -1601,26 +1682,37 @@ SlotSyncShmemInit(void)
}
/*
* Error cleanup callback for slot synchroniza tion .
* Error cleanup callback for slot sync SQL func tion .
*/
static void
slotsync_failure_callback ( int code , Datum arg )
{
WalReceiverConn * wrconn = ( WalReceiverConn * ) DatumGetPointer ( arg ) ;
if ( syncing_slots )
{
/*
* If syncing_slots is true , it indicates that the process errored out
* without resetting the flag . So , we need to clean up shared memory
* and reset the flag here .
*/
SpinLockAcquire ( & SlotSyncCtx - > mutex ) ;
SlotSyncCtx - > syncing = false ;
SpinLockRelease ( & SlotSyncCtx - > mutex ) ;
/*
* We need to do slots cleanup here just like WalSndErrorCleanup ( ) does .
*
* The startup process during promotion invokes ShutDownSlotSync ( ) which
* waits for slot sync to finish and it does that by checking the
* ' syncing ' flag . Thus the SQL function must be done with slots ' release
* and cleanup to avoid any dangling temporary slots or active slots
* before it marks itself as finished syncing .
*/
syncing_slots = false ;
}
/* Make sure active replication slots are released */
if ( MyReplicationSlot ! = NULL )
ReplicationSlotRelease ( ) ;
/* Also cleanup the synced temporary slots. */
ReplicationSlotCleanup ( true ) ;
/*
* The set syncing_slots indicates that the process errored out without
* resetting the flag . So , we need to clean up shared memory and reset the
* flag here .
*/
if ( syncing_slots )
reset_syncing_flag ( ) ;
walrcv_disconnect ( wrconn ) ;
}
@ -1634,9 +1726,17 @@ SyncReplicationSlots(WalReceiverConn *wrconn)
{
PG_ENSURE_ERROR_CLEANUP ( slotsync_failure_callback , PointerGetDatum ( wrconn ) ) ;
{
check_and_set_sync_info ( InvalidPid ) ;
validate_remote_info ( wrconn ) ;
synchronize_slots ( wrconn ) ;
/* Cleanup the synced temporary slots */
ReplicationSlotCleanup ( true ) ;
/* We are done with sync, so reset sync flag */
reset_syncing_flag ( ) ;
}
PG_END_ENSURE_ERROR_CLEANUP ( slotsync_failure_callback , PointerGetDatum ( wrconn ) ) ;
}