From 01d485b142e4c89aef79da9fd60f203c1bd4458b Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 6 Mar 2026 12:37:40 +0900 Subject: [PATCH] Add system view pg_stat_recovery This commit introduces pg_stat_recovery, which exposes at SQL level the state of recovery as tracked by XLogRecoveryCtlData in shared memory, maintained by the startup process. This new view includes the following fields, which are useful for monitoring purposes on a standby, once it has reached a consistent state (making the execution of the SQL function possible): - Last-successfully replayed WAL record LSN boundaries and its timeline. - Currently replaying WAL record end LSN and its timeline. - Current WAL chunk start time. - Promotion trigger state. - Timestamp of latest processed commit/abort. - Recovery pause state. Some of this data can already be recovered from different system functions, but not all of it. See pg_get_wal_replay_pause_state or pg_last_xact_replay_timestamp. This new view offers a stronger consistency guarantee, by grabbing the recovery state for all fields through one spinlock acquisition. The system view relies on a new function, called pg_stat_get_recovery(). Querying this data requires the pg_read_all_stats privilege. The view returns no rows if the node is not in recovery. This feature originates from a suggestion I have made while discussing the addition of a CONNECTING state to the WAL receiver's shared memory state, because we lacked access to some of the state data. The author has taken the time to implement it, so thanks for that. Bump catalog version. 
Author: Xuneng Zhou Discussion: https://postgr.es/m/CABPTF7W+Nody-+P9y4PNk37-QWuLpfUrEonHuEhrX+Vx9Kq+Kw@mail.gmail.com Discussion: https://postgr.es/m/aW13GJn_RfTJIFCa@paquier.xyz --- doc/src/sgml/monitoring.sgml | 151 +++++++++++++++++++++++++ src/backend/access/transam/xlogfuncs.c | 94 +++++++++++++++ src/backend/catalog/system_views.sql | 14 +++ src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 7 ++ src/test/recovery/t/001_stream_rep.pl | 5 + src/test/regress/expected/rules.out | 11 ++ src/test/regress/expected/sysviews.out | 7 ++ src/test/regress/sql/sysviews.sql | 3 + 9 files changed, 293 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index dcf6e6a2f48..b3d53550688 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -338,6 +338,14 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser + + pg_stat_recoverypg_stat_recovery + Only one row, showing statistics about the state of recovery. + See + pg_stat_recovery for details. + + + pg_stat_recovery_prefetchpg_stat_recovery_prefetch Only one row, showing statistics about blocks prefetched during recovery. @@ -1912,6 +1920,149 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + <structname>pg_stat_recovery</structname> + + + pg_stat_recovery + + + + The pg_stat_recovery view will contain only + one row, showing statistics about the recovery state of the startup + process. This view returns no row when the server is not in recovery. + + + + <structname>pg_stat_recovery</structname> View + + + + + Column Type + + + Description + + + + + + + + promote_triggered boolean + + + True if a promotion has been triggered. + + + + + + last_replayed_read_lsn pg_lsn + + + Start write-ahead log location of the last successfully replayed + WAL record. + + + + + + last_replayed_end_lsn pg_lsn + + + End write-ahead log location of the last successfully replayed + WAL record. 
+ + + + + + last_replayed_tli integer + + + Timeline of the last successfully replayed WAL record. + + + + + + replay_end_lsn pg_lsn + + + Write-ahead log location of the record currently being replayed + (end position plus one). When no record is being actively replayed, + equals last_replayed_end_lsn. + + + + + + replay_end_tli integer + + + Timeline of the WAL record currently being replayed. + + + + + + recovery_last_xact_time timestamp with time zone + + + Timestamp of the last transaction commit or abort replayed during + recovery. This is the time at which the commit or abort WAL record + for that transaction was generated on the primary. + + + + + + current_chunk_start_time timestamp with time zone + + + Time when the startup process observed that replay had caught up + with the latest received WAL chunk. Used in recovery-conflict + timing and replay/apply-lag diagnostics. NULL if not yet + available. + + + + + + pause_state text + + + Recovery pause state. Possible values are: + + + + + not paused: Recovery is proceeding normally. + + + + + pause requested: A pause has been requested + but recovery has not yet paused. + + + + + paused: Recovery is paused. + + + + + + + + +
+ +
+ <structname>pg_stat_recovery_prefetch</structname>
diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
index 78543055895..7c0e430b690 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -22,10 +22,12 @@
 #include "access/xlog_internal.h"
 #include "access/xlogbackup.h"
 #include "access/xlogrecovery.h"
+#include "catalog/pg_authid.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "utils/acl.h"
 #include "replication/walreceiver.h"
 #include "storage/fd.h"
 #include "storage/latch.h"
@@ -763,3 +765,95 @@ pg_promote(PG_FUNCTION_ARGS)
 						   wait_seconds)));
 	PG_RETURN_BOOL(false);
 }
+
+/*
+ * pg_stat_get_recovery - returns information about WAL recovery state
+ *
+ * Returns NULL when not in recovery or when the caller lacks
+ * pg_read_all_stats privileges; one row otherwise.
+ */
+Datum
+pg_stat_get_recovery(PG_FUNCTION_ARGS)
+{
+	TupleDesc	tupdesc;
+	Datum	   *values;
+	bool	   *nulls;
+
+	/* Local copies of shared state */
+	bool		promote_triggered;
+	XLogRecPtr	last_replayed_read_lsn;
+	XLogRecPtr	last_replayed_end_lsn;
+	TimeLineID	last_replayed_tli;
+	XLogRecPtr	replay_end_lsn;
+	TimeLineID	replay_end_tli;
+	TimestampTz recovery_last_xact_time;
+	TimestampTz current_chunk_start_time;
+	RecoveryPauseState pause_state;
+
+	if (!RecoveryInProgress())
+		PG_RETURN_NULL();
+
+	if (!has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS))
+		PG_RETURN_NULL();
+
+	/* Take a lock to ensure value consistency */
+	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+	promote_triggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
+	last_replayed_read_lsn = XLogRecoveryCtl->lastReplayedReadRecPtr;
+	last_replayed_end_lsn = XLogRecoveryCtl->lastReplayedEndRecPtr;
+	last_replayed_tli = XLogRecoveryCtl->lastReplayedTLI;
+	replay_end_lsn = XLogRecoveryCtl->replayEndRecPtr;
+	replay_end_tli = XLogRecoveryCtl->replayEndTLI;
+	recovery_last_xact_time = XLogRecoveryCtl->recoveryLastXTime;
+	current_chunk_start_time = XLogRecoveryCtl->currentChunkStartTime;
+	pause_state = XLogRecoveryCtl->recoveryPauseState;
+	SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	values = palloc0_array(Datum, tupdesc->natts);
+	nulls = palloc0_array(bool, tupdesc->natts);
+
+	values[0] = BoolGetDatum(promote_triggered);
+
+	if (XLogRecPtrIsValid(last_replayed_read_lsn))
+		values[1] = LSNGetDatum(last_replayed_read_lsn);
+	else
+		nulls[1] = true;
+
+	if (XLogRecPtrIsValid(last_replayed_end_lsn))
+		values[2] = LSNGetDatum(last_replayed_end_lsn);
+	else
+		nulls[2] = true;
+
+	if (XLogRecPtrIsValid(last_replayed_end_lsn))
+		values[3] = Int32GetDatum(last_replayed_tli);
+	else
+		nulls[3] = true;
+
+	if (XLogRecPtrIsValid(replay_end_lsn))
+		values[4] = LSNGetDatum(replay_end_lsn);
+	else
+		nulls[4] = true;
+
+	if (XLogRecPtrIsValid(replay_end_lsn))
+		values[5] = Int32GetDatum(replay_end_tli);
+	else
+		nulls[5] = true;
+
+	/* Zero timestamps are reported as NULL; order must match proargnames */
+	if (recovery_last_xact_time != 0)
+		values[6] = TimestampTzGetDatum(recovery_last_xact_time);
+	else
+		nulls[6] = true;
+
+	if (current_chunk_start_time != 0)
+		values[7] = TimestampTzGetDatum(current_chunk_start_time);
+	else
+		nulls[7] = true;
+
+	values[8] = CStringGetTextDatum(GetRecoveryPauseStateString(pause_state));
+
+	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index e5c3e1855c1..2eda7d80d02 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -998,6 +998,20 @@ CREATE VIEW pg_stat_wal_receiver AS
     FROM pg_stat_get_wal_receiver() s
     WHERE s.pid IS NOT NULL;
 
+CREATE VIEW pg_stat_recovery AS
+    SELECT
+            s.promote_triggered,
+            s.last_replayed_read_lsn,
+            s.last_replayed_end_lsn,
+            s.last_replayed_tli,
+
s.replay_end_lsn, + s.replay_end_tli, + s.recovery_last_xact_time, + s.current_chunk_start_time, + s.pause_state + FROM pg_stat_get_recovery() s + WHERE s.promote_triggered IS NOT NULL; + CREATE VIEW pg_stat_recovery_prefetch AS SELECT s.stats_reset, diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 123e7c4261b..b863edfabda 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202603051 +#define CATALOG_VERSION_NO 202603061 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 4950bff2804..361e2cfffeb 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5701,6 +5701,13 @@ proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}', proargnames => '{pid,status,receive_start_lsn,receive_start_tli,written_lsn,flushed_lsn,received_tli,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time,slot_name,sender_host,sender_port,conninfo}', prosrc => 'pg_stat_get_wal_receiver' }, +{ oid => '9949', descr => 'statistics: information about WAL recovery', + proname => 'pg_stat_get_recovery', proisstrict => 'f', provolatile => 's', + proparallel => 'r', prorettype => 'record', proargtypes => '', + proallargtypes => '{bool,pg_lsn,pg_lsn,int4,pg_lsn,int4,timestamptz,timestamptz,text}', + proargmodes => '{o,o,o,o,o,o,o,o,o}', + proargnames => '{promote_triggered,last_replayed_read_lsn,last_replayed_end_lsn,last_replayed_tli,replay_end_lsn,replay_end_tli,recovery_last_xact_time,current_chunk_start_time,pause_state}', + prosrc => 'pg_stat_get_recovery' }, { oid => '6169', descr => 'statistics: information about replication slot', proname => 'pg_stat_get_replication_slot', provolatile => 's', proparallel => 'r', prorettype => 'record', proargtypes => 'text', diff --git a/src/test/recovery/t/001_stream_rep.pl b/src/test/recovery/t/001_stream_rep.pl index e9ac67813c7..a4fa4b96c61 
100644 --- a/src/test/recovery/t/001_stream_rep.pl +++ b/src/test/recovery/t/001_stream_rep.pl @@ -82,6 +82,11 @@ $result = print "standby 2: $result\n"; is($result, qq(1002), 'check streamed content on standby 2'); +$result = $node_standby_1->safe_psql('postgres', + "SELECT count(*) FROM pg_stat_recovery WHERE promote_triggered IS NOT NULL" +); +is($result, qq(1), 'check recovery state on standby 1'); + # Likewise, but for a sequence $node_primary->safe_psql('postgres', "CREATE SEQUENCE seq1; SELECT nextval('seq1')"); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 78a37d9fc8f..deb6e2ad6a9 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2127,6 +2127,17 @@ pg_stat_progress_vacuum| SELECT s.pid, END AS started_by FROM (pg_stat_get_progress_info('VACUUM'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) LEFT JOIN pg_database d ON ((s.datid = d.oid))); +pg_stat_recovery| SELECT promote_triggered, + last_replayed_read_lsn, + last_replayed_end_lsn, + last_replayed_tli, + replay_end_lsn, + replay_end_tli, + recovery_last_xact_time, + current_chunk_start_time, + pause_state + FROM pg_stat_get_recovery() s(promote_triggered, last_replayed_read_lsn, last_replayed_end_lsn, last_replayed_tli, replay_end_lsn, replay_end_tli, recovery_last_xact_time, current_chunk_start_time, pause_state) + WHERE (promote_triggered IS NOT NULL); pg_stat_recovery_prefetch| SELECT stats_reset, prefetch, hit, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 3dd63fd88ed..132b56a5864 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -143,6 +143,13 @@ select count(*) = 0 as ok from pg_stat_wal_receiver; t (1 row) +-- We expect no recovery state in this test (running 
on primary) +select count(*) = 0 as ok from pg_stat_recovery; + ok +---- + t +(1 row) + -- This is to record the prevailing planner enable_foo settings during -- a regression test run. select name, setting from pg_settings where name like 'enable%'; diff --git a/src/test/regress/sql/sysviews.sql b/src/test/regress/sql/sysviews.sql index 004f9a70e00..507e400ad4a 100644 --- a/src/test/regress/sql/sysviews.sql +++ b/src/test/regress/sql/sysviews.sql @@ -76,6 +76,9 @@ select count(*) = 1 as ok from pg_stat_wal; -- We expect no walreceiver running in this test select count(*) = 0 as ok from pg_stat_wal_receiver; +-- We expect no recovery state in this test (running on primary) +select count(*) = 0 as ok from pg_stat_recovery; + -- This is to record the prevailing planner enable_foo settings during -- a regression test run. select name, setting from pg_settings where name like 'enable%';