mirror of https://github.com/postgres/postgres
While I was working on a patch to refactor things around xlog.c, I mixed up EndOfLogTLI and replayTLI at the end of recovery. As a result, if you recovered to a point with a lower-numbered timeline in a WAL segment that has a higher TLI in the filename, the end-of-recovery WAL record was created with an invalid PrevTimeLineId. I noticed that while self-reviewing, but no tests failed. So add a test to cover that corner case.
Thanks to Amul Sul, who also submitted a test case for the same corner case, although this patch is different from that.
Reviewed-by: Amul Sul, Michael Paquier
Discussion: https://www.postgresql.org/message-id/52bc9ccd-8591-431b-0086-15d9acf25a3f@iki.fi
Discussion: https://www.postgresql.org/message-id/CAAJ_b94Vjt5cXGza_1MkjLQWciNdEemsmiWuQj0d%3DM7JfjAa1g%40mail.gmail.com
pull/77/head
parent
1383d52faa
commit
50e5bc582a
@ -0,0 +1,176 @@ |
||||
# Copyright (c) 2022, PostgreSQL Global Development Group |
||||
|
||||
# Test recovering to a point-in-time using WAL archive, such that the |
||||
# target point is physically in a WAL segment with a higher TLI than |
||||
# the target point's TLI. For example, imagine that the following WAL |
||||
# segments exist in the WAL archive: |
||||
# |
||||
# 000000010000000000000001 |
||||
# 000000010000000000000002 |
||||
# 000000020000000000000003 |
||||
# |
||||
# The timeline switch happened in the middle of WAL segment 3, but it |
||||
# was never archived on timeline 1. The first half of |
||||
# 000000020000000000000003 contains the WAL from timeline 1 up to the |
||||
# point where the timeline switch happened. If you now perform |
||||
# archive recovery with recovery target point in that first half of |
||||
# segment 3, archive recovery will find the WAL up to that point in |
||||
# segment 000000020000000000000003, but it will not follow the |
||||
# timeline switch to timeline 2, and creates a timeline switching |
||||
# end-of-recovery record with TLI 1 -> 3. That's what this test case |
||||
# tests. |
||||
# |
||||
# The comments below contain lists of WAL segments at different points |
||||
# in the tests, to make it easier to follow along. They are correct |
||||
# as of this writing, but the exact WAL segment numbers could change |
||||
# if the backend logic for when it switches to a new segment changes. |
||||
# The actual checks are not sensitive to that. |
||||
|
||||
use strict; |
||||
use warnings; |
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
use File::Compare; |
||||
|
||||
# Create the primary, initialized with WAL archiving and streaming
# enabled, and bring it up.
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
$node_primary->init(
    allows_streaming => 1,
    has_archiving    => 1);
$node_primary->start();

# Base backup.  The other nodes in this test are all initialized from
# this backup.
my $backup_name = 'my_backup';
$node_primary->backup($backup_name);
||||
|
||||
# Workload with some transactions, and the target restore point 'rp'.
# Row 1 is inserted before the restore point and row 2 after it, so the
# later max(i) checks show where each recovery stopped.
#
# Use safe_psql() rather than psql(): the return value of psql() was
# ignored here, so a failure in this setup step would only have shown
# up as a confusing failure further down in the test.
$node_primary->safe_psql(
    'postgres', qq{
CREATE TABLE foo(i int);
INSERT INTO foo VALUES(1);
SELECT pg_create_restore_point('rp');
INSERT INTO foo VALUES(2);
});
||||
|
||||
# Contents of the WAL archive at this point: |
||||
# |
||||
# 000000010000000000000001 |
||||
# 000000010000000000000002 |
||||
# 000000010000000000000002.00000028.backup |
||||
# |
||||
# The operations on the test table and the restore point went into WAL |
||||
# segment 3, but it hasn't been archived yet. |
||||
|
||||
# Start a standby node, and wait for it to catch up.  The standby is
# set up to stream from the primary and not to restore from the archive
# (has_restoring => 0).  archive_mode = always is set so that the
# standby archives WAL itself; the archive-wait further down relies on
# that.
my $node_standby = PostgreSQL::Test::Cluster->new('standby');
$node_standby->init_from_backup(
    $node_primary, $backup_name,
    standby       => 1,
    has_streaming => 1,
    has_archiving => 1,
    has_restoring => 0);
$node_standby->append_conf('postgresql.conf', 'archive_mode = always');
$node_standby->start;
$node_primary->wait_for_catchup($node_standby);

# Check that it's really caught up: both rows inserted on the primary
# must be visible.  (This is a streaming standby, so the test name says
# "catchup" rather than "archive recovery".)
my $result = $node_standby->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{2}, "check table contents on standby after catchup");
||||
|
||||
# Kill the old primary, before it archives the most recent WAL segment that
# contains all the INSERTs.
$node_primary->stop('immediate');

# Promote the standby, and switch WAL so that it archives a WAL segment
# that contains all the INSERTs, on a new timeline.
$node_standby->promote;

# Find next WAL segment to be archived.
my $walfile_to_be_archived = $node_standby->safe_psql('postgres',
    "SELECT pg_walfile_name(pg_current_wal_lsn());");

# Make WAL segment eligible for archival
$node_standby->safe_psql('postgres', 'SELECT pg_switch_wal()');

# Wait until the WAL segment has been archived.  The query compares WAL
# file names as strings against pg_stat_archiver.last_archived_wal.
my $archive_wait_query =
  "SELECT '$walfile_to_be_archived' <= last_archived_wal FROM pg_stat_archiver;";
$node_standby->poll_query_until('postgres', $archive_wait_query)
  or die "Timed out while waiting for WAL segment to be archived";

# Ok, the standby has now archived the WAL on timeline 2.  We don't
# need the standby anymore.
$node_standby->stop;
||||
|
||||
# Contents of the WAL archive at this point: |
||||
# |
||||
# 000000010000000000000001 |
||||
# 000000010000000000000002 |
||||
# 000000010000000000000002.00000028.backup |
||||
# 000000010000000000000003.partial |
||||
# 000000020000000000000003 |
||||
# 00000002.history |
||||
# |
||||
# The operations on the test table and the restore point are in |
||||
# segment 3. They are part of timeline 1, but were not archived by |
||||
# the primary yet. However, they were copied into the beginning of |
||||
# segment 000000020000000000000003, before the timeline switching |
||||
# record. (They are also present in the |
||||
# 000000010000000000000003.partial file, but .partial files are not |
||||
# used automatically.) |
||||
|
||||
# Point-in-time recovery to restore point 'rp', restoring WAL from the
# archive.  Recovery is expected to find the WAL it needs in segment
# 000000020000000000000003 and to stop at the target, without following
# the switch to timeline 2.
my $node_pitr = PostgreSQL::Test::Cluster->new('node_pitr');
$node_pitr->init_from_backup(
    $node_primary, $backup_name,
    has_restoring => 1,
    standby       => 0);
$node_pitr->append_conf(
    'postgresql.conf', qq{
recovery_target_name = 'rp'
recovery_target_action = 'promote'
});
$node_pitr->start();

# Poll until the node reports that it is no longer in recovery.
unless ($node_pitr->poll_query_until(
        'postgres', "SELECT pg_is_in_recovery() = 'f';"))
{
    die "Timed out while waiting for PITR promotion";
}

# Only the row inserted before the restore point should be visible.
$result = $node_pitr->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, '1', "check table contents after point-in-time recovery");

# Add a marker row on the timeline created by this recovery, so that a
# later recovery can be checked for having reached this timeline.
$node_pitr->safe_psql('postgres', "INSERT INTO foo VALUES(3);");

# Shut the node down.  This archives the last segment.
$node_pitr->stop;
||||
|
||||
# Test archive recovery on the timeline created by the PITR.  This
# replays the end-of-recovery record that switches from timeline 1 to
# 3.
#
# NOTE(review): no recovery target is configured for this node, so it
# presumably replays everything available in the archive before
# promoting -- confirm against the recovery_target_timeline default.
my $node_pitr2 = PostgreSQL::Test::Cluster->new('node_pitr2');
$node_pitr2->init_from_backup(
    $node_primary, $backup_name,
    standby       => 0,
    has_restoring => 1);
$node_pitr2->append_conf(
    'postgresql.conf', qq{
recovery_target_action = 'promote'
});

$node_pitr2->start;

# Wait until recovery finishes.  The message is distinct from the one
# used for the first PITR node, so a timeout can be attributed to the
# right step.
$node_pitr2->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
  or die "Timed out while waiting for archive recovery to finish";

# Verify that we can see the row inserted after the PITR.  The test
# name differs from the earlier PITR check so that the two results can
# be told apart in the TAP output.
$result = $node_pitr2->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{3},
    "check table contents after archive recovery on the new timeline");

done_testing();
||||
Loading…
Reference in new issue