mirror of https://github.com/postgres/postgres
Commit 78ea8b5 fixed an issue related to the recycling of WAL segments on
standbys depending on archive_mode. However, it introduced a
regression in the handling of WAL segments ready to be archived during
crash recovery, causing those files to be recycled without being
archived.
This commit fixes the regression by tracking in shared memory whether a
live cluster is in crash recovery or in archive recovery, and by using
this new shared memory state to decide if a segment can be recycled or
not. The handling of WAL segments ready to be archived differs between
the two cases: those WAL segments should not be removed during crash
recovery. Previously, it was not possible to know if a cluster was in
crash recovery or archive recovery as the shared state was able to track
only if recovery was happening or not, leading to the problem.
A set of TAP tests is added to close the gap here, making sure that WAL
segments ready to be archived are correctly handled when a cluster is in
archive or crash recovery with archive_mode set to "on" or "always", for
both standby and primary.
Reported-by: Benoît Lobréau
Author: Jehan-Guillaume de Rorthais
Reviewed-by: Kyotaro Horiguchi, Fujii Masao, Michael Paquier
Discussion: https://postgr.es/m/20200331172229.40ee00dc@firost
Backpatch-through: 9.5
pull/54/head
parent
3436c5e283
commit
4e87c4836a
@ -0,0 +1,214 @@ |
||||
#
# Tests related to WAL archiving and recovery.
#
# The script checks the archive status files (.ready and .done, located in
# pg_wal/archive_status) of WAL segments in the following situations:
# - on a primary whose archive_command fails, before and after a crash;
# - on the same primary once archiving is allowed again;
# - on a standby created from a cold backup with archive_mode = on;
# - on a standby created from the same backup with archive_mode = always,
#   before and after a crash, and once archiving is allowed again.
#
use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More tests => 16;
use Config;

my $primary = get_new_node('master');
$primary->init(
	has_archiving    => 1,
	allows_streaming => 1);
$primary->append_conf('postgresql.conf', 'autovacuum = off');
$primary->start;
my $primary_data = $primary->data_dir;

# Temporarily use an archive_command value to make the archiver fail,
# knowing that archiving is enabled.  Note that we cannot use a command
# that does not exist as in this case the archiver process would just exit
# without reporting the failure to pg_stat_archiver.  This also cannot
# use a plain "false" as that's unportable on Windows.  So, instead, as
# a portable solution, use an archive command based on a command known to
# work but that will fail: copy with an incorrect original path.
my $incorrect_command =
  $TestLib::windows_os
  ? qq{copy "%p_does_not_exist" "%f_does_not_exist"}
  : qq{cp "%p_does_not_exist" "%f_does_not_exist"};
$primary->safe_psql(
	'postgres', qq{
    ALTER SYSTEM SET archive_command TO '$incorrect_command';
    SELECT pg_reload_conf();
});

# Save the WAL segment currently in use and switch to a new segment.
# This will be used to track the activity of the archiver.
my $segment_name_1 = $primary->safe_psql('postgres',
	q{SELECT pg_walfile_name(pg_current_wal_lsn())});
my $segment_path_1       = "pg_wal/archive_status/$segment_name_1";
my $segment_path_1_ready = "$segment_path_1.ready";
my $segment_path_1_done  = "$segment_path_1.done";
$primary->safe_psql(
	'postgres', q{
	CREATE TABLE mine AS SELECT generate_series(1,10) AS x;
	SELECT pg_switch_wal();
	CHECKPOINT;
});

# Wait for an archive failure.
$primary->poll_query_until('postgres',
	q{SELECT failed_count > 0 FROM pg_stat_archiver}, 't')
  or die "Timed out while waiting for archiving to fail";
ok( -f "$primary_data/$segment_path_1_ready",
	".ready file exists for WAL segment $segment_name_1 waiting to be archived"
);
ok( !-f "$primary_data/$segment_path_1_done",
	".done file does not exist for WAL segment $segment_name_1 waiting to be archived"
);

# The description is double-quoted so that the segment name is interpolated
# into it, like in the other test descriptions of this script.
is( $primary->safe_psql(
		'postgres', q{
		SELECT archived_count, last_failed_wal
		FROM pg_stat_archiver
	}),
	"0|$segment_name_1",
	"pg_stat_archiver failed to archive $segment_name_1");

# Crash the cluster for the next test in charge of checking that non-archived
# WAL segments are not removed.
$primary->stop('immediate');

# Recovery tests for the archiving with a standby partially check
# the recovery behavior when restoring a backup taken using a
# snapshot with no pg_start/stop_backup.  In this situation,
# the recovered standby should first enter crash recovery, then
# switch to regular archive recovery.  Note that the base backup
# is taken here so that archive_command will fail.  This is necessary
# for the assumptions of the tests done with the standbys below.
$primary->backup_fs_cold('backup');

$primary->start;
ok( -f "$primary_data/$segment_path_1_ready",
	".ready file for WAL segment $segment_name_1 still exists after crash recovery on primary"
);

# Allow WAL archiving again and wait for a success.
$primary->safe_psql(
	'postgres', q{
	ALTER SYSTEM RESET archive_command;
	SELECT pg_reload_conf();
});

$primary->poll_query_until('postgres',
	q{SELECT archived_count FROM pg_stat_archiver}, '1')
  or die "Timed out while waiting for archiving to finish";

ok(!-f "$primary_data/$segment_path_1_ready",
	".ready file for archived WAL segment $segment_name_1 removed");

ok(-f "$primary_data/$segment_path_1_done",
	".done file for archived WAL segment $segment_name_1 exists");

is( $primary->safe_psql(
		'postgres', q{ SELECT last_archived_wal FROM pg_stat_archiver }),
	$segment_name_1,
	"archive success reported in pg_stat_archiver for WAL segment $segment_name_1"
);

# Create some WAL activity and a new checkpoint so that the next standby can
# create a restartpoint.  As this standby starts in crash recovery because
# of the cold backup taken previously, it needs a clean restartpoint to deal
# with existing status files.
my $segment_name_2 = $primary->safe_psql('postgres',
	q{SELECT pg_walfile_name(pg_current_wal_lsn())});
my $segment_path_2       = "pg_wal/archive_status/$segment_name_2";
my $segment_path_2_ready = "$segment_path_2.ready";
my $segment_path_2_done  = "$segment_path_2.done";
$primary->safe_psql(
	'postgres', q{
	INSERT INTO mine SELECT generate_series(10,20) AS x;
	SELECT pg_switch_wal();
	CHECKPOINT;
});

$primary->poll_query_until('postgres',
	q{ SELECT last_archived_wal FROM pg_stat_archiver },
	$segment_name_2)
  or die "Timed out while waiting for archiving to finish";

# Test standby with archive_mode = on.
my $standby1 = get_new_node('standby');
$standby1->init_from_backup($primary, 'backup', has_restoring => 1);
$standby1->append_conf('postgresql.conf', "archive_mode = on");
my $standby1_data = $standby1->data_dir;
$standby1->start;
$standby1->safe_psql('postgres', q{CHECKPOINT});

# Recovery with archive_mode=on does not keep .ready signal files inherited
# from backup.  Note that this WAL segment existed in the backup.
ok( !-f "$standby1_data/$segment_path_1_ready",
	".ready file for WAL segment $segment_name_1 present in backup got removed with archive_mode=on on standby"
);

# Recovery with archive_mode=on should not create .ready files.
# Note that this segment did not exist in the backup.
ok( !-f "$standby1_data/$segment_path_2_ready",
	".ready file for WAL segment $segment_name_2 not created on standby when archive_mode=on on standby"
);

# Recovery with archive_mode = on creates .done files.
ok( -f "$standby1_data/$segment_path_2_done",
	".done file for WAL segment $segment_name_2 created when archive_mode=on on standby"
);

# Test recovery with archive_mode = always, which should always keep
# .ready files if archiving is enabled, though here we want the archive
# command to fail to persist the .ready files.  Note that this node
# has inherited the archive command of the previous cold backup that
# will cause archiving failures.
my $standby2 = get_new_node('standby2');
$standby2->init_from_backup($primary, 'backup', has_restoring => 1);
$standby2->append_conf('postgresql.conf', 'archive_mode = always');
my $standby2_data = $standby2->data_dir;
$standby2->start;

$standby2->safe_psql('postgres', q{CHECKPOINT});

ok( -f "$standby2_data/$segment_path_1_ready",
	".ready file for WAL segment $segment_name_1 existing in backup is kept with archive_mode=always on standby"
);

ok( -f "$standby2_data/$segment_path_2_ready",
	".ready file for WAL segment $segment_name_2 created with archive_mode=always on standby"
);

# Reset statistics of the archiver for the next checks.
$standby2->safe_psql('postgres', q{SELECT pg_stat_reset_shared('archiver')});

# Now crash the cluster to check that the recovery step does not
# remove non-archived WAL segments on a standby where archiving
# is enabled.
$standby2->stop('immediate');
$standby2->start;

ok( -f "$standby2_data/$segment_path_1_ready",
	"WAL segment still ready to archive after crash recovery on standby with archive_mode=always"
);

# Allow WAL archiving again, and wait for the segments to be archived.
$standby2->safe_psql(
	'postgres', q{
	ALTER SYSTEM RESET archive_command;
	SELECT pg_reload_conf();
});
$standby2->poll_query_until('postgres',
	q{SELECT last_archived_wal FROM pg_stat_archiver},
	$segment_name_2)
  or die "Timed out while waiting for archiving to finish";

is( $standby2->safe_psql(
		'postgres', q{SELECT archived_count FROM pg_stat_archiver}),
	'2',
	"correct number of WAL segments archived from standby");

ok( !-f "$standby2_data/$segment_path_1_ready"
	  && !-f "$standby2_data/$segment_path_2_ready",
	".ready files removed after archive success with archive_mode=always on standby"
);

ok( -f "$standby2_data/$segment_path_1_done"
	  && -f "$standby2_data/$segment_path_2_done",
	".done files created after archive success with archive_mode=always on standby"
);
Loading…
Reference in new issue