mirror of https://github.com/postgres/postgres
Crash recovery on standby may encounter missing directories when replaying create database WAL records. Prior to this patch, the standby would fail to recover in such a case. However, the directories could be legitimately missing. Consider a sequence of WAL records as follows: CREATE DATABASE, DROP DATABASE, DROP TABLESPACE. If, after replaying the last WAL record and removing the tablespace directory, the standby crashes and has to replay the create database record again, the crash recovery must be able to move on. This patch adds a mechanism similar to invalid-page tracking, to keep a tally of missing directories during crash recovery. If all the missing directory references are matched with corresponding drop records at the end of crash recovery, the standby can safely continue following the primary. Backpatch to 13, at least for now. The bug is older, but fixing it in older branches requires more careful study of the interactions with commit e6d8069522, which appeared in 13. A new TAP test file is added to verify the condition. However, because it depends on commit d6d317dbf6, it can only be added to branch master. I (Álvaro) manually verified that the code behaves as expected in branch 14. It's a bit nervous-making to leave the code uncovered by tests in older branches, but leaving the bug unfixed is even worse. Also, the main reason this fix took so long is precisely that we couldn't agree on a good strategy to approach testing for the bug, so perhaps this is the best we can do. Diagnosed-by: Paul Guo <paulguo@gmail.com> Author: Paul Guo <paulguo@gmail.com> Author: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Author: Asim R Praveen <apraveen@pivotal.io> Discussion: https://postgr.es/m/CAEET0ZGx9AvioViLf7nbR_8tH9-=27DN5xWJ2P9-ROH16e4JUA@mail.gmail.com
parent
c64fb698d0
commit
49d9cfc68b
@ -0,0 +1,67 @@ |
||||
# Copyright (c) 2022, PostgreSQL Global Development Group |
||||
|
||||
# Test recovery involving tablespace removal. If recovery stops |
||||
# after a tablespace is removed, the next recovery should properly |
||||
# ignore the operations within the removed tablespaces. |
||||
|
||||
use strict; |
||||
use warnings; |
||||
|
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
# Set up the primary with the tablespaces and the template database that
# the test will later create databases in, move databases between, and
# drop.  The tablespaces are created with an empty LOCATION, which is
# accepted here because allow_in_place_tablespaces is turned on first.
my $node_primary = PostgreSQL::Test::Cluster->new('primary1');
$node_primary->init(allows_streaming => 1);
$node_primary->start;

# Use safe_psql so that a failure while building the fixture aborts the
# test right here instead of surfacing later as an unrelated failure.
$node_primary->safe_psql(
	'postgres',
	q[
SET allow_in_place_tablespaces=on;
CREATE TABLESPACE dropme_ts1 LOCATION '';
CREATE TABLESPACE dropme_ts2 LOCATION '';
CREATE TABLESPACE source_ts  LOCATION '';
CREATE TABLESPACE target_ts  LOCATION '';
CREATE DATABASE template_db IS_TEMPLATE = true;
]);

# Take the base backup only after the tablespaces exist, so the standby
# is initialized from a state that already contains them.
my $backup_name = 'my_backup';
$node_primary->backup($backup_name);

my $node_standby = PostgreSQL::Test::Cluster->new('standby1');
$node_standby->init_from_backup($node_primary, $backup_name,
	has_streaming => 1);
$node_standby->start;

# Make sure connection is made
$node_primary->poll_query_until('postgres',
	'SELECT count(*) = 1 FROM pg_stat_replication')
  or die "timed out waiting for standby to connect to primary";

# Run CHECKPOINT on the standby before generating the WAL of interest.
# NOTE(review): presumably this makes the post-crash replay start from
# this point, so that the records below are replayed again -- confirm.
$node_standby->safe_psql('postgres', 'CHECKPOINT');

# Do immediate shutdown just after a sequence of CREATE DATABASE / DROP
# DATABASE / DROP TABLESPACE. This causes CREATE DATABASE WAL records
# to be applied to already-removed directories.
$node_primary->safe_psql(
	'postgres',
	q[CREATE DATABASE dropme_db1 WITH TABLESPACE dropme_ts1;
CREATE DATABASE dropme_db2 WITH TABLESPACE dropme_ts2;
CREATE DATABASE moveme_db TABLESPACE source_ts;
ALTER DATABASE moveme_db SET TABLESPACE target_ts;
CREATE DATABASE newdb TEMPLATE template_db;
ALTER DATABASE template_db IS_TEMPLATE = false;
DROP DATABASE dropme_db1;
DROP DATABASE dropme_db2; DROP TABLESPACE dropme_ts2;
DROP TABLESPACE source_ts;
DROP DATABASE template_db;]);

# Wait until the standby has replayed everything the primary has written.
# The target must be taken from the primary's current write position:
# the primary is not in recovery, so it has no replay LSN to offer.
$node_primary->wait_for_catchup($node_standby, 'replay',
	$node_primary->lsn('write'));

# Simulate a crash of the standby after it has replayed the drops.
$node_standby->stop('immediate');

# Should restart ignoring directory creation error.
is($node_standby->start, 1, "standby started successfully");

# The standby is expected to have logged a warning for each create
# database record it had to skip; check that at least one is present.
my $log = PostgreSQL::Test::Utils::slurp_file($node_standby->logfile);
like(
	$log,
	qr[WARNING:  skipping replay of database creation WAL record],
	"warning message is logged");

done_testing();
Loading…
Reference in new issue