mirror of https://github.com/postgres/postgres
Previously none of our tests triggered recovery conflicts. The test is primarily motivated by needing tests for recovery conflict stats for shared memory based pgstats. But it's also a decent start for recovery conflict handling in general. The only type of recovery conflict not tested yet are rcovery deadlock conflicts. By configuring log_recovery_conflict_waits the test adds some very minimal testing for that path as well. Author: Melanie Plageman <melanieplageman@gmail.com> Author: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/20220303021600.hs34ghqcw6zcokdh@alap3.anarazel.depull/81/head
parent
53b9cd20d4
commit
9f8a050f68
@ -0,0 +1,296 @@ |
||||
# Copyright (c) 2021-2022, PostgreSQL Global Development Group |
||||
|
||||
# Test that connections to a hot standby are correctly canceled when a |
||||
# recovery conflict is detected Also, test that statistics in |
||||
# pg_stat_database_conflicts are populated correctly |
||||
|
||||
# TODO: add a test for deadlock recovery conflicts. |
||||
|
||||
use strict; |
||||
use warnings; |
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
|
||||
# Set up nodes |
||||
my $node_primary = PostgreSQL::Test::Cluster->new('primary'); |
||||
$node_primary->init(allows_streaming => 1); |
||||
|
||||
my $tablespace1 = "test_recovery_conflict_tblspc"; |
||||
|
||||
$node_primary->append_conf( |
||||
'postgresql.conf', qq[ |
||||
allow_in_place_tablespaces = on |
||||
log_temp_files = 0 |
||||
|
||||
# wait some to test the wait paths as well, but not long for obvious reasons |
||||
max_standby_streaming_delay = 50ms |
||||
|
||||
temp_tablespaces = $tablespace1 |
||||
# Some of the recovery conflict logging code only gets exercised after |
||||
# deadlock_timeout. The test doesn't rely on that additional output, but it's |
||||
# nice to get some minimal coverage of that code. |
||||
log_recovery_conflict_waits = on |
||||
deadlock_timeout = 10ms |
||||
]); |
||||
$node_primary->start; |
||||
|
||||
my $backup_name = 'my_backup'; |
||||
|
||||
$node_primary->safe_psql('postgres', |
||||
qq[CREATE TABLESPACE $tablespace1 LOCATION '']); |
||||
|
||||
$node_primary->backup($backup_name); |
||||
my $node_standby = PostgreSQL::Test::Cluster->new('standby'); |
||||
$node_standby->init_from_backup($node_primary, $backup_name, |
||||
has_streaming => 1); |
||||
|
||||
$node_standby->start; |
||||
|
||||
my $test_db = "test_db"; |
||||
|
||||
# use a new database, to trigger database recovery conflict |
||||
$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db"); |
||||
|
||||
# test schema / data |
||||
my $table1 = "test_recovery_conflict_table1"; |
||||
$node_primary->safe_psql($test_db, qq[CREATE TABLE ${table1}(a int, b int);]); |
||||
$node_primary->safe_psql($test_db, |
||||
qq[INSERT INTO $table1 SELECT i % 3, 0 FROM generate_series(1,20) i]); |
||||
my $primary_lsn = $node_primary->lsn('flush'); |
||||
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn); |
||||
|
||||
|
||||
# a longrunning psql that we can use to trigger conflicts |
||||
my $psql_timeout = IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default); |
||||
my %psql_standby = ('stdin' => '', 'stdout' => ''); |
||||
$psql_standby{run} = |
||||
$node_standby->background_psql($test_db, \$psql_standby{stdin}, |
||||
\$psql_standby{stdout}, |
||||
$psql_timeout); |
||||
$psql_standby{stdout} = ''; |
||||
|
||||
my $expected_conflicts = 0; |
||||
|
||||
|
||||
## RECOVERY CONFLICT 1: Buffer pin conflict |
||||
my $sect = "buffer pin conflict"; |
||||
$expected_conflicts++; |
||||
|
||||
# Aborted INSERT on primary that will be cleaned up by vacuum. Has to be old |
||||
# enough so that there's not a snapshot conflict before the buffer pin |
||||
# conflict. |
||||
|
||||
$node_primary->safe_psql( |
||||
$test_db, |
||||
qq[ |
||||
BEGIN; |
||||
INSERT INTO $table1 VALUES (1,0); |
||||
ROLLBACK; |
||||
-- ensure flush, rollback doesn't do so |
||||
BEGIN; LOCK $table1; COMMIT; |
||||
]); |
||||
|
||||
$primary_lsn = $node_primary->lsn('flush'); |
||||
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn); |
||||
|
||||
my $cursor1 = "test_recovery_conflict_cursor"; |
||||
|
||||
# DECLARE and use a cursor on standby, causing buffer with the only block of |
||||
# the relation to be pinned on the standby |
||||
$psql_standby{stdin} .= qq[ |
||||
BEGIN; |
||||
DECLARE $cursor1 CURSOR FOR SELECT b FROM $table1; |
||||
FETCH FORWARD FROM $cursor1; |
||||
]; |
||||
# FETCH FORWARD should have returned a 0 since all values of b in the table |
||||
# are 0 |
||||
ok(pump_until_standby(qr/^0$/m), |
||||
"$sect: cursor with conflicting pin established"); |
||||
|
||||
# to check the log starting now for recovery conflict messages |
||||
my $log_location = -s $node_standby->logfile; |
||||
|
||||
# VACUUM on the primary |
||||
$node_primary->safe_psql($test_db, qq[VACUUM $table1;]); |
||||
|
||||
# Wait for catchup. Existing connection will be terminated before replay is |
||||
# finished, so waiting for catchup ensures that there is no race between |
||||
# encountering the recovery conflict which causes the disconnect and checking |
||||
# the logfile for the terminated connection. |
||||
$primary_lsn = $node_primary->lsn('flush'); |
||||
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn); |
||||
|
||||
check_conflict_log("User was holding shared buffer pin for too long"); |
||||
reconnect_and_clear(); |
||||
check_conflict_stat("bufferpin"); |
||||
|
||||
|
||||
## RECOVERY CONFLICT 2: Snapshot conflict |
||||
$sect = "snapshot conflict"; |
||||
$expected_conflicts++; |
||||
|
||||
$node_primary->safe_psql($test_db, |
||||
qq[INSERT INTO $table1 SELECT i, 0 FROM generate_series(1,20) i]); |
||||
$primary_lsn = $node_primary->lsn('flush'); |
||||
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn); |
||||
|
||||
# DECLARE and FETCH from cursor on the standby |
||||
$psql_standby{stdin} .= qq[ |
||||
BEGIN; |
||||
DECLARE $cursor1 CURSOR FOR SELECT b FROM $table1; |
||||
FETCH FORWARD FROM $cursor1; |
||||
]; |
||||
ok( pump_until( |
||||
$psql_standby{run}, $psql_timeout, |
||||
\$psql_standby{stdout}, qr/^0$/m,), |
||||
"$sect: cursor with conflicting snapshot established"); |
||||
|
||||
# Do some HOT updates |
||||
$node_primary->safe_psql($test_db, |
||||
qq[UPDATE $table1 SET a = a + 1 WHERE a > 2;]); |
||||
|
||||
# VACUUM, pruning those dead tuples |
||||
$node_primary->safe_psql($test_db, qq[VACUUM $table1;]); |
||||
|
||||
# Wait for attempted replay of PRUNE records |
||||
$primary_lsn = $node_primary->lsn('flush'); |
||||
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn); |
||||
|
||||
check_conflict_log( |
||||
"User query might have needed to see row versions that must be removed"); |
||||
reconnect_and_clear(); |
||||
check_conflict_stat("snapshot"); |
||||
|
||||
|
||||
## RECOVERY CONFLICT 3: Lock conflict |
||||
$sect = "lock conflict"; |
||||
$expected_conflicts++; |
||||
|
||||
# acquire lock to conflict with |
||||
$psql_standby{stdin} .= qq[ |
||||
BEGIN; |
||||
LOCK TABLE $table1 IN ACCESS SHARE MODE; |
||||
SELECT 1; |
||||
]; |
||||
ok(pump_until_standby(qr/^1$/m), "$sect: conflicting lock acquired"); |
||||
|
||||
# DROP TABLE containing block which standby has in a pinned buffer |
||||
$node_primary->safe_psql($test_db, qq[DROP TABLE $table1;]); |
||||
|
||||
$primary_lsn = $node_primary->lsn('flush'); |
||||
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn); |
||||
|
||||
check_conflict_log("User was holding a relation lock for too long"); |
||||
reconnect_and_clear(); |
||||
check_conflict_stat("lock"); |
||||
|
||||
|
||||
## RECOVERY CONFLICT 4: Tablespace conflict |
||||
$sect = "tablespace conflict"; |
||||
$expected_conflicts++; |
||||
|
||||
# DECLARE a cursor for a query which, with sufficiently low work_mem, will |
||||
# spill tuples into temp files in the temporary tablespace created during |
||||
# setup. |
||||
$psql_standby{stdin} .= qq[ |
||||
BEGIN; |
||||
SET work_mem = '64kB'; |
||||
DECLARE $cursor1 CURSOR FOR |
||||
SELECT count(*) FROM generate_series(1,6000); |
||||
FETCH FORWARD FROM $cursor1; |
||||
]; |
||||
ok(pump_until_standby(qr/^6000$/m), |
||||
"$sect: cursor with conflicting temp file established"); |
||||
|
||||
# Drop the tablespace currently containing spill files for the query on the |
||||
# standby |
||||
$node_primary->safe_psql($test_db, qq[DROP TABLESPACE $tablespace1;]); |
||||
|
||||
$primary_lsn = $node_primary->lsn('flush'); |
||||
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn); |
||||
|
||||
check_conflict_log( |
||||
"User was or might have been using tablespace that must be dropped"); |
||||
reconnect_and_clear(); |
||||
check_conflict_stat("tablespace"); |
||||
|
||||
|
||||
# Check that expected number of conflicts show in pg_stat_database. Needs to |
||||
# be tested before database is dropped, for obvious reasons. |
||||
is( $node_standby->safe_psql( |
||||
$test_db, |
||||
qq[SELECT conflicts FROM pg_stat_database WHERE datname='$test_db';]), |
||||
$expected_conflicts, |
||||
qq[$expected_conflicts recovery conflicts shown in pg_stat_database]); |
||||
|
||||
|
||||
## RECOVERY CONFLICT 5: Database conflict |
||||
$sect = "database conflict"; |
||||
|
||||
$node_primary->safe_psql('postgres', qq[DROP DATABASE $test_db;]); |
||||
|
||||
$primary_lsn = $node_primary->lsn('flush'); |
||||
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn); |
||||
|
||||
check_conflict_log("User was connected to a database that must be dropped"); |
||||
|
||||
|
||||
# explicitly shut down psql instances gracefully - to avoid hangs or worse on |
||||
# windows |
||||
$psql_standby{stdin} .= "\\q\n"; |
||||
$psql_standby{run}->finish; |
||||
|
||||
$node_standby->stop(); |
||||
$node_primary->stop(); |
||||
|
||||
|
||||
done_testing(); |
||||
|
||||
|
||||
sub pump_until_standby |
||||
{ |
||||
my $match = shift; |
||||
|
||||
return pump_until($psql_standby{run}, $psql_timeout, |
||||
\$psql_standby{stdout}, $match); |
||||
} |
||||
|
||||
sub reconnect_and_clear |
||||
{ |
||||
$psql_standby{stdin} .= "\\q\n"; |
||||
$psql_standby{run}->finish; |
||||
|
||||
# restart |
||||
$psql_standby{run}->run(); |
||||
$psql_standby{stdin} = ''; |
||||
$psql_standby{stdout} = ''; |
||||
|
||||
# Run query to ensure connection has finished re-establishing |
||||
$psql_standby{stdin} .= qq[SELECT 1;\n]; |
||||
die unless pump_until_standby(qr/^1$/m); |
||||
$psql_standby{stdout} = ''; |
||||
} |
||||
|
||||
sub check_conflict_log |
||||
{ |
||||
my $message = shift; |
||||
my $old_log_location = $log_location; |
||||
|
||||
$log_location = $node_standby->wait_for_log(qr/$message/, $log_location); |
||||
|
||||
cmp_ok($log_location, '>', $old_log_location, |
||||
"$sect: logfile contains terminated connection due to recovery conflict" |
||||
); |
||||
} |
||||
|
||||
sub check_conflict_stat |
||||
{ |
||||
my $conflict_type = shift; |
||||
my $count = $node_standby->safe_psql($test_db, |
||||
qq[SELECT confl_$conflict_type FROM pg_stat_database_conflicts WHERE datname='$test_db';] |
||||
); |
||||
|
||||
is($count, 1, "$sect: stats show conflict on standby"); |
||||
} |
||||
Loading…
Reference in new issue