Add minimal tests for recovery conflict handling.

Previously none of our tests triggered recovery conflicts. The test is primarily motivated by needing tests for recovery conflict stats for shared memory based pgstats. But it's also a decent start for recovery conflict handling in general. The only type of recovery conflict not tested yet are rcovery deadlock conflicts. By configuring log_recovery_conflict_waits the test adds some very minimal testing for that path as well. Author: Melanie Plageman <melanieplageman@gmail.com> Author: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/20220303021600.hs34ghqcw6zcokdh@alap3.anarazel.de
4 years ago · 9f8a050f68
parent 53b9cd20d4
commit 9f8a050f68
1 changed files with 296 additions and 0 deletions
--- a/src/test/recovery/t/031_recovery_conflict.pl
+++ b/src/test/recovery/t/031_recovery_conflict.pl
@ -0,0 +1,296 @@
+# Copyright (c) 2021-2022, PostgreSQL Global Development Group
+
+# Test that connections to a hot standby are correctly canceled when a
+# recovery conflict is detected Also, test that statistics in
+# pg_stat_database_conflicts are populated correctly
+
+# TODO: add a test for deadlock recovery conflicts.
+
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+
+# Set up nodes
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1);
+
+my $tablespace1 = "test_recovery_conflict_tblspc";
+
+$node_primary->append_conf(
+	'postgresql.conf', qq[
+allow_in_place_tablespaces = on
+log_temp_files = 0
+
+# wait some to test the wait paths as well, but not long for obvious reasons
+max_standby_streaming_delay = 50ms
+
+temp_tablespaces = $tablespace1
+# Some of the recovery conflict logging code only gets exercised after
+# deadlock_timeout. The test doesn't rely on that additional output, but it's
+# nice to get some minimal coverage of that code.
+log_recovery_conflict_waits = on
+deadlock_timeout = 10ms
+]);
+$node_primary->start;
+
+my $backup_name = 'my_backup';
+
+$node_primary->safe_psql('postgres',
+	qq[CREATE TABLESPACE $tablespace1 LOCATION '']);
+
+$node_primary->backup($backup_name);
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_primary, $backup_name,
+	has_streaming => 1);
+
+$node_standby->start;
+
+my $test_db = "test_db";
+
+# use a new database, to trigger database recovery conflict
+$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db");
+
+# test schema / data
+my $table1 = "test_recovery_conflict_table1";
+$node_primary->safe_psql($test_db, qq[CREATE TABLE ${table1}(a int, b int);]);
+$node_primary->safe_psql($test_db,
+	qq[INSERT INTO $table1 SELECT i % 3, 0 FROM generate_series(1,20) i]);
+my $primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
+
+
+# a longrunning psql that we can use to trigger conflicts
+my $psql_timeout = IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default);
+my %psql_standby = ('stdin' => '', 'stdout' => '');
+$psql_standby{run} =
+  $node_standby->background_psql($test_db, \$psql_standby{stdin},
+	\$psql_standby{stdout},
+	$psql_timeout);
+$psql_standby{stdout} = '';
+
+my $expected_conflicts = 0;
+
+
+## RECOVERY CONFLICT 1: Buffer pin conflict
+my $sect = "buffer pin conflict";
+$expected_conflicts++;
+
+# Aborted INSERT on primary that will be cleaned up by vacuum. Has to be old
+# enough so that there's not a snapshot conflict before the buffer pin
+# conflict.
+
+$node_primary->safe_psql(
+	$test_db,
+	qq[
+	BEGIN;
+	INSERT INTO $table1 VALUES (1,0);
+	ROLLBACK;
+	-- ensure flush, rollback doesn't do so
+	BEGIN; LOCK $table1; COMMIT;
+	]);
+
+$primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
+
+my $cursor1 = "test_recovery_conflict_cursor";
+
+# DECLARE and use a cursor on standby, causing buffer with the only block of
+# the relation to be pinned on the standby
+$psql_standby{stdin} .= qq[
+        BEGIN;
+        DECLARE $cursor1 CURSOR FOR SELECT b FROM $table1;
+        FETCH FORWARD FROM $cursor1;
+        ];
+# FETCH FORWARD should have returned a 0 since all values of b in the table
+# are 0
+ok(pump_until_standby(qr/^0$/m),
+	"$sect: cursor with conflicting pin established");
+
+# to check the log starting now for recovery conflict messages
+my $log_location = -s $node_standby->logfile;
+
+# VACUUM on the primary
+$node_primary->safe_psql($test_db, qq[VACUUM $table1;]);
+
+# Wait for catchup. Existing connection will be terminated before replay is
+# finished, so waiting for catchup ensures that there is no race between
+# encountering the recovery conflict which causes the disconnect and checking
+# the logfile for the terminated connection.
+$primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
+
+check_conflict_log("User was holding shared buffer pin for too long");
+reconnect_and_clear();
+check_conflict_stat("bufferpin");
+
+
+## RECOVERY CONFLICT 2: Snapshot conflict
+$sect = "snapshot conflict";
+$expected_conflicts++;
+
+$node_primary->safe_psql($test_db,
+	qq[INSERT INTO $table1 SELECT i, 0 FROM generate_series(1,20) i]);
+$primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
+
+# DECLARE and FETCH from cursor on the standby
+$psql_standby{stdin} .= qq[
+        BEGIN;
+        DECLARE $cursor1 CURSOR FOR SELECT b FROM $table1;
+        FETCH FORWARD FROM $cursor1;
+        ];
+ok( pump_until(
+		$psql_standby{run},     $psql_timeout,
+		\$psql_standby{stdout}, qr/^0$/m,),
+	"$sect: cursor with conflicting snapshot established");
+
+# Do some HOT updates
+$node_primary->safe_psql($test_db,
+	qq[UPDATE $table1 SET a = a + 1 WHERE a > 2;]);
+
+# VACUUM, pruning those dead tuples
+$node_primary->safe_psql($test_db, qq[VACUUM $table1;]);
+
+# Wait for attempted replay of PRUNE records
+$primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
+
+check_conflict_log(
+	"User query might have needed to see row versions that must be removed");
+reconnect_and_clear();
+check_conflict_stat("snapshot");
+
+
+## RECOVERY CONFLICT 3: Lock conflict
+$sect = "lock conflict";
+$expected_conflicts++;
+
+# acquire lock to conflict with
+$psql_standby{stdin} .= qq[
+        BEGIN;
+        LOCK TABLE $table1 IN ACCESS SHARE MODE;
+        SELECT 1;
+        ];
+ok(pump_until_standby(qr/^1$/m), "$sect: conflicting lock acquired");
+
+# DROP TABLE containing block which standby has in a pinned buffer
+$node_primary->safe_psql($test_db, qq[DROP TABLE $table1;]);
+
+$primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
+
+check_conflict_log("User was holding a relation lock for too long");
+reconnect_and_clear();
+check_conflict_stat("lock");
+
+
+## RECOVERY CONFLICT 4: Tablespace conflict
+$sect = "tablespace conflict";
+$expected_conflicts++;
+
+# DECLARE a cursor for a query which, with sufficiently low work_mem, will
+# spill tuples into temp files in the temporary tablespace created during
+# setup.
+$psql_standby{stdin} .= qq[
+        BEGIN;
+        SET work_mem = '64kB';
+        DECLARE $cursor1 CURSOR FOR
+          SELECT count(*) FROM generate_series(1,6000);
+        FETCH FORWARD FROM $cursor1;
+        ];
+ok(pump_until_standby(qr/^6000$/m),
+	"$sect: cursor with conflicting temp file established");
+
+# Drop the tablespace currently containing spill files for the query on the
+# standby
+$node_primary->safe_psql($test_db, qq[DROP TABLESPACE $tablespace1;]);
+
+$primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
+
+check_conflict_log(
+	"User was or might have been using tablespace that must be dropped");
+reconnect_and_clear();
+check_conflict_stat("tablespace");
+
+
+# Check that expected number of conflicts show in pg_stat_database. Needs to
+# be tested before database is dropped, for obvious reasons.
+is( $node_standby->safe_psql(
+		$test_db,
+		qq[SELECT conflicts FROM pg_stat_database WHERE datname='$test_db';]),
+	$expected_conflicts,
+	qq[$expected_conflicts recovery conflicts shown in pg_stat_database]);
+
+
+## RECOVERY CONFLICT 5: Database conflict
+$sect = "database conflict";
+
+$node_primary->safe_psql('postgres', qq[DROP DATABASE $test_db;]);
+
+$primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
+
+check_conflict_log("User was connected to a database that must be dropped");
+
+
+# explicitly shut down psql instances gracefully - to avoid hangs or worse on
+# windows
+$psql_standby{stdin} .= "\\q\n";
+$psql_standby{run}->finish;
+
+$node_standby->stop();
+$node_primary->stop();
+
+
+done_testing();
+
+
+sub pump_until_standby
+{
+	my $match = shift;
+
+	return pump_until($psql_standby{run}, $psql_timeout,
+		\$psql_standby{stdout}, $match);
+}
+
+sub reconnect_and_clear
+{
+	$psql_standby{stdin} .= "\\q\n";
+	$psql_standby{run}->finish;
+
+	# restart
+	$psql_standby{run}->run();
+	$psql_standby{stdin}  = '';
+	$psql_standby{stdout} = '';
+
+	# Run query to ensure connection has finished re-establishing
+	$psql_standby{stdin} .= qq[SELECT 1;\n];
+	die unless pump_until_standby(qr/^1$/m);
+	$psql_standby{stdout} = '';
+}
+
+sub check_conflict_log
+{
+	my $message          = shift;
+	my $old_log_location = $log_location;
+
+	$log_location = $node_standby->wait_for_log(qr/$message/, $log_location);
+
+	cmp_ok($log_location, '>', $old_log_location,
+		"$sect: logfile contains terminated connection due to recovery conflict"
+	);
+}
+
+sub check_conflict_stat
+{
+	my $conflict_type = shift;
+	my $count         = $node_standby->safe_psql($test_db,
+		qq[SELECT confl_$conflict_type FROM pg_stat_database_conflicts WHERE datname='$test_db';]
+	);
+
+	is($count, 1, "$sect: stats show conflict on standby");
+}