mirror of https://github.com/postgres/postgres
If vacuum fails to prune a tuple killed before OldestXmin, it will decide to
freeze its xmax and later error out in pre-freeze checks.

Add a test reproducing this scenario to the recovery suite which creates a
table on a primary, updates the table to generate dead tuples for vacuum, and
then, during the vacuum, uses a replica to force GlobalVisState->maybe_needed
on the primary to move backwards and precede the value of OldestXmin set at
the beginning of vacuuming the table. This test is coverage for a case fixed
in 83c39a1f7f.

The test was originally committed to master in aa607980ae but later reverted
in efcbb76efe due to test instability.

The test requires multiple index passes. In Postgres 17+, vacuum uses a TID
store for the dead TIDs that is very space efficient. With the old minimum
maintenance_work_mem of 1 MB, it required a large number of dead rows to
generate enough dead TIDs to force multiple index vacuuming passes. Once the
source code changes were made to allow a minimum maintenance_work_mem value of
64kB, the test could be made much faster and more stable.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: John Naylor <johncnaylorls@gmail.com>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAAKRu_ZJBkidusDut6i%3DbDCiXzJEp93GC1%2BNFaZt4eqanYF3Kw%40mail.gmail.com
Backpatch-through: 17
parent 805b85027a
commit 2c0bc47657

@@ -0,0 +1,278 @@
use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use Test::More;

# Test that vacuum prunes away all dead tuples killed before OldestXmin
#
# This test creates a table on a primary, updates the table to generate dead
# tuples for vacuum, and then, during the vacuum, uses the replica to force
# GlobalVisState->maybe_needed on the primary to move backwards and precede
# the value of OldestXmin set at the beginning of vacuuming the table.

# Set up nodes
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
$node_primary->init(allows_streaming => 'physical');

# io_combine_limit is set to 1 to avoid pinning more than one buffer at a time
# to ensure test determinism.
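# maintenance_work_mem is given in kB, so 64 means 64kB -- the minimum allowed
# value -- which keeps the number of dead TIDs needed to overflow the dead-TID
# store (and so force an extra round of index vacuuming) small.
# hot_standby_feedback lets the standby's xmin hold back, and later pull back,
# the primary's visibility horizon.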
$node_primary->append_conf(
	'postgresql.conf', qq[
hot_standby_feedback = on
autovacuum = off
log_min_messages = INFO
maintenance_work_mem = 64
io_combine_limit = 1
]);
$node_primary->start;

my $node_replica = PostgreSQL::Test::Cluster->new('standby');

$node_primary->backup('my_backup');
$node_replica->init_from_backup($node_primary, 'my_backup',
	has_streaming => 1);

$node_replica->start;

my $test_db = "test_db";
$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db");

# Save the original connection info for later use
my $orig_conninfo = $node_primary->connstr();

my $table1 = "vac_horizon_floor_table";

# Long-running Primary Session A
my $psql_primaryA =
	$node_primary->background_psql($test_db, on_error_stop => 1);

# Long-running Primary Session B
my $psql_primaryB =
	$node_primary->background_psql($test_db, on_error_stop => 1);

# Our test relies on two rounds of index vacuuming for reasons elaborated
# later. To trigger two rounds of index vacuuming, we must fill up the
# TIDStore with dead items partway through a vacuum of the table. The number
# of rows is just enough to ensure we exceed maintenance_work_mem on all
# supported platforms, while keeping test runtime as short as we can.
my $nrows = 2000;

# Because vacuum's first pass, pruning, is where we use the GlobalVisState to
# check tuple visibility, GlobalVisState->maybe_needed must move backwards
# during pruning before checking the visibility for a tuple which would have
# been considered HEAPTUPLE_DEAD prior to maybe_needed moving backwards but
# HEAPTUPLE_RECENTLY_DEAD compared to the new, older value of maybe_needed.
#
# We must not only force the horizon on the primary to move backwards but also
# force the vacuuming backend's GlobalVisState to be updated. GlobalVisState
# is forced to update during index vacuuming.
#
# _bt_pendingfsm_finalize() calls GetOldestNonRemovableTransactionId() at the
# end of a round of index vacuuming, updating the backend's GlobalVisState
# and, in our case, moving maybe_needed backwards.
#
# Then vacuum's first (pruning) pass will continue and pruning will find our
# later inserted and updated tuple HEAPTUPLE_RECENTLY_DEAD when compared to
# maybe_needed but HEAPTUPLE_DEAD when compared to OldestXmin.
#
# Thus, we must force at least two rounds of index vacuuming to ensure that
# some tuple visibility checks will happen after a round of index vacuuming.
# To accomplish this, we set maintenance_work_mem to its minimum value and
# insert and delete enough rows that we force at least one round of index
# vacuuming before getting to a dead tuple which was killed after the standby
# is disconnected.
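#
# The table is created with fillfactor=10 so that the rows are spread over
# many heap pages, and deleting the rows where col1 = 0 (roughly a third of
# them) leaves dead tuples on most of those pages. That way the dead-item
# store fills up partway through the heap scan, before vacuum reaches the
# final dead tuple near the end of the table.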
$node_primary->safe_psql($test_db, qq[
	CREATE TABLE ${table1}(col1 int)
		WITH (autovacuum_enabled=false, fillfactor=10);
	INSERT INTO $table1 VALUES(7);
	INSERT INTO $table1 SELECT generate_series(1, $nrows) % 3;
	CREATE INDEX on ${table1}(col1);
	DELETE FROM $table1 WHERE col1 = 0;
	INSERT INTO $table1 VALUES(7);
]);

# We will later move the primary forward while the standby is disconnected.
# For now, however, there is no reason not to wait for the standby to catch
# up.
my $primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn);

# Test that the WAL receiver is up and running.
$node_replica->poll_query_until($test_db, qq[
	SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);] , 't');

# Set primary_conninfo to something invalid on the replica and reload the
# config. Once the config is reloaded, the startup process will force the WAL
# receiver to restart and it will be unable to reconnect because of the
# invalid connection information.
$node_replica->safe_psql($test_db, qq[
	ALTER SYSTEM SET primary_conninfo = '';
	SELECT pg_reload_conf();
]);

# Wait until the WAL receiver has shut down and been unable to start up again.
$node_replica->poll_query_until($test_db, qq[
	SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);] , 'f');

# Now insert and update a tuple which will be visible to the vacuum on the
# primary but which will have xmax newer than the oldest xmin on the standby
# that was recently disconnected.
my $res = $psql_primaryA->query_safe(
	qq[
	INSERT INTO $table1 VALUES (99);
	UPDATE $table1 SET col1 = 100 WHERE col1 = 99;
	SELECT 'after_update';
	]
);

# Make sure the UPDATE finished
like($res, qr/^after_update$/m, "UPDATE occurred on primary session A");

# Open a cursor on the primary whose pin will keep VACUUM from getting a
# cleanup lock on the first page of the relation. We want VACUUM to be able to
# start, calculate initial values for OldestXmin and GlobalVisState and then
# be unable to proceed with pruning our dead tuples. This will allow us to
# reconnect the standby and push the horizon back before we start actual
# pruning and vacuuming.
my $primary_cursor1 = "vac_horizon_floor_cursor1";

# The first value inserted into the table was a 7, so FETCH FORWARD should
# return a 7. That's how we know the cursor has a pin.
# Disable index scans so the cursor pins heap pages and not index pages.
$res = $psql_primaryB->query_safe(
	qq[
	BEGIN;
	SET enable_bitmapscan = off;
	SET enable_indexscan = off;
	SET enable_indexonlyscan = off;
	DECLARE $primary_cursor1 CURSOR FOR SELECT * FROM $table1 WHERE col1 = 7;
	FETCH $primary_cursor1;
	]
);

is($res, 7, qq[Cursor query returned $res. Expected value 7.]);

# Get the PID of the session which will run the VACUUM FREEZE so that we can
# use it to filter pg_stat_activity later.
my $vacuum_pid = $psql_primaryA->query_safe("SELECT pg_backend_pid();");

# Now start a VACUUM FREEZE on the primary. It will call vacuum_get_cutoffs()
# and establish values of OldestXmin and GlobalVisState which are newer than
# all of our dead tuples. Then it will be unable to get a cleanup lock to
# start pruning, so it will hang.
#
# We use VACUUM FREEZE because it will wait for a cleanup lock instead of
# skipping the page pinned by the cursor. Note that this works because the
# target tuple's xmax precedes OldestXmin, which ensures that
# lazy_scan_noprune() will return false and we will wait for the cleanup lock.
#
# Disable any prefetching, parallelism, or other concurrent I/O by vacuum. The
# pages of the heap must be processed in order by a single worker to ensure
# test stability (PARALLEL 0 shouldn't be necessary but guards against the
# possibility of parallel heap vacuuming).
$psql_primaryA->{stdin} .= qq[
	SET maintenance_io_concurrency = 0;
	VACUUM (VERBOSE, FREEZE, PARALLEL 0) $table1;
	\\echo VACUUM
	];

# Make sure the VACUUM command makes it to the server.
$psql_primaryA->{run}->pump_nb();
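# (A blocking query_safe() can't be used here: the VACUUM will stall on the
# cursor's buffer pin, so we only push the command text to the server with a
# non-blocking pump and track its progress via pg_stat_activity below.)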

# Make sure that the VACUUM has already called vacuum_get_cutoffs() and is
# just waiting on the lock to start vacuuming. We don't want the standby to
# re-establish a connection to the primary and push the horizon back until
# we've saved initial values in GlobalVisState and calculated OldestXmin.
$node_primary->poll_query_until($test_db,
	qq[
	SELECT count(*) >= 1 FROM pg_stat_activity
		WHERE pid = $vacuum_pid
		AND wait_event = 'BufferPin';
	],
	't');

# Ensure the WAL receiver is still not active on the replica.
$node_replica->poll_query_until($test_db, qq[
	SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);] , 'f');

# Allow the WAL receiver connection to re-establish.
$node_replica->safe_psql(
	$test_db, qq[
	ALTER SYSTEM SET primary_conninfo = '$orig_conninfo';
	SELECT pg_reload_conf();
]);

# Ensure the new WAL receiver has connected.
$node_replica->poll_query_until($test_db, qq[
	SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);] , 't');

# Once the WAL sender is shown on the primary, the replica should have
# connected with the primary and pushed the horizon backward. Primary Session
# A won't see that until the VACUUM FREEZE proceeds and does its first round
# of index vacuuming.
$node_primary->poll_query_until($test_db, qq[
	SELECT EXISTS (SELECT * FROM pg_stat_replication);] , 't');

# Move the cursor forward to the next 7. We inserted the 7 much later, so
# advancing the cursor should allow vacuum to proceed vacuuming most pages of
# the relation. Because we set maintenance_work_mem sufficiently low, we
# expect that a round of index vacuuming has happened and that the vacuum is
# now waiting for the cursor to release its pin on the last page of the
# relation.
$res = $psql_primaryB->query_safe("FETCH $primary_cursor1");
is($res, 7,
	qq[Cursor query returned $res from second fetch. Expected value 7.]);

# Prevent the test from incorrectly passing by confirming that we did indeed
# do a pass of index vacuuming.
$node_primary->poll_query_until($test_db, qq[
	SELECT index_vacuum_count > 0
	FROM pg_stat_progress_vacuum
	WHERE datname='$test_db' AND relid::regclass = '$table1'::regclass;
	] , 't');

# Commit the transaction with the open cursor so that the VACUUM can finish.
$psql_primaryB->query_until(
	qr/^commit$/m,
	qq[
	COMMIT;
	\\echo commit
	]
);

# VACUUM proceeds with pruning and does a visibility check on each tuple. In
# older versions of Postgres, pruning found our final dead tuple
# non-removable (HEAPTUPLE_RECENTLY_DEAD) since its xmax is after the new
# value of maybe_needed. Then heap_prepare_freeze_tuple() would decide the
# tuple xmax should be frozen because it precedes OldestXmin. Vacuum would
# then error out in heap_pre_freeze_checks() with "cannot freeze committed
# xmax". This was fixed by changing pruning to treat all
# HEAPTUPLE_RECENTLY_DEAD tuples with xmaxes preceding OldestXmin as
# HEAPTUPLE_DEAD and to remove them.

# With the fix, VACUUM should finish successfully, incrementing the table
# vacuum_count.
$node_primary->poll_query_until($test_db,
	qq[
	SELECT vacuum_count > 0
	FROM pg_stat_all_tables WHERE relname = '${table1}';
	]
	, 't');
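
# As an additional, hypothetical sanity check (not part of the committed
# test), one could also confirm that the table's visible contents after the
# VACUUM FREEZE are what the earlier INSERT/UPDATE left behind: no row with
# col1 = 99 (the updated-away version) and exactly one row with col1 = 100.
# Note this only verifies visibility, not pruning itself.
my $visible = $node_primary->safe_psql($test_db,
	qq[SELECT count(*) FILTER (WHERE col1 = 99),
		count(*) FILTER (WHERE col1 = 100) FROM $table1;]);
is($visible, '0|1',
	'updated-away version invisible, new version visible');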

$primary_lsn = $node_primary->lsn('flush');

# Make sure something causes us to flush
$node_primary->safe_psql($test_db, "INSERT INTO $table1 VALUES (1);");

# Nothing on the replica should cause a recovery conflict, so this should
# finish successfully.
$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn);

## Shut down psqls
$psql_primaryA->quit;
$psql_primaryB->quit;

$node_replica->stop();
$node_primary->stop();

done_testing();