mirror of https://github.com/postgres/postgres
Commits 4eb21763 and b74e94dc introduced a way to force every backend to close all relation files, to fix an ancient Windows-only bug. This commit extends that behavior to all operating systems and adds a couple of extra barrier points, to fix a totally different class of bug: the reuse of relfilenodes in scenarios that have no other kind of cache invalidation to prevent file descriptor mix-ups. In all releases, data corruption could occur when you moved a database to another tablespace and then back again. Despite that, no back-patch for now as the infrastructure required is too new and invasive. In master only, since commit aa010514, it could also happen when using CREATE DATABASE with a user-supplied OID or via pg_upgrade. Author: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Thomas Munro <thomas.munro@gmail.com> Discussion: https://postgr.es/m/20220209220004.kb3dgtn2x2k2gtdm%40alap3.anarazel.de
parent
b74e94dc27
commit
e2f65f4255
@ -0,0 +1,233 @@ |
|||||||
|
use strict; |
||||||
|
use warnings; |
||||||
|
use PostgreSQL::Test::Cluster; |
||||||
|
use PostgreSQL::Test::Utils; |
||||||
|
use Test::More; |
||||||
|
use File::Basename; |
||||||
|
|
||||||
|
|
||||||
|
# Set up the primary.  The extra settings are appended to postgresql.conf
# before the node is started for the first time.
# NOTE(review): the tiny shared_buffers presumably exists to make buffer
# eviction easy to provoke later in the test -- confirm.
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
$node_primary->init(allows_streaming => 1);
$node_primary->append_conf(
	'postgresql.conf', q[
allow_in_place_tablespaces = true
log_connections=on
# to avoid "repairing" corruption
full_page_writes=off
log_min_messages=debug2
autovacuum_naptime=1s
shared_buffers=1MB
]);
$node_primary->start;

# Take a base backup of the primary and build a streaming standby from it.
my $backup_name = 'my_backup';
$node_primary->backup($backup_name);

my $node_standby = PostgreSQL::Test::Cluster->new('standby');
$node_standby->init_from_backup(
	$node_primary, $backup_name,
	has_streaming => 1);
$node_standby->start;
||||||
|
|
||||||
|
# A single shared timer guards every wait on the interactive psql sessions
# below.  It is deliberately generous: it should never fire, even on very
# slow machines, unless something is really wrong.
my $psql_timeout = IPC::Run::timer(300);

# One long-lived psql per node, each driven through in-memory
# stdin/stdout/stderr buffers kept in a hash next to the IPC::Run handle
# (key "run").  The primary session is started first, then the standby one.
my (%psql_primary, %psql_standby);
for my $session (
	[ \%psql_primary, $node_primary ],
	[ \%psql_standby, $node_standby ])
{
	my ($psql, $node) = @$session;

	%$psql = (stdin => '', stdout => '', stderr => '');
	$psql->{run} = IPC::Run::start(
		[ 'psql', '-XA', '-f', '-', '-d', $node->connstr('postgres') ],
		'<',
		\$psql->{stdin},
		'>',
		\$psql->{stdout},
		'2>',
		\$psql->{stderr},
		$psql_timeout);
}
||||||
|
|
||||||
|
|
||||||
|
# Create a template database holding a table that later steps update in
# order to dirty buffers.  Cloning from a template with preexisting rows makes
# the problem a bit easier to reproduce, because no cache invalidations are
# generated.
$node_primary->safe_psql('postgres',
	'CREATE DATABASE conflict_db_template OID = 50000;');
$node_primary->safe_psql(
	'conflict_db_template', q[
CREATE TABLE large(id serial primary key, dataa text, datab text);
INSERT INTO large(dataa, datab) SELECT g.i::text, 1 FROM generate_series(1, 4000) g(i);]);
$node_primary->safe_psql('postgres',
	'CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;');

# Objects in the "postgres" database that the background sessions read via
# pg_prewarm to push other buffers out.
$node_primary->safe_psql(
	'postgres', q[
CREATE EXTENSION pg_prewarm;
CREATE TABLE replace_sb(data text);
INSERT INTO replace_sb(data) SELECT random()::text FROM generate_series(1, 15000);]);

# Use longrunning transactions, so that AtEOXact_SMgr doesn't close files.
# Primary session first, then standby.
for my $psql (\%psql_primary, \%psql_standby)
{
	send_query_and_wait($psql, q[BEGIN;], qr/BEGIN/m);
}
||||||
|
|
||||||
|
# Cause lots of dirty rows in shared_buffers
$node_primary->safe_psql('conflict_db', 'UPDATE large SET datab = 1;');

# Now do a bunch of work in another database.  That will end up needing to
# write back dirty data from the previous step, opening the relevant file
# descriptors
cause_eviction(\%psql_primary, \%psql_standby);

# Drop the database and recreate it under the same OID.
$node_primary->safe_psql('postgres', 'DROP DATABASE conflict_db;');
$node_primary->safe_psql('postgres',
	'CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;');

verify($node_primary, $node_standby, 1, 'initial contents as expected');

# Again cause lots of dirty rows in shared_buffers, but use a different update
# value so we can check everything is OK
$node_primary->safe_psql('conflict_db', 'UPDATE large SET datab = 2;');

# Again cause a lot of IO.  That'll again write back dirty data.
# NOTE(review): the original comment here said this "uses (XXX adjust after
# bugfix) the already opened file descriptor" and carried a FIXME.  That
# presumably describes the pre-fix behaviour -- confirm and reword.
cause_eviction(\%psql_primary, \%psql_standby);

verify($node_primary, $node_standby, 2,
	'update to reused relfilenode (due to DB oid conflict) is not lost');

# Rewrite the table, update it once more and check both nodes again.
$node_primary->safe_psql('conflict_db', 'VACUUM FULL large;');
$node_primary->safe_psql('conflict_db', 'UPDATE large SET datab = 3;');

verify($node_primary, $node_standby, 3, 'restored contents as expected');
||||||
|
|
||||||
|
# Test for old filehandles after moving a database in / out of tablespace.
# NOTE(review): the empty LOCATION presumably relies on
# allow_in_place_tablespaces being enabled on the primary -- confirm.
$node_primary->safe_psql('postgres',
	q[CREATE TABLESPACE test_tablespace LOCATION '']);

# cause dirty buffers
$node_primary->safe_psql('conflict_db', 'UPDATE large SET datab = 4;');

# cause files to be opened in backend in other database
cause_eviction(\%psql_primary, \%psql_standby);

# move database back / forth
$node_primary->safe_psql('postgres',
	'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace');
$node_primary->safe_psql('postgres',
	'ALTER DATABASE conflict_db SET TABLESPACE pg_default');

# cause dirty buffers
$node_primary->safe_psql('conflict_db', 'UPDATE large SET datab = 5;');
cause_eviction(\%psql_primary, \%psql_standby);

verify($node_primary, $node_standby, 5, 'post move contents as expected');

# Move the database into the tablespace once more, dirty and evict buffers,
# then drop both the database and the tablespace.
$node_primary->safe_psql('postgres',
	'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace');

$node_primary->safe_psql('conflict_db', 'UPDATE large SET datab = 7;');
cause_eviction(\%psql_primary, \%psql_standby);
$node_primary->safe_psql('conflict_db', 'UPDATE large SET datab = 8;');
$node_primary->safe_psql('postgres', 'DROP DATABASE conflict_db');
$node_primary->safe_psql('postgres', 'DROP TABLESPACE test_tablespace');

$node_primary->safe_psql('postgres', 'REINDEX TABLE pg_database');
||||||
|
|
||||||
|
|
||||||
|
# Explicitly shut down the psql instances gracefully - to avoid hangs or
# worse on windows.  Primary session first, then standby.
for my $psql (\%psql_primary, \%psql_standby)
{
	$psql->{stdin} .= "\\q\n";
	$psql->{run}->finish;
}

$node_primary->stop();
$node_standby->stop();

# Make sure that there weren't crashes during shutdown
command_like(
	[ 'pg_controldata', $node_primary->data_dir ],
	qr/Database cluster state:\s+shut down\n/,
	'primary shut down ok');
command_like(
	[ 'pg_controldata', $node_standby->data_dir ],
	qr/Database cluster state:\s+shut down in recovery\n/,
	'standby shut down ok');

done_testing();
||||||
|
|
||||||
|
# Check that every row of conflict_db's "large" table carries the expected
# datab value, first on the primary and then on the standby.
#
#   $primary, $standby - node objects; safe_psql is called on both and
#                        wait_for_catchup on the primary
#   $counter           - datab value all 4000 rows are expected to have
#   $message           - test description, prefixed with the node's role
#
# Emits two Test::More checks and returns the result of the standby one.
sub verify
{
	my ($primary, $standby, $counter, $message) = @_;

	# A single "value|count" output row means all rows agree on datab.
	my $expected = "$counter|4000";
	my $query =
	  "SELECT datab, count(*) FROM large GROUP BY 1 ORDER BY 1 LIMIT 10";

	is($primary->safe_psql('conflict_db', $query),
		$expected, "primary: $message");

	# Wait for the standby to catch up before querying it.
	$primary->wait_for_catchup($standby);

	return is($standby->safe_psql('conflict_db', $query),
		$expected, "standby: $message");
}
||||||
|
|
||||||
|
# Run pg_prewarm over every pg_class entry that has a nonzero relfilenode,
# first in the primary's background psql session and then in the standby's.
#
#   $psql_primary, $psql_standby - session hashes as expected by
#                                  send_query_and_wait
#
# Waits for each session to print the "warmed_buffers" column header before
# moving on.  Returns the result of the last send_query_and_wait call.
sub cause_eviction
{
	my ($psql_primary, $psql_standby) = @_;

	my $prewarm_query =
	  q[SELECT SUM(pg_prewarm(oid)) warmed_buffers FROM pg_class WHERE pg_relation_filenode(oid) != 0;];

	my $done;
	for my $session ($psql_primary, $psql_standby)
	{
		$done =
		  send_query_and_wait($session, $prewarm_query, qr/warmed_buffers/m);
	}

	return $done;
}
||||||
|
|
||||||
|
# Send a query to a background psql session and wait until its accumulated
# stdout matches a pattern.
#
#   $psql  - hash reference with the keys "stdin" and "stdout" (the buffers
#            attached to the session) and "run" (its IPC::Run handle)
#   $query - text appended to the session's stdin, followed by a newline
#   $until - pattern to look for in the session's stdout
#
# Returns 1 once the pattern matched; the stdout buffer is emptied first so
# the next call starts from a clean slate.  If the file-level $psql_timeout
# timer has expired, or the psql process can no longer be pumped, the whole
# test run is aborted with BAIL_OUT, which does not return.
sub send_query_and_wait
{
	my ($psql, $query, $until) = @_;

	# Both failure paths report the same diagnostics; only the reason differs.
	my $bail = sub {
		my ($reason) = @_;
		BAIL_OUT("aborting wait: $reason\n"
			  . "stream contents: >>$psql->{stdout}<<\n"
			  . "pattern searched for: $until\n");
	};

	# send query
	$psql->{stdin} .= $query;
	$psql->{stdin} .= "\n";

	# wait for query results; start with a non-blocking pump so output that
	# is already available gets picked up before the first check
	$psql->{run}->pump_nb();
	until ($psql->{stdout} =~ /$until/)
	{
		$bail->('program timed out') if $psql_timeout->is_expired;
		$bail->('program died') unless $psql->{run}->pumpable();

		$psql->{run}->pump();
	}

	$psql->{stdout} = '';

	return 1;
}
||||||
Loading…
Reference in new issue