mirror of https://github.com/postgres/postgres
This allows data checksums to be enabled, or disabled, in a running cluster without restricting access to the cluster during processing. Prior to this, data checksums could only be enabled during initdb or when the cluster is offline using the pg_checksums app. This commit introduces functionality to enable, or disable, data checksums while the cluster is running regardless of how it was initialized. A background worker launcher process is responsible for launching a dynamic per-database background worker which will mark all buffers dirty for all relations with storage in order for them to have data checksums calculated on write. Once all relations in all databases have been processed, the data_checksums state will be set to on and the cluster will at that point be identical to one which had data checksums enabled during initialization or via offline processing. When data checksums are being enabled, concurrent I/O operations from backends other than the data checksums worker will write the checksums but not verify them on reading. Only when all backends have absorbed the procsignalbarrier for setting data_checksums to on will they also start verifying checksums on reading. The same process is repeated during disabling; all backends write checksums but do not verify them until the barrier for setting the state to off has been absorbed by all. This in-progress state is used to ensure there are no false negatives (or positives) due to reading a checksum which is not in sync with the page. A new test module, test_checksums, is introduced with an extensive set of tests covering both online and offline data checksum mode changes. The tests which run concurrent pgbench during online processing are gated behind the PG_TEST_EXTRA flag due to being very expensive to run. Two levels of PG_TEST_EXTRA flags exist to turn on a subset of the expensive tests, or the full suite of multiple runs. 
This work is based on an earlier version of this patch which was reviewed by, among others, Heikki Linnakangas, Robert Haas, Andres Freund, Tomas Vondra, Michael Banck and Andrey Borodin. During the work on this new version, Tomas Vondra has given invaluable assistance with not only coding and reviewing but very in-depth testing. Author: Daniel Gustafsson <daniel@yesql.se> Author: Magnus Hagander <magnus@hagander.net> Co-authored-by: Tomas Vondra <tomas@vondra.me> Reviewed-by: Tomas Vondra <tomas@vondra.me> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi> Discussion: https://postgr.es/m/CABUevExz9hUUOLnJVr2kpw9Cx=o4MCr1SVKwbupzuxP7ckNutA@mail.gmail.com Discussion: https://postgr.es/m/20181030051643.elbxjww5jjgnjaxg@alap3.anarazel.de Discussion: https://postgr.es/m/CABUevEwE3urLtwxxqdgd5O2oQz9J717ZzMbh+ziCSa5YLLU_BA@mail.gmail.com
master
parent
8261ee24fe
commit
f19c0eccae
@ -0,0 +1,14 @@ |
||||
digraph G { |
||||
A -> B [label="SELECT pg_enable_data_checksums()"]; |
||||
B -> C; |
||||
D -> A; |
||||
C -> D [label="SELECT pg_disable_data_checksums()"]; |
||||
E -> A [label=" --no-data-checksums"]; |
||||
E -> C [label=" --data-checksums"]; |
||||
|
||||
A [label="off"]; |
||||
B [label="inprogress-on"]; |
||||
C [label="on"]; |
||||
D [label="inprogress-off"]; |
||||
E [label="initdb"]; |
||||
} |
||||
|
After Width: | Height: | Size: 4.2 KiB |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,58 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* datachecksum_state.h |
||||
* header file for data checksum helper background worker and data |
||||
* checksum state manipulation |
||||
* |
||||
* |
||||
* Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* src/include/postmaster/datachecksum_state.h |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#ifndef DATACHECKSUM_STATE_H |
||||
#define DATACHECKSUM_STATE_H |
||||
|
||||
#include "storage/procsignal.h" |
||||
|
||||
/* Shared memory */ |
||||
extern Size DataChecksumsShmemSize(void); |
||||
extern void DataChecksumsShmemInit(void); |
||||
|
||||
/*
 * Possible operations the datachecksumsworker can perform.  The value is
 * passed to StartDataChecksumsWorkerLauncher() to select whether the
 * background processes are started for enabling or for disabling checksums.
 */
typedef enum DataChecksumsWorkerOperation
{
	ENABLE_DATACHECKSUMS,
	DISABLE_DATACHECKSUMS,
} DataChecksumsWorkerOperation;
||||
|
||||
/*
 * Possible states for a database entry which has been processed.  Exported
 * here since we want to be able to reference this from injection point tests.
 *
 * The enum carries a tag, like DataChecksumsWorkerOperation, so that it can
 * also be referred to as "enum DataChecksumsWorkerResult".  The first value
 * is pinned to zero.  NOTE(review): the exact conditions under which each
 * value is reported are decided by the worker implementation, which is not
 * visible from this header.
 */
typedef enum DataChecksumsWorkerResult
{
	DATACHECKSUMSWORKER_SUCCESSFUL = 0,
	DATACHECKSUMSWORKER_ABORTED,
	DATACHECKSUMSWORKER_FAILED,
	DATACHECKSUMSWORKER_DROPDB,
} DataChecksumsWorkerResult;
||||
|
||||
/* Prototypes for data checksum state manipulation */ |
||||
bool AbsorbDataChecksumsBarrier(ProcSignalBarrierType target_state); |
||||
void EmitAndWaitDataChecksumsBarrier(uint32 state); |
||||
|
||||
/* Prototypes for data checksum background worker */ |
||||
|
||||
/* Start the background processes for enabling or disabling checksums */ |
||||
void StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op, |
||||
int cost_delay, |
||||
int cost_limit); |
||||
|
||||
/* Background worker entrypoints */ |
||||
void DataChecksumsWorkerLauncherMain(Datum arg); |
||||
void DataChecksumsWorkerMain(Datum arg); |
||||
|
||||
#endif /* DATACHECKSUM_STATE_H */ |
||||
@ -0,0 +1,2 @@ |
||||
# Generated by test suite |
||||
/tmp_check/ |
||||
@ -0,0 +1,40 @@ |
||||
#-------------------------------------------------------------------------
|
||||
#
|
||||
# Makefile for src/test/modules/test_checksums
|
||||
#
|
||||
# Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
|
||||
# Portions Copyright (c) 1994, Regents of the University of California
|
||||
#
|
||||
# src/test/modules/test_checksums/Makefile
|
||||
#
|
||||
#-------------------------------------------------------------------------
|
||||
|
||||
EXTRA_INSTALL = src/test/modules/injection_points
|
||||
|
||||
export enable_injection_points |
||||
|
||||
MODULE_big = test_checksums
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
test_checksums.o
|
||||
PGFILEDESC = "test_checksums - test code for data checksums"
|
||||
|
||||
EXTENSION = test_checksums
|
||||
DATA = test_checksums--1.0.sql
|
||||
|
||||
ifdef USE_PGXS |
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS) |
||||
else |
||||
subdir = src/test/modules/test_checksums
|
||||
top_builddir = ../../../..
|
||||
include $(top_builddir)/src/Makefile.global |
||||
include $(top_srcdir)/contrib/contrib-global.mk |
||||
endif |
||||
|
||||
check: |
||||
$(prove_check)
|
||||
|
||||
installcheck: |
||||
$(prove_installcheck)
|
||||
@ -0,0 +1,30 @@ |
||||
src/test/modules/test_checksums/README |
||||
|
||||
Regression tests for data checksums |
||||
=================================== |
||||
This directory contains a test suite for enabling, and disabling, data |
||||
checksums both offline as well as in a running cluster. |
||||
|
||||
Running the tests with autoconf |
||||
=============================== |
||||
|
||||
make check |
||||
|
||||
or |
||||
|
||||
make installcheck |
||||
|
||||
Running the tests with meson |
||||
============================ |
||||
From your build directory, issue the following command: |
||||
|
||||
meson test -q --print-errorlogs --suite setup --suite test_checksums |
||||
|
||||
NOTE: This creates a temporary installation (in the case of "make check" or |
||||
"--suite setup"), with multiple nodes, be they master or standby(s) for the |
||||
purpose of the tests. |
||||
|
||||
NOTE: This test suite requires TAP tests to be enabled; a subset of the tests |
||||
also requires injection points to function. In order to run the extended tests, |
||||
"checksum_extended" must be set in the PG_TEST_EXTRA environment |
||||
variable. |
||||
@ -0,0 +1,38 @@ |
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
test_checksums_sources = files( |
||||
'test_checksums.c', |
||||
) |
||||
|
||||
test_checksums = shared_module('test_checksums', |
||||
test_checksums_sources, |
||||
kwargs: pg_test_mod_args, |
||||
) |
||||
test_install_libs += test_checksums |
||||
|
||||
test_install_data += files( |
||||
'test_checksums.control', |
||||
'test_checksums--1.0.sql', |
||||
) |
||||
|
||||
tests += { |
||||
'name': 'test_checksums', |
||||
'sd': meson.current_source_dir(), |
||||
'bd': meson.current_build_dir(), |
||||
'tap': { |
||||
'env': { |
||||
'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', |
||||
}, |
||||
'tests': [ |
||||
't/001_basic.pl', |
||||
't/002_restarts.pl', |
||||
't/003_standby_restarts.pl', |
||||
't/004_offline.pl', |
||||
't/005_injection.pl', |
||||
't/006_pgbench_single.pl', |
||||
't/007_pgbench_standby.pl', |
||||
't/008_pitr.pl', |
||||
't/009_fpi.pl', |
||||
], |
||||
}, |
||||
} |
||||
@ -0,0 +1,63 @@ |
||||
|
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
# Test suite for testing enabling data checksums in an online cluster |
||||
use strict; |
||||
use warnings FATAL => 'all'; |
||||
|
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
use FindBin; |
||||
use lib $FindBin::RealBin; |
||||
|
||||
use DataChecksums::Utils; |
||||
|
||||
# Initialize node with checksums disabled. |
||||
my $node = PostgreSQL::Test::Cluster->new('basic_node'); |
||||
$node->init(no_data_checksums => 1); |
||||
$node->start; |
||||
|
||||
# Create some content to have un-checksummed data in the cluster |
||||
$node->safe_psql('postgres', |
||||
"CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); |
||||
|
||||
# Ensure that checksums are turned off |
||||
test_checksum_state($node, 'off'); |
||||
|
||||
# Enable data checksums and wait for the state transition to 'on' |
||||
enable_data_checksums($node, wait => 'on'); |
||||
|
||||
# Run a dummy query just to make sure we can read back data |
||||
my $result = |
||||
$node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1 "); |
||||
is($result, '9999', 'ensure checksummed pages can be read back'); |
||||
|
||||
# Enable data checksums again which should be a no-op so we explicitly don't |
||||
# wait for any state transition as none should happen here |
||||
enable_data_checksums($node); |
||||
test_checksum_state($node, 'on'); |
||||
# ..and make sure we can still read/write data |
||||
$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); |
||||
$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); |
||||
is($result, '10000', 'ensure checksummed pages can be read back'); |
||||
|
||||
# Disable checksums again and wait for the state transition |
||||
disable_data_checksums($node, wait => 1); |
||||
|
||||
# Test reading data again |
||||
$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); |
||||
is($result, '10000', 'ensure previously checksummed pages can be read back'); |
||||
|
||||
# Re-enable checksums and make sure that the underlying data has changed to |
||||
# ensure that checksums will be different. |
||||
$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); |
||||
enable_data_checksums($node, wait => 'on'); |
||||
|
||||
# Run a dummy query just to make sure we can read back the data |
||||
$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); |
||||
is($result, '10000', 'ensure checksummed pages can be read back'); |
||||
|
||||
$node->stop; |
||||
done_testing(); |
||||
@ -0,0 +1,110 @@ |
||||
|
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
# Test suite for testing enabling data checksums in an online cluster with a |
||||
# restart which breaks processing. |
||||
use strict; |
||||
use warnings FATAL => 'all'; |
||||
|
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
use FindBin; |
||||
use lib $FindBin::RealBin; |
||||
|
||||
use DataChecksums::Utils; |
||||
|
||||
# Initialize node with checksums disabled. |
||||
my $node = PostgreSQL::Test::Cluster->new('restarts_node'); |
||||
$node->init(no_data_checksums => 1); |
||||
$node->start; |
||||
|
||||
# Initialize result storage for queries |
||||
my $result; |
||||
|
||||
# Create some content to have un-checksummed data in the cluster |
||||
$node->safe_psql('postgres', |
||||
"CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); |
||||
|
||||
# Ensure that checksums are disabled |
||||
test_checksum_state($node, 'off'); |
||||
|
||||
SKIP: |
||||
{ |
||||
skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 6 |
||||
if (!$ENV{PG_TEST_EXTRA} |
||||
|| $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/); |
||||
|
||||
# Create a barrier for checksum enablement to block on, in this case a pre- |
||||
# existing temporary table which is kept open while processing is started. |
||||
# We can accomplish this by setting up an interactive psql process which |
||||
# keeps the temporary table created as we enable checksums in another psql |
||||
# process. |
||||
# |
||||
# This is a similar test to the synthetic variant in 005_injection.pl |
||||
# which fakes this scenario. |
||||
my $bsession = $node->background_psql('postgres'); |
||||
$bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);'); |
||||
|
||||
# In another session, make sure we can see the blocking temp table but |
||||
# start processing anyways and check that we are blocked with a proper |
||||
# wait event. |
||||
$result = $node->safe_psql('postgres', |
||||
"SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';" |
||||
); |
||||
is($result, 't', 'ensure we can see the temporary table'); |
||||
|
||||
# Enabling data checksums shouldn't work as the process is blocked on the |
||||
# temporary table held open by $bsession. Ensure that we reach inprogress- |
||||
# on before we do more tests. |
||||
enable_data_checksums($node, wait => 'inprogress-on'); |
||||
|
||||
# Wait for processing to finish and the worker waiting for leftover temp |
||||
# relations to be able to actually finish |
||||
$result = $node->poll_query_until( |
||||
'postgres', |
||||
"SELECT wait_event FROM pg_catalog.pg_stat_activity " |
||||
. "WHERE backend_type = 'datachecksum worker';", |
||||
'ChecksumEnableTemptableWait'); |
||||
|
||||
# The datachecksumsworker waits for temporary tables to disappear for 3 |
||||
# seconds before retrying, so sleep for 4 seconds to be guaranteed to see |
||||
# a retry cycle |
||||
sleep(4); |
||||
|
||||
# Re-check the wait event to ensure we are blocked on the right thing. |
||||
$result = $node->safe_psql('postgres', |
||||
"SELECT wait_event FROM pg_catalog.pg_stat_activity " |
||||
. "WHERE backend_type = 'datachecksum worker';"); |
||||
is($result, 'ChecksumEnableTemptableWait', |
||||
'ensure the correct wait condition is set'); |
||||
test_checksum_state($node, 'inprogress-on'); |
||||
|
||||
# Stop the cluster while bsession is still attached. We can't close the |
||||
# session first since the brief period between closing and stopping might |
||||
# be enough for checksums to get enabled. |
||||
$node->stop; |
||||
$bsession->quit; |
||||
$node->start; |
||||
|
||||
# Ensure the checksums aren't enabled across the restart. This leaves the |
||||
# cluster in the same state as before we entered the SKIP block. |
||||
test_checksum_state($node, 'off'); |
||||
} |
||||
|
||||
enable_data_checksums($node, wait => 'on'); |
||||
|
||||
$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); |
||||
is($result, '9999', 'ensure checksummed pages can be read back'); |
||||
|
||||
$result = $node->poll_query_until( |
||||
'postgres', |
||||
"SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksum%';", |
||||
'0'); |
||||
is($result, 1, 'await datachecksums worker/launcher termination'); |
||||
|
||||
disable_data_checksums($node, wait => 1); |
||||
|
||||
$node->stop; |
||||
done_testing(); |
||||
@ -0,0 +1,114 @@ |
||||
|
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
# Test suite for testing enabling data checksums in an online cluster with |
||||
# streaming replication |
||||
use strict; |
||||
use warnings FATAL => 'all'; |
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
use FindBin; |
||||
use lib $FindBin::RealBin; |
||||
|
||||
use DataChecksums::Utils; |
||||
|
||||
# Initialize primary node |
||||
my $node_primary = PostgreSQL::Test::Cluster->new('standby_restarts_primary'); |
||||
$node_primary->init(allows_streaming => 1, no_data_checksums => 1); |
||||
$node_primary->start; |
||||
|
||||
my $slotname = 'physical_slot'; |
||||
$node_primary->safe_psql('postgres', |
||||
"SELECT pg_create_physical_replication_slot('$slotname')"); |
||||
|
||||
# Take backup |
||||
my $backup_name = 'my_backup'; |
||||
$node_primary->backup($backup_name); |
||||
|
||||
# Create streaming standby linking to primary |
||||
my $node_standby = PostgreSQL::Test::Cluster->new('standby_restarts_standby'); |
||||
$node_standby->init_from_backup($node_primary, $backup_name, |
||||
has_streaming => 1); |
||||
$node_standby->append_conf( |
||||
'postgresql.conf', qq[ |
||||
primary_slot_name = '$slotname' |
||||
]); |
||||
$node_standby->start; |
||||
|
||||
# Create some content on the primary to have un-checksummed data in the cluster |
||||
$node_primary->safe_psql('postgres', |
||||
"CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); |
||||
|
||||
# Wait for standbys to catch up |
||||
$node_primary->wait_for_catchup($node_standby, 'replay', |
||||
$node_primary->lsn('insert')); |
||||
|
||||
# Check that checksums are turned off on all nodes |
||||
test_checksum_state($node_primary, 'off'); |
||||
test_checksum_state($node_standby, 'off'); |
||||
|
||||
# --------------------------------------------------------------------------- |
||||
# Enable checksums for the cluster, and make sure that both the primary and |
||||
# standby change state. |
||||
# |
||||
|
||||
# Ensure that the primary switches to "inprogress-on" |
||||
enable_data_checksums($node_primary, wait => 'inprogress-on'); |
||||
# Wait for checksum enable to be replayed |
||||
$node_primary->wait_for_catchup($node_standby, 'replay'); |
||||
|
||||
# Ensure that the standby has switched to "inprogress-on" or "on". Normally it |
||||
# would be "inprogress-on", but it is theoretically possible for the primary to |
||||
# complete the checksum enabling *and* have the standby replay that record |
||||
# before we reach the check below. |
||||
my $result = $node_standby->poll_query_until( |
||||
'postgres', |
||||
"SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", |
||||
'f'); |
||||
is($result, 1, 'ensure standby has absorbed the inprogress-on barrier'); |
||||
$result = $node_standby->safe_psql('postgres', |
||||
"SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" |
||||
); |
||||
|
||||
is(($result eq 'inprogress-on' || $result eq 'on'), |
||||
1, 'ensure checksums are on, or in progress, on standby_1'); |
||||
|
||||
# Insert some more data which should be checksummed on INSERT |
||||
$node_primary->safe_psql('postgres', |
||||
"INSERT INTO t VALUES (generate_series(1, 10000));"); |
||||
|
||||
# Wait for checksums enabled on the primary and standby |
||||
wait_for_checksum_state($node_primary, 'on'); |
||||
wait_for_checksum_state($node_standby, 'on'); |
||||
|
||||
$result = |
||||
$node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1"); |
||||
is($result, '19998', 'ensure we can safely read all data with checksums'); |
||||
|
||||
$result = $node_primary->poll_query_until( |
||||
'postgres', |
||||
"SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksum%';", |
||||
'0'); |
||||
is($result, 1, 'await datachecksums worker/launcher termination'); |
||||
|
||||
# |
||||
# Disable checksums and ensure it's propagated to standby and that we can |
||||
# still read all data |
||||
# |
||||
|
||||
# Disable checksums and wait for the operation to be replayed |
||||
disable_data_checksums($node_primary); |
||||
$node_primary->wait_for_catchup($node_standby, 'replay'); |
||||
# Ensure that the primary and standby have switched to off |
||||
wait_for_checksum_state($node_primary, 'off'); |
||||
wait_for_checksum_state($node_standby, 'off'); |
||||
# Doublecheck reading data without errors |
||||
$result = |
||||
$node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1"); |
||||
is($result, "19998", 'ensure we can safely read all data without checksums'); |
||||
|
||||
$node_standby->stop; |
||||
$node_primary->stop; |
||||
done_testing(); |
||||
@ -0,0 +1,82 @@ |
||||
|
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
# Test suite for testing enabling data checksums offline from various states |
||||
# of checksum processing |
||||
use strict; |
||||
use warnings FATAL => 'all'; |
||||
|
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
use FindBin; |
||||
use lib $FindBin::RealBin; |
||||
|
||||
use DataChecksums::Utils; |
||||
|
||||
# Initialize node with checksums disabled. |
||||
my $node = PostgreSQL::Test::Cluster->new('offline_node'); |
||||
$node->init(no_data_checksums => 1); |
||||
$node->start; |
||||
|
||||
# Create some content to have un-checksummed data in the cluster |
||||
$node->safe_psql('postgres', |
||||
"CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); |
||||
|
||||
# Ensure that checksums are disabled |
||||
test_checksum_state($node, 'off'); |
||||
|
||||
# Enable checksums offline using pg_checksums |
||||
$node->stop; |
||||
$node->checksum_enable_offline; |
||||
$node->start; |
||||
|
||||
# Ensure that checksums are enabled |
||||
test_checksum_state($node, 'on'); |
||||
|
||||
# Run a dummy query just to make sure we can read back some data |
||||
my $result = |
||||
$node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); |
||||
is($result, '9999', 'ensure checksummed pages can be read back'); |
||||
|
||||
# Disable checksums offline again using pg_checksums |
||||
$node->stop; |
||||
$node->checksum_disable_offline; |
||||
$node->start; |
||||
|
||||
# Ensure that checksums are disabled |
||||
test_checksum_state($node, 'off'); |
||||
|
||||
# Create a barrier for checksum enablement to block on, in this case a pre- |
||||
# existing temporary table which is kept open while processing is started. We |
||||
# can accomplish this by setting up an interactive psql process which keeps the |
||||
# temporary table created as we enable checksums in another psql process. |
||||
|
||||
my $bsession = $node->background_psql('postgres'); |
||||
$bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);'); |
||||
|
||||
# In another session, make sure we can see the blocking temp table but start |
||||
# processing anyways and check that we are blocked with a proper wait event. |
||||
$result = $node->safe_psql('postgres', |
||||
"SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"); |
||||
is($result, 't', 'ensure we can see the temporary table'); |
||||
|
||||
enable_data_checksums($node, wait => 'inprogress-on'); |
||||
|
||||
# Turn the cluster off and enable checksums offline, then start back up |
||||
$bsession->quit; |
||||
$node->stop; |
||||
$node->checksum_enable_offline; |
||||
$node->start; |
||||
|
||||
# Ensure that checksums are now enabled even though processing wasn't |
||||
# restarted |
||||
test_checksum_state($node, 'on'); |
||||
|
||||
# Run a dummy query just to make sure we can read back some data |
||||
$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); |
||||
is($result, '9999', 'ensure checksummed pages can be read back'); |
||||
|
||||
$node->stop; |
||||
done_testing(); |
||||
@ -0,0 +1,74 @@ |
||||
|
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
# Test suite for testing enabling data checksums in an online cluster with |
||||
# injection point tests injecting failures into the processing |
||||
|
||||
use strict; |
||||
use warnings FATAL => 'all'; |
||||
|
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
use FindBin; |
||||
use lib $FindBin::RealBin; |
||||
|
||||
use DataChecksums::Utils; |
||||
|
||||
if ($ENV{enable_injection_points} ne 'yes') |
||||
{ |
||||
plan skip_all => 'Injection points not supported by this build'; |
||||
} |
||||
|
||||
# --------------------------------------------------------------------------- |
||||
# Test cluster setup |
||||
# |
||||
|
||||
# Initialize the test cluster |
||||
my $node = PostgreSQL::Test::Cluster->new('injection_node'); |
||||
$node->init(no_data_checksums => 1); |
||||
$node->start; |
||||
|
||||
# Set up test environment |
||||
$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); |
||||
|
||||
# --------------------------------------------------------------------------- |
||||
# Inducing failures and crashes in processing |
||||
|
||||
# Force enabling checksums to fail by marking one of the databases as having |
||||
# failed in processing. |
||||
disable_data_checksums($node, wait => 1); |
||||
$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);'); |
||||
enable_data_checksums($node, wait => 'off'); |
||||
$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);'); |
||||
|
||||
# Make sure that disabling after a failure works |
||||
disable_data_checksums($node); |
||||
test_checksum_state($node, 'off'); |
||||
|
||||
# --------------------------------------------------------------------------- |
||||
# Timing and retry related tests |
||||
# |
||||
|
||||
SKIP: |
||||
{ |
||||
skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 4 |
||||
if (!$ENV{PG_TEST_EXTRA} |
||||
|| $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/); |
||||
|
||||
# Inject a delay in the barrier for enabling checksums |
||||
disable_data_checksums($node, wait => 1); |
||||
$node->safe_psql('postgres', 'SELECT dcw_inject_delay_barrier();'); |
||||
enable_data_checksums($node, wait => 'on'); |
||||
|
||||
# Fake the existence of a temporary table at the start of processing, which |
||||
# will force the processing to wait and retry in order to wait for it to |
||||
# disappear. |
||||
disable_data_checksums($node, wait => 1); |
||||
$node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);'); |
||||
enable_data_checksums($node, wait => 'on'); |
||||
} |
||||
|
||||
$node->stop; |
||||
done_testing(); |
||||
@ -0,0 +1,275 @@ |
||||
|
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
# Test suite for testing enabling data checksums in an online cluster with |
||||
# concurrent activity via pgbench runs |
||||
|
||||
use strict; |
||||
use warnings FATAL => 'all'; |
||||
|
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
use FindBin; |
||||
use lib $FindBin::RealBin; |
||||
|
||||
use DataChecksums::Utils; |
||||
|
||||
# This test suite is expensive, or very expensive, to execute.  There are two
# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite
# and "checksum_extended" for the full suite.  The full suite can run for hours
# on slow or constrained systems.
#
# $extended is set to 1 only for "checksum_extended"; the whole script is
# skipped when neither option is present in PG_TEST_EXTRA.
my $extended = undef;
if ($ENV{PG_TEST_EXTRA})
{
	$extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/);
	plan skip_all => 'Expensive data checksums test disabled'
	  unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/);
}
else
{
	plan skip_all => 'Expensive data checksums test disabled';
}

# The test relies on the test_checksums extension, so skip it unless the build
# reports injection point support.  If enable_injection_points is not set in
# the environment at all, comparing undef with "ne" would die under
# "use warnings FATAL => 'all'" instead of skipping, hence the defined-or.
if (($ENV{enable_injection_points} // '') ne 'yes')
{
	plan skip_all => 'Injection points not supported by this build';
}
||||
|
||||
# The node under test, and the offset into its logfile up to which the log has
# already been scanned for page verification failures.
my $node;
my $node_loglocation = 0;

# The number of full test iterations which will be performed: one in the
# pared-down suite and ten in the extended suite.  The exact number of tests
# performed and the wall time taken is non-deterministic as the test performs
# a lot of randomized actions, but 10 iterations will be a long test run
# regardless.
my $TEST_ITERATIONS = ($extended ? 10 : 1);

# Variables which record the current state of the cluster: the data checksum
# state the test expects, and the handle of the background pgbench (if any).
my $data_checksum_state = 'off';
my $pgbench = undef;
||||
|
||||
# Launch a pgbench run in the background against the server listening on the
# port passed as the only parameter.  The IPC::Run handle of the new run is
# stored in $pgbench.
sub background_rw_pgbench
{
	my ($port) = @_;

	# Finish any pgbench left over from an earlier call before starting anew
	$pgbench->finish if $pgbench;

	# The pared-down suite runs a single client for a couple of seconds. The
	# extended suite runs for ten minutes with a randomized number of clients
	# (1 + int(rand(15)), i.e. between 1 and 15).
	my ($clients, $runtime) =
	  $extended ? (1 + int(rand(15)), 600) : (1, 2);

	my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients);

	# In the extended suite, randomly ask pgbench for a new connection per
	# transaction
	push(@cmd, '-C') if ($extended && cointoss());

	# The database name goes last on the command line
	push(@cmd, 'postgres');

	# All standard streams are attached to /dev/null
	$pgbench = IPC::Run::start(
		\@cmd,
		'<' => '/dev/null',
		'>' => '/dev/null',
		'2>' => '/dev/null',
		IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
}
||||
|
||||
# Toggle data checksums in the cluster: enable them when they are off and
# disable them when they are on.  The state is validated before the change,
# the change is waited for, and $data_checksum_state is updated to match.
sub flip_data_checksums
{
	# Emit the current WAL LSN of the node as a note, prefixed by $prefix
	my $note_lsn = sub {
		my ($prefix) = @_;
		my $lsn =
		  $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
		note($prefix . $lsn . "\n");
	};

	# The cluster must be in the state we believe it to be in
	test_checksum_state($node, $data_checksum_state);

	# Anything but 'on' or 'off' can only be a programmer error made while
	# hacking on the test code; since that might otherwise pass subtly we
	# error out.
	BAIL_OUT(
		'data_checksum_state variable has invalid state:'
		  . $data_checksum_state)
	  unless ($data_checksum_state eq 'off'
		|| $data_checksum_state eq 'on');

	if ($data_checksum_state eq 'off')
	{
		# Randomly inject a retry caused by a (fake) temporary table
		$node->safe_psql('postgres', 'SELECT dcw_fake_temptable();')
		  if cointoss();

		# Record the LSN right before checksums start changing
		$note_lsn->("LSN before enabling: ");

		# The primary must first switch to "inprogress-on" ...
		enable_data_checksums($node, wait => 'inprogress-on');

		random_sleep() if ($extended);

		# ... and must then end up with checksums enabled
		wait_for_checksum_state($node, 'on');

		# Record the LSN right after the primary flipped to "on"
		$note_lsn->("LSN after enabling: ");

		random_sleep() if ($extended);

		# Turn the fake temporary table off again
		$node->safe_psql('postgres', 'SELECT dcw_fake_temptable(false);');
		$data_checksum_state = 'on';
	}
	else
	{
		random_sleep() if ($extended);

		# Record the LSN right before checksums start changing
		$note_lsn->("LSN before disabling: ");

		disable_data_checksums($node);

		# Checksums must end up disabled on the primary
		wait_for_checksum_state($node, 'off');

		# Record the LSN right after the primary flipped to "off"
		$note_lsn->("LSN after disabling: ");

		random_sleep() if ($extended);

		$data_checksum_state = 'off';
	}
}
||||
|
||||
# Set up a single-node cluster which is initialized without data checksums
$node = PostgreSQL::Test::Cluster->new('pgbench_single_main');
$node->init(allows_streaming => 1, no_data_checksums => 1);

# Raise max_connections to leave room for the pgbench clients, and turn
# statement logging off since it would otherwise produce enormous amounts of
# log output.  Page verification failures are still logged.
my $node_settings = qq[
max_connections = 100
log_statement = none
];
$node->append_conf('postgresql.conf', $node_settings);
$node->start;
$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');

# Populate a table so that the cluster holds data written without checksums
$node->safe_psql('postgres',
	"CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");

# Initialize the pgbench tables, using a larger scale in the extended suite
my $scalefactor = ($extended ? 10 : 1);
my @pgbench_init = (
	'pgbench', '-p', $node->port, '-i',
	'-s', $scalefactor, '-q', 'postgres');
$node->command_ok(\@pgbench_init);

# The test suite starts out with pgbench already running
background_rw_pgbench($node->port);
||||
|
||||
# Main test suite.  Each iteration flips the state of data checksums while
# pgbench is running against the cluster, verifies that the test table can be
# read back, and then randomly stops the node.  A node which was stopped is
# brought back up, with a new pgbench run, at the start of the following
# iteration.  The idea behind doing things randomly is to stress out any
# timing related issues by subjecting the cluster to varied workloads.  A TODO
# is to generate a trace such that any test failure can be traced to its order
# of operations for debugging.

# Scan the part of the node's logfile which has not been inspected yet for
# checksum verification failures, reporting the outcome as test $testname,
# and then remember how far the log has been read.
my $check_node_log = sub {
	my ($testname) = @_;

	# Attribute test failures to the line calling this helper
	local $Test::Builder::Level = $Test::Builder::Level + 1;

	my $unseen = PostgreSQL::Test::Utils::slurp_file($node->logfile,
		$node_loglocation);
	unlike($unseen, qr/page verification failed,.+\d$/, $testname);
	$node_loglocation = -s $node->logfile;
};

foreach my $iteration (1 .. $TEST_ITERATIONS)
{
	note("iteration ", $iteration, " of ", $TEST_ITERATIONS);

	unless ($node->is_alive)
	{
		# Start the node so that it performs recovery, then stop it again
		$node->start;
		$node->stop('fast');

		# Nothing writes to the log while the node is down, so this is a
		# good time to look for checksum verification failures.
		$check_node_log->(
			"no checksum validation errors in primary log (during WAL recovery)"
		);

		# Pick a random WAL size to make checkpoints more or less frequent
		my $wal_size = 64 + int(rand(1024));
		$node->append_conf('postgresql.conf',
			qq[max_wal_size = $wal_size]);
		note("changing max_wal_size to " . $wal_size);

		$node->start;

		# Get a background pgbench going against the primary again
		background_rw_pgbench($node->port);
	}

	$node->safe_psql('postgres', "UPDATE t SET a = a + 1;");

	flip_data_checksums();
	random_sleep() if ($extended);
	my $rowcount =
	  $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
	is($rowcount, '100000', 'ensure data pages can be read back on primary');

	random_sleep() if ($extended);

	# Randomly powercycle the node, using a random stop mode
	if (cointoss())
	{
		$node->stop(stopmode());

		PostgreSQL::Test::Utils::system_log("pg_controldata",
			$node->data_dir);

		$check_node_log->(
			"no checksum validation errors in primary log (outside WAL recovery)"
		);
	}

	random_sleep() if ($extended);
}
||||
|
||||
# The last iteration may have left the node stopped, bring it back if so
$node->start unless $node->is_alive;

# The test run is over: check that the data still reads back as expected and
# that the data checksum state is the one the test believes it to be.
my $final_count =
  $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
is($final_count, '100000', 'ensure data pages can be read back on primary');
test_checksum_state($node, $data_checksum_state);

# Scan the not yet inspected tail of the log one last time for unexpected
# page verification failures.
my $log_tail =
  PostgreSQL::Test::Utils::slurp_file($node->logfile, $node_loglocation);
unlike(
	$log_tail,
	qr/page verification failed,.+\d$/,
	"no checksum validation errors in primary log");
$node_loglocation = -s $node->logfile;

$node->teardown_node;
||||
|
||||
done_testing(); |
||||
@ -0,0 +1,400 @@ |
||||
|
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
# Test suite for testing enabling data checksums in an online cluster, |
||||
# comprising of a primary and a replicated standby, with concurrent activity |
||||
# via pgbench runs |
||||
|
||||
use strict; |
||||
use warnings FATAL => 'all'; |
||||
|
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
use FindBin; |
||||
use lib $FindBin::RealBin; |
||||
|
||||
use DataChecksums::Utils; |
||||
|
||||
# This test suite is expensive, or very expensive, to execute.  There are two
# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite
# and "checksum_extended" for the full suite.  The full suite can run for hours
# on slow or constrained systems.
#
# $extended is set to 1 only for "checksum_extended"; the whole script is
# skipped when neither option is present in PG_TEST_EXTRA.
my $extended = undef;
if ($ENV{PG_TEST_EXTRA})
{
	$extended = 1 if ($ENV{PG_TEST_EXTRA} =~ /\bchecksum_extended\b/);
	plan skip_all => 'Expensive data checksums test disabled'
	  unless ($ENV{PG_TEST_EXTRA} =~ /\bchecksum(_extended)?\b/);
}
else
{
	plan skip_all => 'Expensive data checksums test disabled';
}

# The test relies on the test_checksums extension, so skip it unless the build
# reports injection point support.  If enable_injection_points is not set in
# the environment at all, comparing undef with "ne" would die under
# "use warnings FATAL => 'all'" instead of skipping, hence the defined-or.
if (($ENV{enable_injection_points} // '') ne 'yes')
{
	plan skip_all => 'Injection points not supported by this build';
}
||||
|
||||
# Names of the replication slot and the base backup used to set up the standby
my $node_primary_slot = 'physical_slot';
my $node_primary_backup = 'primary_backup';

# The nodes under test, and for each of them the offset into its logfile up to
# which the log has already been scanned for page verification failures.
my $node_primary;
my $node_primary_loglocation = 0;
my $node_standby;
my $node_standby_loglocation = 0;

# The number of full test iterations which will be performed: one in the
# pared-down suite and five in the extended suite.  The exact number of tests
# performed and the wall time taken is non-deterministic as the test performs
# a lot of randomized actions, but 5 iterations will be a long test run
# regardless.
my $TEST_ITERATIONS = 1;
$TEST_ITERATIONS = 5 if ($extended);

# Variables which record the current state of the cluster
my $data_checksum_state = 'off';

# Handles of the background pgbench runs against each node, if any
my $pgbench_primary = undef;
my $pgbench_standby = undef;
||||
|
||||
# Launch a pgbench run in the background against the server listening on
# $port.  When $standby is true the run is read-only and its handle is kept in
# $pgbench_standby, otherwise the handle is kept in $pgbench_primary.
sub background_pgbench
{
	my ($port, $standby) = @_;
	my $handle = $standby ? \$pgbench_standby : \$pgbench_primary;

	# Finish any pgbench left over from an earlier call for the same node
	${$handle}->finish if ${$handle};

	# The pared-down suite runs a single client for a few seconds. The
	# extended suite runs for ten minutes with a randomized number of clients
	# (1 + int(rand(15)), i.e. between 1 and 15).
	my ($clients, $runtime) = (1, 5);
	($clients, $runtime) = (1 + int(rand(15)), 600) if ($extended);

	my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients);

	# In the extended suite, randomly ask pgbench for a new connection per
	# transaction
	push(@cmd, '-C') if ($extended && cointoss());

	# A standby can only serve a read-only benchmark
	push(@cmd, '-S') if ($standby);

	# The database name goes last on the command line
	push(@cmd, 'postgres');

	# All standard streams are attached to /dev/null
	${$handle} = IPC::Run::start(
		\@cmd,
		'<' => '/dev/null',
		'>' => '/dev/null',
		'2>' => '/dev/null',
		IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
}
||||
|
||||
# Toggle data checksums in the cluster: enable them when they are off and
# disable them when they are on.  The state is validated on both nodes before
# the change, the change is waited for on both nodes, and
# $data_checksum_state is updated to match.
sub flip_data_checksums
{
	# Emit the current WAL LSN of the primary as a note, prefixed by $prefix
	my $note_primary_lsn = sub {
		my ($prefix) = @_;
		my $lsn = $node_primary->safe_psql('postgres',
			"SELECT pg_current_wal_lsn()");
		note($prefix . $lsn . "\n");
	};

	# Both nodes must be in the state we believe the cluster to be in
	test_checksum_state($node_primary, $data_checksum_state);
	test_checksum_state($node_standby, $data_checksum_state);

	# Anything but 'on' or 'off' can only be a programmer error made while
	# hacking on the test code; since that might otherwise pass subtly we
	# error out.
	BAIL_OUT(
		'data_checksum_state variable has invalid state:'
		  . $data_checksum_state)
	  unless ($data_checksum_state eq 'off'
		|| $data_checksum_state eq 'on');

	if ($data_checksum_state eq 'off')
	{
		# Randomly inject a retry caused by a (fake) temporary table
		$node_primary->safe_psql('postgres', 'SELECT dcw_fake_temptable();')
		  if cointoss();

		# Record the LSN right before checksums start changing
		$note_primary_lsn->("LSN before enabling: ");

		# The primary must first switch to "inprogress-on"
		enable_data_checksums($node_primary, wait => 'inprogress-on');

		random_sleep() if ($extended);

		# Let the standby replay the start of checksum enabling
		$node_primary->wait_for_catchup($node_standby, 'replay');

		# The standby must now have left "off" behind.  Normally it would be
		# in "inprogress-on", but it is theoretically possible for the
		# primary to complete the checksum enabling *and* have the standby
		# replay that record before we get to look, so "on" is accepted too.
		my $left_off = $node_standby->poll_query_until(
			'postgres',
			"SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
			'f');
		is($left_off, 1,
			'ensure standby has absorbed the inprogress-on barrier');

		my $standby_state = $node_standby->safe_psql('postgres',
			"SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"
		);
		is( ($standby_state eq 'inprogress-on' || $standby_state eq 'on'),
			1,
			'ensure checksums are on, or in progress, on standby_1');

		# Checksums must end up enabled on the primary ...
		wait_for_checksum_state($node_primary, 'on');

		# Record the LSN right after the primary flipped to "on"
		$note_primary_lsn->("LSN after enabling: ");

		random_sleep() if ($extended);

		# ... as well as on the standby
		wait_for_checksum_state($node_standby, 'on');

		# Turn the fake temporary table off again
		$node_primary->safe_psql('postgres',
			'SELECT dcw_fake_temptable(false);');
		$data_checksum_state = 'on';
	}
	else
	{
		random_sleep() if ($extended);

		# Record the LSN right before checksums start changing
		$note_primary_lsn->("LSN before disabling: ");

		disable_data_checksums($node_primary);
		$node_primary->wait_for_catchup($node_standby, 'replay');

		# Checksums must end up disabled on both the primary and the standby
		wait_for_checksum_state($node_primary, 'off');
		wait_for_checksum_state($node_standby, 'off');

		# Record the LSN right after the primary flipped to "off"
		$note_primary_lsn->("LSN after disabling: ");

		random_sleep() if ($extended);

		# Check the standby once more after the optional sleep
		wait_for_checksum_state($node_standby, 'off');

		$data_checksum_state = 'off';
	}
}
||||
|
||||
# Set up a cluster consisting of one primary and one streaming standby, both
# without data checksums, and make sure the standby has caught up.
$node_primary = PostgreSQL::Test::Cluster->new('pgbench_standby_main');
$node_primary->init(allows_streaming => 1, no_data_checksums => 1);

# Raise max_connections to leave room for the pgbench clients, and turn
# statement logging off since it would otherwise produce enormous amounts of
# log output.  Page verification failures are still logged.
my $primary_settings = qq[
max_connections = 30
log_statement = none
];
$node_primary->append_conf('postgresql.conf', $primary_settings);
$node_primary->start;
$node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');

# Populate a table so that the cluster holds data written without checksums
$node_primary->safe_psql('postgres',
	"CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");

# Create the replication slot for the standby and take the base backup that
# the standby is created from.
$node_primary->safe_psql('postgres',
	"SELECT pg_create_physical_replication_slot('$node_primary_slot');");
$node_primary->backup($node_primary_backup);

$node_standby = PostgreSQL::Test::Cluster->new('pgbench_standby_standby');
$node_standby->init_from_backup($node_primary, $node_primary_backup,
	has_streaming => 1);
my $standby_settings = qq[
primary_slot_name = '$node_primary_slot'
];
$node_standby->append_conf('postgresql.conf', $standby_settings);
$node_standby->start;

# Initialize the pgbench tables on the primary, using a larger scale in the
# extended suite, and wait for them to be replayed on the standby.
my $scalefactor = ($extended ? 10 : 1);
my @pgbench_init = (
	'pgbench', '-p', $node_primary->port, '-i', '-s', $scalefactor, '-q',
	'postgres');
$node_primary->command_ok(\@pgbench_init);
$node_primary->wait_for_catchup($node_standby, 'replay');

# The test suite starts out with pgbench already running on all nodes:
# read-only on the standby, read/write on the primary.
background_pgbench($node_standby->port, 1);
background_pgbench($node_primary->port, 0);
||||
|
||||
# Main test suite.  Each iteration flips the state of data checksums while
# pgbench is running against the cluster, verifies that the test table can be
# read back on the primary, and then (in the extended suite only) randomly
# stops the nodes independently of each other.  A node which was stopped is
# brought back up, with a new pgbench run, at the start of the following
# iteration.  The idea behind doing things randomly is to stress out any
# timing related issues by subjecting the cluster to varied workloads.  A TODO
# is to generate a trace such that any test failure can be traced to its order
# of operations for debugging.
for (my $i = 0; $i < $TEST_ITERATIONS; $i++)
{
	note("iteration ", ($i + 1), " of ", $TEST_ITERATIONS);

	if (!$node_primary->is_alive)
	{
		# start, to do recovery, and stop
		$node_primary->start;
		$node_primary->stop('fast');

		# Since the log isn't being written to now, parse the part of the
		# log not yet inspected and check for instances of checksum
		# verification failures.
		my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile,
			$node_primary_loglocation);
		unlike(
			$log,
			qr/page verification failed,.+\d$/,
			"no checksum validation errors in primary log (during WAL recovery)"
		);
		$node_primary_loglocation = -s $node_primary->logfile;

		# randomize the WAL size, to trigger checkpoints less/more often
		my $sb = 32 + int(rand(960));
		$node_primary->append_conf('postgresql.conf', qq[max_wal_size = $sb]);

		note("changing primary max_wal_size to " . $sb);

		$node_primary->start;

		# Start a pgbench in the background against the primary
		background_pgbench($node_primary->port, 0);
	}

	if (!$node_standby->is_alive)
	{
		# start, to do recovery, and stop
		$node_standby->start;
		$node_standby->stop('fast');

		# Since the log isn't being written to now, parse the part of the
		# log not yet inspected and check for instances of checksum
		# verification failures.
		my $log =
		  PostgreSQL::Test::Utils::slurp_file($node_standby->logfile,
			$node_standby_loglocation);
		unlike(
			$log,
			qr/page verification failed,.+\d$/,
			"no checksum validation errors in standby_1 log (during WAL recovery)"
		);
		$node_standby_loglocation = -s $node_standby->logfile;

		# randomize the WAL size, to trigger checkpoints less/more often
		my $sb = 32 + int(rand(960));
		$node_standby->append_conf('postgresql.conf', qq[max_wal_size = $sb]);

		note("changing standby max_wal_size to " . $sb);

		$node_standby->start;

		# Start a read-only pgbench in the background on the standby
		background_pgbench($node_standby->port, 1);
	}

	$node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;");
	$node_primary->wait_for_catchup($node_standby, 'write');

	flip_data_checksums();
	random_sleep() if ($extended);
	my $result = $node_primary->safe_psql('postgres',
		"SELECT count(*) FROM t WHERE a > 1");
	is($result, '100000', 'ensure data pages can be read back on primary');

	# Random sleeps are reserved for the extended suite, like at every other
	# call site, so that the pared-down suite is not slowed down by them.
	random_sleep() if ($extended);

	# Potentially powercycle the cluster (the nodes independently). A TODO is
	# to randomly stop the nodes in the opposite order too.
	if ($extended && cointoss())
	{
		$node_primary->stop(stopmode());

		# print the contents of the control file on the primary
		PostgreSQL::Test::Utils::system_log("pg_controldata",
			$node_primary->data_dir);

		# slurp the file after shutdown, so that it doesn't interfere with
		# the recovery
		my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile,
			$node_primary_loglocation);
		unlike(
			$log,
			qr/page verification failed,.+\d$/,
			"no checksum validation errors in primary log (outside WAL recovery)"
		);
		$node_primary_loglocation = -s $node_primary->logfile;
	}

	random_sleep() if ($extended);

	if ($extended && cointoss())
	{
		$node_standby->stop(stopmode());

		# print the contents of the control file on the standby
		PostgreSQL::Test::Utils::system_log("pg_controldata",
			$node_standby->data_dir);

		# slurp the file after shutdown, so that it doesn't interfere with
		# the recovery
		my $log =
		  PostgreSQL::Test::Utils::slurp_file($node_standby->logfile,
			$node_standby_loglocation);
		unlike(
			$log,
			qr/page verification failed,.+\d$/,
			"no checksum validation errors in standby_1 log (outside WAL recovery)"
		);
		$node_standby_loglocation = -s $node_standby->logfile;
	}
}
||||
|
||||
# The last iteration may have left either node stopped, bring them back if so
$node_primary->start unless $node_primary->is_alive;
$node_standby->start unless $node_standby->is_alive;

# The test run is over: check that the data still reads back as expected and
# that the data checksum state on both nodes is the one the test believes it
# to be.
my $final_count =
  $node_primary->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
is($final_count, '100000', 'ensure data pages can be read back on primary');
test_checksum_state($node_primary, $data_checksum_state);
test_checksum_state($node_standby, $data_checksum_state);

# Scan the not yet inspected tail of each log one last time for unexpected
# page verification failures, the primary first and then the standby.
my $primary_log_tail =
  PostgreSQL::Test::Utils::slurp_file($node_primary->logfile,
	$node_primary_loglocation);
unlike(
	$primary_log_tail,
	qr/page verification failed,.+\d$/,
	"no checksum validation errors in primary log");
$node_primary_loglocation = -s $node_primary->logfile;

my $standby_log_tail =
  PostgreSQL::Test::Utils::slurp_file($node_standby->logfile,
	$node_standby_loglocation);
unlike(
	$standby_log_tail,
	qr/page verification failed,.+\d$/,
	"no checksum validation errors in standby_1 log");
$node_standby_loglocation = -s $node_standby->logfile;

$node_standby->teardown_node;
$node_primary->teardown_node;
||||
|
||||
done_testing(); |
||||
@ -0,0 +1,189 @@ |
||||
|
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
use strict; |
||||
use warnings FATAL => 'all'; |
||||
|
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
use FindBin; |
||||
use lib $FindBin::RealBin; |
||||
|
||||
use DataChecksums::Utils; |
||||
|
||||
# This test suite is expensive, or very expensive, to execute.  There are two
# PG_TEST_EXTRA options for running it, "checksum" for a pared-down test suite
# and "checksum_extended" for the full suite.  The whole script is skipped when
# neither option is present, and $extended is set to 1 only for
# "checksum_extended".
my $test_extra = $ENV{PG_TEST_EXTRA} // '';
plan skip_all => 'Expensive data checksums test disabled'
  unless ($test_extra =~ /\bchecksum(_extended)?\b/);
my $extended = ($test_extra =~ /\bchecksum_extended\b/) ? 1 : undef;

# Handle of the background pgbench run (if any), and the data checksum state
# which the test expects the cluster to be in.
my $pgbench = undef;
my $data_checksum_state = 'off';

# The primary node, which the PITR node is later restored from
my $node_primary;
||||
|
||||
# Toggle data checksums on the primary: enable them when they are off and
# disable them when they are on.  The state is validated before the change,
# the change is waited for, and $data_checksum_state is updated to match.
#
# Returns a list of two LSNs: the primary's current WAL LSN sampled right
# before the change was requested, and right after it had taken effect.
sub flip_data_checksums
{
	# Fetch the current WAL LSN of the primary
	my $current_lsn = sub {
		return $node_primary->safe_psql('postgres',
			"SELECT pg_current_wal_lsn()");
	};

	my ($lsn_pre, $lsn_post);

	# The cluster must be in the state we believe it to be in
	test_checksum_state($node_primary, $data_checksum_state);

	if ($data_checksum_state eq 'on')
	{
		# Sample the LSN right before checksums start changing
		$lsn_pre = $current_lsn->();

		disable_data_checksums($node_primary);

		# Checksums must end up disabled on the primary
		wait_for_checksum_state($node_primary, 'off');

		# Sample the LSN right after the primary flipped to "off"
		$lsn_post = $current_lsn->();

		$data_checksum_state = 'off';
	}
	elsif ($data_checksum_state eq 'off')
	{
		# Sample and log the LSN right before checksums start changing
		$lsn_pre = $current_lsn->();
		note("LSN before enabling: " . $lsn_pre . "\n");

		# Request enabling and wait for checksums to be on on the primary
		enable_data_checksums($node_primary, wait => 'on');

		# Sample and log the LSN right after the primary flipped to "on"
		$lsn_post = $current_lsn->();
		note("LSN after enabling: " . $lsn_post . "\n");

		$data_checksum_state = 'on';
	}
	else
	{
		# Only reachable through programmer error made while hacking on the
		# test code, but since that might pass subtly we error out.
		BAIL_OUT('data_checksum_state variable has invalid state:'
			  . $data_checksum_state);
	}

	return ($lsn_pre, $lsn_post);
}
||||
# Launch a pgbench run in the background against the server listening on the
# port passed as the only parameter.  The IPC::Run handle of the new run is
# stored in $pgbench.
sub background_rw_pgbench
{
	my ($port) = @_;

	# Finish any pgbench left over from an earlier call before starting anew
	$pgbench->finish if $pgbench;

	# The pared-down suite runs a single client for a few seconds. The
	# extended suite runs for ten minutes with a randomized number of clients
	# (1 + int(rand(15)), i.e. between 1 and 15).
	my ($clients, $runtime) = (1, 5);
	($clients, $runtime) = (1 + int(rand(15)), 600) if ($extended);

	my @cmd = ('pgbench', '-p', $port, '-T', $runtime, '-c', $clients);

	# In the extended suite, randomly ask pgbench for a new connection per
	# transaction
	push(@cmd, '-C') if ($extended && cointoss());

	# The database name goes last on the command line
	push(@cmd, 'postgres');

	# All standard streams are attached to /dev/null
	$pgbench = IPC::Run::start(
		\@cmd,
		'<' => '/dev/null',
		'>' => '/dev/null',
		'2>' => '/dev/null',
		IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
}
||||
|
||||
# Bring up a primary which archives its WAL, has no data checksums, and which
# allows enough connections for the pgbench clients.
$node_primary = PostgreSQL::Test::Cluster->new('pitr_main');
$node_primary->init(
	has_archiving => 1,
	allows_streaming => 1,
	no_data_checksums => 1);
my $primary_settings = qq[
max_connections = 100
log_statement = none
];
$node_primary->append_conf('postgresql.conf', $primary_settings);
$node_primary->start;

# Load a bit of known data which can be read back later, both to check for
# data consistency and to provoke page verification faults in the logfile.
$node_primary->safe_psql('postgres',
	'CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;');

# Initialize the pgbench tables, using a larger scale in the extended suite,
# and start a read/write pgbench in the background.
my $scalefactor = ($extended ? 10 : 1);
my @pgbench_init = (
	'pgbench', '-p', $node_primary->port, '-i', '-s', $scalefactor, '-q',
	'postgres');
$node_primary->command_ok(\@pgbench_init);
background_rw_pgbench($node_primary->port);

# The base backup which PITR starts from is taken before checksums are
# flipped.
my $backup_name = 'my_backup';
$node_primary->backup($backup_name);

# Flip the checksum state, keeping the LSNs sampled around the change
my ($pre_lsn, $post_lsn) = flip_data_checksums();

# Modify the table on both sides of a named restore point, and then stop the
# primary in immediate mode.
$node_primary->safe_psql('postgres', $_)
  foreach (
	"UPDATE t SET a = a + 1;",
	"SELECT pg_create_restore_point('a');",
	"UPDATE t SET a = a + 1;");
$node_primary->stop('immediate');
||||
|
||||
# Restore a new node from the base backup and have it recover up to, and
# including, the LSN sampled right after the checksum state was flipped.  The
# node promotes itself once the target has been reached.
my $node_pitr = PostgreSQL::Test::Cluster->new('pitr_backup');
$node_pitr->init_from_backup(
	$node_primary, $backup_name,
	standby => 0,
	has_restoring => 1);
my $recovery_settings = qq{
recovery_target_lsn = '$post_lsn'
recovery_target_action = 'promote'
recovery_target_inclusive = on
};
$node_pitr->append_conf('postgresql.conf', $recovery_settings);

$node_pitr->start;

# Wait for the node to leave recovery
$node_pitr->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
  or die "Timed out while waiting for PITR promotion";

# The restored node must be in the flipped checksum state.  The recovery
# target was sampled before table t was updated, so t is expected to still
# hold the values 1..100000, of which 99999 are greater than 1.
test_checksum_state($node_pitr, $data_checksum_state);
my $rowcount =
  $node_pitr->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
is($rowcount, '99999', 'ensure data pages can be read back on primary');

$node_pitr->stop;

# With the node stopped, scan its complete log for page verification failures
my $pitr_log = PostgreSQL::Test::Utils::slurp_file($node_pitr->logfile, 0);
unlike(
	$pitr_log,
	qr/page verification failed,.+\d$/,
	"no checksum validation errors in pitr log");
||||
|
||||
done_testing(); |
||||
@ -0,0 +1,64 @@ |
||||
|
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
use strict; |
||||
use warnings FATAL => 'all'; |
||||
|
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
use FindBin; |
||||
use lib $FindBin::RealBin; |
||||
|
||||
use DataChecksums::Utils; |
||||
|
||||
# Create and start a cluster with one node, initialized without data checksums
my $node = PostgreSQL::Test::Cluster->new('fpi_node');
$node->init(allows_streaming => 1, no_data_checksums => 1);

# log_statement is dialled down since it otherwise will generate enormous
# amounts of logging. Page verification failures are still logged.
# NOTE(review): max_connections was originally justified by the need to
# accommodate pgbench clients, but this script starts none; the setting is
# kept unchanged.
$node->append_conf(
	'postgresql.conf',
	qq[
max_connections = 100
log_statement = none
]);
$node->start;
$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');

# Create some content to have un-checksummed data in the cluster
$node->safe_psql('postgres',
	"CREATE TABLE t AS SELECT generate_series(1, 1000000) AS a;");

# Enable data checksums and wait for the state transition to 'on'
enable_data_checksums($node, wait => 'on');

$node->safe_psql('postgres', 'UPDATE t SET a = a + 1;');

# Disable data checksums again and wait for the state transition to 'off'
disable_data_checksums($node, wait => 1);

# Modify the table while running with full_page_writes turned off
$node->append_conf('postgresql.conf', 'full_page_writes = off');
$node->restart;
test_checksum_state($node, 'off');

$node->safe_psql('postgres', 'UPDATE t SET a = a + 1;');
$node->safe_psql('postgres', 'DELETE FROM t WHERE a < 10000;');

# Turn full_page_writes back on before enabling data checksums once more
$node->adjust_conf('postgresql.conf', 'full_page_writes', 'on');
$node->restart;
test_checksum_state($node, 'off');

enable_data_checksums($node, wait => 'on');

# After two increments the 1000000 rows hold the values 3 .. 1000002, and the
# DELETE removed the 9997 rows with values 3 .. 9999, leaving 990003 rows.
my $result = $node->safe_psql('postgres', 'SELECT count(*) FROM t;');
is($result, '990003', 'Reading back all data from table t');

$node->stop;

# Scan the complete server log for checksum failures.  The /m modifier is
# required: the log is slurped into a single multi-line string and without /m
# '$' only anchors at the very end of that string, so a failure reported on
# any line but the last one would go unnoticed.
my $log = PostgreSQL::Test::Utils::slurp_file($node->logfile, 0);
unlike(
	$log,
	qr/page verification failed,.+\d$/m,
	"no checksum validation errors in server log");

done_testing();
||||
@ -0,0 +1,262 @@ |
||||
|
||||
# Copyright (c) 2026, PostgreSQL Global Development Group |
||||
|
||||
=pod |
||||
|
||||
=head1 NAME |
||||
|
||||
DataChecksums::Utils - Utility functions for testing data checksums in a running cluster |
||||
|
||||
=head1 SYNOPSIS |
||||
|
||||
use PostgreSQL::Test::Cluster; |
||||
use DataChecksums::Utils qw( .. ); |
||||
|
||||
# Create, and start, a new cluster |
||||
my $node = PostgreSQL::Test::Cluster->new('primary'); |
||||
$node->init; |
||||
$node->start; |
||||
|
||||
test_checksum_state($node, 'off'); |
||||
|
||||
enable_data_checksums($node); |
||||
|
||||
wait_for_checksum_state($node, 'on'); |
||||
|
||||
|
||||
=cut |
||||
|
||||
package DataChecksums::Utils; |
||||
|
||||
use strict; |
||||
use warnings FATAL => 'all'; |
||||
use Exporter 'import'; |
||||
use PostgreSQL::Test::Cluster; |
||||
use PostgreSQL::Test::Utils; |
||||
use Test::More; |
||||
|
||||
our @EXPORT = qw( |
||||
cointoss |
||||
disable_data_checksums |
||||
enable_data_checksums |
||||
random_sleep |
||||
stopmode |
||||
test_checksum_state |
||||
wait_for_checksum_state |
||||
wait_for_cluster_crash |
||||
); |
||||
|
||||
=pod |
||||
|
||||
=head1 METHODS |
||||
|
||||
=over |
||||
|
||||
=item test_checksum_state(node, state)

Compare the C<data_checksums> setting reported by C<pg_settings> in the server
running at B<node> against B<state>. A mismatch is recorded as a test failure.
Returns True when the two values are equal, otherwise False.

=cut

sub test_checksum_state
{
	my ($postgresnode, $state) = @_;
	my $query =
	  "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';";

	# Fetch the current setting once and use it for both the test and the
	# return value
	my $current = $postgresnode->safe_psql('postgres', $query);
	is($current, $state, "ensure checksums are set to $state");

	return $current eq $state;
}
||||
|
||||
=item wait_for_checksum_state(node, state)

Poll the C<data_checksums> setting in the server running at B<node> until it
equals B<state> or the polling times out. Processing will run for
$PostgreSQL::Test::Utils::timeout_default seconds before timing out. When the
state was reached True is returned; if the polling timed out a test failure
is logged and False is returned.

=cut

sub wait_for_checksum_state
{
	my ($postgresnode, $state) = @_;
	my $query =
	  "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';";

	# The poll result is compared against 1, which signals that the expected
	# state was observed
	my $reached = $postgresnode->poll_query_until('postgres', $query, $state);
	is($reached, 1, 'ensure data checksums are transitioned to ' . $state);

	return $reached == 1;
}
||||
|
||||
=item wait_for_cluster_crash(node, params)

Probe the cluster running at B<node> once per second and return as soon as
it is no longer alive, or when the timeout is reached. Processing will run
for $PostgreSQL::Test::Utils::timeout_default seconds unless a timeout value
is specified as a parameter. Returns True if the cluster crashed, else False
if the process timed out.

=over

=item timeout

Approximate number of seconds to wait for cluster to crash, default is
$PostgreSQL::Test::Utils::timeout_default. There are no real-time guarantees
that the total process time won't exceed the timeout.

=back

=cut

sub wait_for_cluster_crash
{
	my ($postgresnode, %params) = @_;

	my $timeout =
	  defined($params{timeout})
	  ? $params{timeout}
	  : $PostgreSQL::Test::Utils::timeout_default;

	# One liveness probe per nap; a nap lasts one second, so the number of
	# naps approximates the timeout in seconds
	my $crashed = !!0;
	my $naps = 0;
	while ($naps < $timeout)
	{
		$crashed = !$postgresnode->is_alive;
		last if ($crashed);
		sleep(1);
		$naps++;
	}

	return $crashed;
}
||||
|
||||
=item enable_data_checksums($node, %params)

Function for enabling data checksums in the cluster running at B<node> by
calling C<pg_enable_data_checksums>.

=over

=item cost_delay

The B<cost_delay> to use when enabling data checksums, default is 0.

=item cost_limit

The B<cost_limit> to use when enabling data checksums, default is 100.

=item wait

If defined, the function will wait for the data checksum state to reach the
value given in this parameter, or for the wait to time out, before returning.
The function will wait for $PostgreSQL::Test::Utils::timeout_default seconds
before timing out.

=back

=cut

sub enable_data_checksums
{
	my ($postgresnode, %params) = @_;

	# Fall back to the documented defaults for omitted throttling parameters
	my $cost_delay = defined($params{cost_delay}) ? $params{cost_delay} : 0;
	my $cost_limit = defined($params{cost_limit}) ? $params{cost_limit} : 100;

	$postgresnode->safe_psql(
		'postgres',
		sprintf(
			"SELECT pg_enable_data_checksums(%s, %s);\n",
			$cost_delay, $cost_limit));

	# Only block until the requested state when the caller asked for it
	wait_for_checksum_state($postgresnode, $params{wait})
	  if (defined($params{wait}));
}
||||
|
||||
=item disable_data_checksums($node, %params)

Function for disabling data checksums in the cluster running at B<node> by
calling C<pg_disable_data_checksums>.

=over

=item wait

If defined, the function will wait for the state to turn to B<off>, or for
the wait to time out, before returning. The function will wait for
$PostgreSQL::Test::Utils::timeout_default seconds before timing out.
Unlike in C<enable_data_checksums> the value of the parameter is discarded.

=back

=cut

sub disable_data_checksums
{
	my ($postgresnode, %params) = @_;
	my $query = 'SELECT pg_disable_data_checksums();';

	$postgresnode->safe_psql('postgres', $query);

	# The target state is always 'off', whatever value 'wait' was given
	wait_for_checksum_state($postgresnode, 'off')
	  if (defined($params{wait}));
}
||||
|
||||
=item cointoss

Helper returning 1 or 0 with equal probability, for randomly deciding whether
to turn things off during testing.

=cut

sub cointoss
{
	return (rand() < 0.5) ? 1 : 0;
}
||||
|
||||
=item random_sleep(max)

Helper for injecting random sleeps here and there in the testrun. Depending
on a coin toss the function either returns without sleeping, or sleeps for a
random whole number of seconds below B<max>. The duration is deliberately
unpredictable in order to avoid sleep patterns that manage to avoid race
conditions and timing bugs. The default B<max> is 3 seconds, and passing a
B<max> of 0 disables sleeping altogether.

=cut

sub random_sleep
{
	my ($max) = @_;
	my $ceiling = 3;

	if (defined($max))
	{
		# A max of zero means that the caller doesn't want any sleep
		return if ($max == 0);
		$ceiling = $max;
	}

	sleep(int(rand($ceiling))) if cointoss;
}
||||
|
||||
=item stopmode

Small helper function for randomly selecting a valid stopmode, returning
either C<immediate> or C<fast> depending on a coin toss.

=cut

sub stopmode
{
	return cointoss ? 'immediate' : 'fast';
}
||||
|
||||
=pod |
||||
|
||||
=back |
||||
|
||||
=cut |
||||
|
||||
1; |
||||
@ -0,0 +1,24 @@ |
||||
/* src/test/modules/test_checksums/test_checksums--1.0.sql */ |
||||
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION |
||||
\echo Use "CREATE EXTENSION test_checksums" to load this file. \quit |
||||
|
||||
-- Each function attaches a callback to an injection point when called with
-- attach => true (the default), and detaches it when called with
-- attach => false.

-- Injection point "datachecksums-enable-checksums-delay"
CREATE FUNCTION dcw_inject_delay_barrier(attach boolean DEFAULT true)
	RETURNS pg_catalog.void
	LANGUAGE C
	AS 'MODULE_PATHNAME';

-- Injection point "datachecksumsworker-launcher-delay"
CREATE FUNCTION dcw_inject_launcher_delay(attach boolean DEFAULT true)
	RETURNS pg_catalog.void
	LANGUAGE C
	AS 'MODULE_PATHNAME';

-- Injection point "datachecksumsworker-startup-delay"
CREATE FUNCTION dcw_inject_startup_delay(attach boolean DEFAULT true)
	RETURNS pg_catalog.void
	LANGUAGE C
	AS 'MODULE_PATHNAME';

-- Injection point "datachecksumsworker-modify-db-result"
CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true)
	RETURNS pg_catalog.void
	LANGUAGE C
	AS 'MODULE_PATHNAME';

-- Injection point "datachecksumsworker-fake-temptable-wait"
CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true)
	RETURNS pg_catalog.void
	LANGUAGE C
	AS 'MODULE_PATHNAME';
||||
@ -0,0 +1,184 @@ |
||||
/*--------------------------------------------------------------------------
|
||||
* |
||||
* test_checksums.c |
||||
* Test data checksums |
||||
* |
||||
* Copyright (c) 2026, PostgreSQL Global Development Group |
||||
* |
||||
* IDENTIFICATION |
||||
* src/test/modules/test_checksums/test_checksums.c |
||||
* |
||||
* ------------------------------------------------------------------------- |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "funcapi.h" |
||||
#include "miscadmin.h" |
||||
#include "postmaster/datachecksum_state.h" |
||||
#include "storage/latch.h" |
||||
#include "utils/injection_point.h" |
||||
#include "utils/wait_event.h" |
||||
|
||||
PG_MODULE_MAGIC; |
||||
|
||||
/*
 * Injection point callbacks.  The SQL-callable functions in this file pass
 * the library name and the callback name as strings when attaching, so the
 * callbacks are declared with PGDLLEXPORT.
 */
extern PGDLLEXPORT void dc_delay_barrier(const char *name,
										 const void *private_data,
										 void *arg);
extern PGDLLEXPORT void dc_modify_db_result(const char *name,
											const void *private_data,
											void *arg);
extern PGDLLEXPORT void dc_fake_temptable(const char *name,
										  const void *private_data,
										  void *arg);

/* NOTE(review): no function in this file attaches crash() */
extern PGDLLEXPORT void crash(const char *name,
							  const void *private_data,
							  void *arg);
||||
|
||||
/*
|
||||
* Test for delaying emission of procsignalbarriers. |
||||
*/ |
||||
void |
||||
dc_delay_barrier(const char *name, const void *private_data, void *arg) |
||||
{ |
||||
(void) name; |
||||
(void) private_data; |
||||
|
||||
(void) WaitLatch(MyLatch, |
||||
WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, |
||||
(3 * 1000), |
||||
WAIT_EVENT_PG_SLEEP); |
||||
} |
||||
|
||||
PG_FUNCTION_INFO_V1(dcw_inject_delay_barrier); |
||||
Datum |
||||
dcw_inject_delay_barrier(PG_FUNCTION_ARGS) |
||||
{ |
||||
#ifdef USE_INJECTION_POINTS |
||||
bool attach = PG_GETARG_BOOL(0); |
||||
|
||||
if (attach) |
||||
InjectionPointAttach("datachecksums-enable-checksums-delay", |
||||
"test_checksums", |
||||
"dc_delay_barrier", |
||||
NULL, |
||||
0); |
||||
else |
||||
InjectionPointDetach("datachecksums-enable-checksums-delay"); |
||||
#else |
||||
elog(ERROR, |
||||
"test is not working as intended when injection points are disabled"); |
||||
#endif |
||||
PG_RETURN_VOID(); |
||||
} |
||||
|
||||
PG_FUNCTION_INFO_V1(dcw_inject_launcher_delay); |
||||
Datum |
||||
dcw_inject_launcher_delay(PG_FUNCTION_ARGS) |
||||
{ |
||||
#ifdef USE_INJECTION_POINTS |
||||
bool attach = PG_GETARG_BOOL(0); |
||||
|
||||
if (attach) |
||||
InjectionPointAttach("datachecksumsworker-launcher-delay", |
||||
"test_checksums", |
||||
"dc_delay_barrier", |
||||
NULL, |
||||
0); |
||||
else |
||||
InjectionPointDetach("datachecksumsworker-launcher-delay"); |
||||
#else |
||||
elog(ERROR, |
||||
"test is not working as intended when injection points are disabled"); |
||||
#endif |
||||
PG_RETURN_VOID(); |
||||
} |
||||
|
||||
PG_FUNCTION_INFO_V1(dcw_inject_startup_delay); |
||||
Datum |
||||
dcw_inject_startup_delay(PG_FUNCTION_ARGS) |
||||
{ |
||||
#ifdef USE_INJECTION_POINTS |
||||
bool attach = PG_GETARG_BOOL(0); |
||||
|
||||
if (attach) |
||||
InjectionPointAttach("datachecksumsworker-startup-delay", |
||||
"test_checksums", |
||||
"dc_delay_barrier", |
||||
NULL, |
||||
0); |
||||
else |
||||
InjectionPointDetach("datachecksumsworker-startup-delay"); |
||||
#else |
||||
elog(ERROR, |
||||
"test is not working as intended when injection points are disabled"); |
||||
#endif |
||||
PG_RETURN_VOID(); |
||||
} |
||||
|
||||
static uint32 db_fail = DATACHECKSUMSWORKER_FAILED; |
||||
|
||||
void |
||||
dc_modify_db_result(const char *name, const void *private_data, void *arg) |
||||
{ |
||||
DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg; |
||||
uint32 new_res = *(uint32 *) private_data; |
||||
|
||||
*res = new_res; |
||||
} |
||||
|
||||
PG_FUNCTION_INFO_V1(dcw_inject_fail_database); |
||||
Datum |
||||
dcw_inject_fail_database(PG_FUNCTION_ARGS) |
||||
{ |
||||
#ifdef USE_INJECTION_POINTS |
||||
bool attach = PG_GETARG_BOOL(0); |
||||
|
||||
if (attach) |
||||
InjectionPointAttach("datachecksumsworker-modify-db-result", |
||||
"test_checksums", |
||||
"dc_modify_db_result", |
||||
&db_fail, |
||||
sizeof(uint32)); |
||||
else |
||||
InjectionPointDetach("datachecksumsworker-modify-db-result"); |
||||
#else |
||||
elog(ERROR, |
||||
"test is not working as intended when injection points are disabled"); |
||||
#endif |
||||
PG_RETURN_VOID(); |
||||
} |
||||
|
||||
/*
 * Injection point callback to force waiting for existing temptables.
 *
 * The first invocation in a process stores 1 into the int which arg points
 * to; every later invocation leaves the value untouched.
 * NOTE(review): arg is presumably the number of temporary tables left to
 * wait for -- confirm against the injection point call site.
 */
void
dc_fake_temptable(const char *name, const void *private_data, void *arg)
{
	static bool already_faked = false;

	(void) name;
	(void) private_data;

	if (!already_faked)
	{
		already_faked = true;
		*(int *) arg = 1;
	}
}
||||
|
||||
PG_FUNCTION_INFO_V1(dcw_fake_temptable); |
||||
Datum |
||||
dcw_fake_temptable(PG_FUNCTION_ARGS) |
||||
{ |
||||
#ifdef USE_INJECTION_POINTS |
||||
bool attach = PG_GETARG_BOOL(0); |
||||
|
||||
if (attach) |
||||
InjectionPointAttach("datachecksumsworker-fake-temptable-wait", |
||||
"test_checksums", |
||||
"dc_fake_temptable", |
||||
NULL, |
||||
0); |
||||
else |
||||
InjectionPointDetach("datachecksumsworker-fake-temptable-wait"); |
||||
#else |
||||
elog(ERROR, |
||||
"test is not working as intended when injection points are disabled"); |
||||
#endif |
||||
PG_RETURN_VOID(); |
||||
} |
||||
|
||||
/*
 * Injection point callback which terminates the calling process by calling
 * abort().  None of the callback arguments are used.
 */
void
crash(const char *name, const void *private_data, void *arg)
{
	(void) name;
	(void) private_data;
	(void) arg;

	abort();
}
||||
@ -0,0 +1,4 @@ |
||||
comment = 'Test code for data checksums' |
||||
default_version = '1.0' |
||||
module_pathname = '$libdir/test_checksums' |
||||
relocatable = true |
||||
Loading…
Reference in new issue