PG-1813 Make WAL keys TLI aware

Before this commit, WAL keys did not take the TLI into account at all.
But after pg_rewind, for example, pg_wal/ may contain segments from two
timelines, and the WAL reader may pick the wrong key because LSNs of
different TLIs may overlap. There was also another bug: suppose there is
a key with the start LSN 0/30000 in TLI 1, and after starting in TLI 2,
the WAL writer creates a new key with the same start LSN 0/30000, but in
TLI 2. The reader would not fetch the latest key because, without the
TLI, the two keys look the same.

This commit adds the TLI to the internal keys and uses it along with the
LSN when comparing keys.
pull/238/head
Andrew Pogrebnoy 1 month ago committed by Andrew Pogrebnoi
parent 87c55e6690
commit 1a20e9bb45
  1. 1
      contrib/pg_tde/meson.build
  2. 35
      contrib/pg_tde/src/access/pg_tde_xlog_keys.c
  3. 103
      contrib/pg_tde/src/access/pg_tde_xlog_smgr.c
  4. 44
      contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h
  5. 2
      contrib/pg_tde/src/include/pg_tde_fe.h
  6. 83
      contrib/pg_tde/t/wal_key_tli.pl

@ -126,6 +126,7 @@ tap_tests = [
't/unlogged_tables.pl',
't/wal_archiving.pl',
't/wal_encrypt.pl',
't/wal_key_tli.pl',
]
tests += {

@ -25,6 +25,7 @@
#define PG_TDE_WAL_KEY_FILE_NAME "wal_keys"
#define MaxXLogRecPtr (~(XLogRecPtr)0)
#define MaxTimeLineID (~(TimeLineID)0)
typedef struct WalKeyFileHeader
{
@ -44,7 +45,7 @@ typedef struct WalKeyFileEntry
static WALKeyCacheRec *tde_wal_key_cache = NULL;
static WALKeyCacheRec *tde_wal_key_last_rec = NULL;
static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(WalEncryptionKey *cached_key, XLogRecPtr start_lsn);
static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(WalEncryptionKey *cached_key);
static WalEncryptionKey *pg_tde_decrypt_wal_key(TDEPrincipalKey *principal_key, WalKeyFileEntry *entry);
static void pg_tde_initialize_wal_key_file_entry(WalKeyFileEntry *entry, const TDEPrincipalKey *principal_key, const WalEncryptionKey *rel_key_data);
static int pg_tde_open_wal_key_file_basic(const char *filename, int flags, bool ignore_missing);
@ -69,7 +70,7 @@ get_wal_key_file_path(void)
}
void
pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn)
pg_tde_wal_last_key_set_location(WalLocation loc)
{
LWLock *lock_pk = tde_lwlock_enc_keys();
int fd;
@ -85,9 +86,9 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn)
write_pos = sizeof(WalKeyFileHeader) +
(last_key_idx * sizeof(WalKeyFileEntry)) +
offsetof(WalKeyFileEntry, enc_key) +
offsetof(WalEncryptionKey, start_lsn);
offsetof(WalEncryptionKey, wal_start);
if (pg_pwrite(fd, &lsn, sizeof(XLogRecPtr), write_pos) != sizeof(XLogRecPtr))
if (pg_pwrite(fd, &loc, sizeof(WalLocation), write_pos) != sizeof(WalLocation))
{
ereport(ERROR,
errcode_for_file_access(),
@ -111,7 +112,7 @@ pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn)
errmsg("could not read previous WAL key: %m"));
}
if (prev_entry.enc_key.start_lsn >= lsn)
if (wal_location_cmp(prev_entry.enc_key.wal_start, loc) >= 0)
{
prev_entry.enc_key.type = TDE_KEY_TYPE_WAL_INVALID;
@ -160,7 +161,8 @@ pg_tde_create_wal_key(WalEncryptionKey *rel_key_data, TDEMapEntryType entry_type
/* TODO: no need in generating key if TDE_KEY_TYPE_WAL_UNENCRYPTED */
rel_key_data->type = entry_type;
rel_key_data->start_lsn = InvalidXLogRecPtr;
rel_key_data->wal_start.lsn = InvalidXLogRecPtr;
rel_key_data->wal_start.tli = 0;
if (!RAND_bytes(rel_key_data->key, INTERNAL_KEY_LEN))
ereport(ERROR,
@ -245,7 +247,7 @@ pg_tde_read_last_wal_key(void)
/* Fetches WAL keys from disk and adds them to the WAL cache */
WALKeyCacheRec *
pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
pg_tde_fetch_wal_keys(WalLocation start)
{
off_t read_pos = 0;
LWLock *lock_pk = tde_lwlock_enc_keys();
@ -276,10 +278,10 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
{
WALKeyCacheRec *wal_rec;
WalEncryptionKey stub_key = {
.start_lsn = InvalidXLogRecPtr,
.wal_start = {.tli = 0,.lsn = InvalidXLogRecPtr},
};
wal_rec = pg_tde_add_wal_key_to_cache(&stub_key, InvalidXLogRecPtr);
wal_rec = pg_tde_add_wal_key_to_cache(&stub_key);
#ifdef FRONTEND
/* The backend frees it after copying to the cache. */
@ -299,15 +301,15 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
/*
* Skip new (just created but not updated by write) and invalid keys
*/
if (entry.enc_key.start_lsn != InvalidXLogRecPtr &&
if (wal_location_valid(entry.enc_key.wal_start) &&
(entry.enc_key.type == TDE_KEY_TYPE_WAL_UNENCRYPTED ||
entry.enc_key.type == TDE_KEY_TYPE_WAL_ENCRYPTED) &&
entry.enc_key.start_lsn >= start_lsn)
wal_location_cmp(entry.enc_key.wal_start, start) >= 0)
{
WalEncryptionKey *rel_key_data = pg_tde_decrypt_wal_key(principal_key, &entry);
WALKeyCacheRec *wal_rec;
wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data, entry.enc_key.start_lsn);
wal_rec = pg_tde_add_wal_key_to_cache(rel_key_data);
pfree(rel_key_data);
@ -325,7 +327,7 @@ pg_tde_fetch_wal_keys(XLogRecPtr start_lsn)
}
static WALKeyCacheRec *
pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn)
pg_tde_add_wal_key_to_cache(WalEncryptionKey *key)
{
WALKeyCacheRec *wal_rec;
#ifndef FRONTEND
@ -338,8 +340,9 @@ pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn)
MemoryContextSwitchTo(oldCtx);
#endif
wal_rec->start_lsn = start_lsn;
wal_rec->end_lsn = MaxXLogRecPtr;
wal_rec->start = key->wal_start;
wal_rec->end.tli = MaxTimeLineID;
wal_rec->end.lsn = MaxXLogRecPtr;
wal_rec->key = *key;
wal_rec->crypt_ctx = NULL;
if (!tde_wal_key_last_rec)
@ -350,7 +353,7 @@ pg_tde_add_wal_key_to_cache(WalEncryptionKey *key, XLogRecPtr start_lsn)
else
{
tde_wal_key_last_rec->next = wal_rec;
tde_wal_key_last_rec->end_lsn = wal_rec->start_lsn;
tde_wal_key_last_rec->end = wal_rec->start;
tde_wal_key_last_rec = wal_rec;
}

@ -45,7 +45,7 @@ static void *EncryptionCryptCtx = NULL;
static WalEncryptionKey EncryptionKey =
{
.type = MAP_ENTRY_EMPTY,
.start_lsn = InvalidXLogRecPtr,
.wal_start = {.tli = 0,.lsn = InvalidXLogRecPtr},
};
/*
@ -65,7 +65,12 @@ static WalEncryptionKey EncryptionKey =
typedef struct EncryptionStateData
{
pg_atomic_uint64 enc_key_lsn; /* to sync with readers */
/*
* To sync with readers. We sync on the LSN only; the TLI is here just to
* communicate its value to readers.
*/
pg_atomic_uint32 enc_key_tli;
pg_atomic_uint64 enc_key_lsn;
} EncryptionStateData;
static EncryptionStateData *EncryptionState = NULL;
@ -78,10 +83,24 @@ TDEXLogGetEncKeyLsn()
return (XLogRecPtr) pg_atomic_read_u64(&EncryptionState->enc_key_lsn);
}
/*
 * Returns the timeline ID stored in EncryptionState->enc_key_tli, using an
 * atomic 32-bit read.
 *
 * Callers that pair this with the key LSN read the LSN first, followed by a
 * read barrier; see the comment in TDEXLogSetEncKeyLocation().
 */
static TimeLineID
TDEXLogGetEncKeyTli(void)
{
	return (TimeLineID) pg_atomic_read_u32(&EncryptionState->enc_key_tli);
}
static void
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
TDEXLogSetEncKeyLocation(WalLocation loc)
{
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, start_lsn);
/*
* Write TLI first and then LSN. The barrier ensures writes won't be
* reordered. When reading, the opposite must be done (with a matching
* barrier in between), so we always see a valid TLI after observing a
* valid LSN.
*/
pg_atomic_write_u32(&EncryptionState->enc_key_tli, loc.tli);
pg_write_barrier();
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, loc.lsn);
}
static Size TDEXLogEncryptBuffSize(void);
@ -166,7 +185,8 @@ TDEXLogShmemInit(void)
typedef struct EncryptionStateData
{
XLogRecPtr enc_key_lsn; /* to sync with reader */
TimeLineID enc_key_tli;
XLogRecPtr enc_key_lsn;
} EncryptionStateData;
static EncryptionStateData EncryptionStateD = {0};
@ -181,10 +201,17 @@ TDEXLogGetEncKeyLsn()
return (XLogRecPtr) EncryptionState->enc_key_lsn;
}
/*
 * Returns the timeline ID stored in EncryptionState->enc_key_tli.
 *
 * In this variant enc_key_tli is a plain TimeLineID field, so this is an
 * ordinary (non-atomic) read.
 */
static TimeLineID
TDEXLogGetEncKeyTli(void)
{
	return (TimeLineID) EncryptionState->enc_key_tli;
}
static void
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
TDEXLogSetEncKeyLocation(WalLocation loc)
{
EncryptionState->enc_key_lsn = EncryptionKey.start_lsn;
EncryptionState->enc_key_tli = loc.tli;
EncryptionState->enc_key_lsn = loc.lsn;
}
#endif /* FRONTEND */
@ -216,7 +243,7 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog)
else if (key)
{
EncryptionKey = *key;
TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn);
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
}
if (key)
@ -231,7 +258,7 @@ TDEXLogSmgrInitWriteReuseKey()
if (key)
{
EncryptionKey = *key;
TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn);
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
pfree(key);
}
}
@ -252,8 +279,8 @@ TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
#endif
#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno), LSN_FORMAT_ARGS(key->start_lsn));
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %u_%X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno), key->wal_start.tli, LSN_FORMAT_ARGS(key->wal_start.lsn));
#endif
CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
@ -279,13 +306,13 @@ tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
*/
if (EncryptionKey.type != MAP_ENTRY_EMPTY && TDEXLogGetEncKeyLsn() == 0)
{
XLogRecPtr lsn;
WalLocation loc = {.tli = tli};
XLogSegNoOffsetToRecPtr(segno, offset, segSize, lsn);
XLogSegNoOffsetToRecPtr(segno, offset, segSize, loc.lsn);
pg_tde_wal_last_key_set_lsn(lsn);
EncryptionKey.start_lsn = lsn;
TDEXLogSetEncKeyLsn(lsn);
pg_tde_wal_last_key_set_location(loc);
EncryptionKey.wal_start = loc;
TDEXLogSetEncKeyLocation(EncryptionKey.wal_start);
}
if (EncryptionKey.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
@ -304,12 +331,12 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
ssize_t readsz;
WALKeyCacheRec *keys = pg_tde_get_wal_cache_keys();
XLogRecPtr write_key_lsn;
XLogRecPtr data_start;
XLogRecPtr data_end;
WalLocation data_end = {.tli = tli};
WalLocation data_start = {.tli = tli};
#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno));
elog(DEBUG1, "read from a WAL segment, size: %lu offset: %ld [%lX], seg: %u_%X/%X",
count, offset, offset, tli, LSN_FORMAT_ARGS(segno));
#endif
readsz = pg_pread(fd, buf, count, offset);
@ -319,30 +346,38 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
if (!keys)
{
WalLocation start = {.tli = 1,.lsn = 0};
/* cache is empty, try to read keys from disk */
keys = pg_tde_fetch_wal_keys(InvalidXLogRecPtr);
keys = pg_tde_fetch_wal_keys(start);
}
/*
* The barrier ensures that we always read a valid TLI after the valid
* LSN. See the comment in TDEXLogSetEncKeyLocation()
*/
write_key_lsn = TDEXLogGetEncKeyLsn();
pg_read_barrier();
if (!XLogRecPtrIsInvalid(write_key_lsn))
{
WALKeyCacheRec *last_key = pg_tde_get_last_wal_key();
WalLocation write_loc = {.tli = TDEXLogGetEncKeyTli(),.lsn = write_key_lsn};
Assert(last_key);
/* write has generated a new key, need to fetch it */
if (last_key->start_lsn < write_key_lsn)
if (wal_location_cmp(last_key->start, write_loc) < 0)
{
pg_tde_fetch_wal_keys(write_key_lsn);
pg_tde_fetch_wal_keys(write_loc);
/* in case cache was empty before */
keys = pg_tde_get_wal_cache_keys();
}
}
XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start);
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end);
XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start.lsn);
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end.lsn);
/*
* TODO: this is highly inefficient. We should get rid of the linked list and
@ -351,24 +386,24 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
for (WALKeyCacheRec *curr_key = keys; curr_key != NULL; curr_key = curr_key->next)
{
#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "WAL key %X/%X-%X/%X, encrypted: %s",
LSN_FORMAT_ARGS(curr_key->start_lsn),
LSN_FORMAT_ARGS(curr_key->end_lsn),
elog(DEBUG1, "WAL key %u_%X/%X - %u_%X/%X, encrypted: %s",
curr_key->start.tli, LSN_FORMAT_ARGS(curr_key->start.lsn),
curr_key->end.tli, LSN_FORMAT_ARGS(curr_key->end.lsn),
curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED ? "yes" : "no");
#endif
if (curr_key->key.start_lsn != InvalidXLogRecPtr &&
if (wal_location_valid(curr_key->key.wal_start) &&
curr_key->key.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
{
/*
* Check if the key's range overlaps with the buffer's and decrypt
* the part that does.
*/
if (data_start < curr_key->end_lsn && data_end > curr_key->start_lsn)
if (wal_location_cmp(data_start, curr_key->end) < 0 && wal_location_cmp(data_end, curr_key->start) > 0)
{
char iv_prefix[16];
off_t dec_off = XLogSegmentOffset(Max(data_start, curr_key->start_lsn), segSize);
off_t dec_end = XLogSegmentOffset(Min(data_end, curr_key->end_lsn), segSize);
off_t dec_off = XLogSegmentOffset(Max(data_start.lsn, curr_key->start.lsn), segSize);
off_t dec_end = XLogSegmentOffset(Min(data_end.lsn, curr_key->end.lsn), segSize);
size_t dec_sz;
char *dec_buf = (char *) buf + (dec_off - offset);
@ -385,8 +420,8 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
dec_sz = dec_end - dec_off;
#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %X/%X",
dec_off, dec_off - offset, dec_sz, LSN_FORMAT_ARGS(curr_key->key->start_lsn));
elog(DEBUG1, "decrypt WAL, dec_off: %lu [buff_off %lu], sz: %lu | key %u_%X/%X",
dec_off, dec_off - offset, dec_sz, curr_key->key.wal_start.tli, LSN_FORMAT_ARGS(curr_key->key.wal_start.lsn));
#endif
pg_tde_stream_crypt(iv_prefix,
dec_off,

@ -6,13 +6,47 @@
#include "access/pg_tde_tdemap.h"
#include "catalog/tde_principal_key.h"
/*
 * A position in the WAL: an LSN together with the timeline it belongs to.
 *
 * NOTE(review): this struct is embedded in WalEncryptionKey and appears to be
 * written to the WAL key file as raw bytes, so changing the field order or
 * types would presumably change the on-disk format -- confirm before editing.
 */
typedef struct WalLocation
{
/* WAL position; InvalidXLogRecPtr is treated as "not set" by wal_location_valid() */
XLogRecPtr lsn;
/* timeline ID; 0 is treated as "not set" by wal_location_valid() */
TimeLineID tli;
} WalLocation;
/*
 * Three-way comparison of two WAL locations.
 *
 * Timelines are compared first; the LSN only breaks the tie when both
 * locations are on the same timeline. Returns -1 if l1 sorts before l2,
 * 1 if it sorts after l2, and 0 if the two locations are equal.
 */
static inline int
wal_location_cmp(WalLocation l1, WalLocation l2)
{
	/* The timeline-mismatch branch is hinted as the unlikely one. */
	if (unlikely(l1.tli != l2.tli))
		return (l1.tli < l2.tli) ? -1 : 1;

	if (l1.lsn != l2.lsn)
		return (l1.lsn < l2.lsn) ? -1 : 1;

	return 0;
}
/*
 * Reports whether a WAL location is fully set: the timeline must be non-zero
 * and the LSN must differ from InvalidXLogRecPtr.
 */
static inline bool
wal_location_valid(WalLocation loc)
{
	if (loc.tli == 0)
		return false;

	return loc.lsn != InvalidXLogRecPtr;
}
typedef struct WalEncryptionKey
{
uint8 key[INTERNAL_KEY_LEN];
uint8 base_iv[INTERNAL_KEY_IV_LEN];
uint32 type;
XLogRecPtr start_lsn;
WalLocation wal_start;
} WalEncryptionKey;
/*
@ -21,8 +55,8 @@ typedef struct WalEncryptionKey
*/
typedef struct WALKeyCacheRec
{
XLogRecPtr start_lsn;
XLogRecPtr end_lsn;
WalLocation start;
WalLocation end;
WalEncryptionKey key;
void *crypt_ctx;
@ -33,7 +67,7 @@ typedef struct WALKeyCacheRec
extern int pg_tde_count_wal_keys_in_file(void);
extern void pg_tde_create_wal_key(WalEncryptionKey *rel_key_data, TDEMapEntryType entry_type);
extern void pg_tde_delete_server_key(void);
extern WALKeyCacheRec *pg_tde_fetch_wal_keys(XLogRecPtr start_lsn);
extern WALKeyCacheRec *pg_tde_fetch_wal_keys(WalLocation start);
extern WALKeyCacheRec *pg_tde_get_last_wal_key(void);
extern TDESignedPrincipalKeyInfo *pg_tde_get_server_key_info(void);
extern WALKeyCacheRec *pg_tde_get_wal_cache_keys(void);
@ -41,6 +75,6 @@ extern void pg_tde_perform_rotate_server_key(TDEPrincipalKey *principal_key, TDE
extern WalEncryptionKey *pg_tde_read_last_wal_key(void);
extern void pg_tde_save_server_key(const TDEPrincipalKey *principal_key, bool write_xlog);
extern void pg_tde_save_server_key_redo(const TDESignedPrincipalKeyInfo *signed_key_info);
extern void pg_tde_wal_last_key_set_lsn(XLogRecPtr lsn);
extern void pg_tde_wal_last_key_set_location(WalLocation loc);
#endif /* PG_TDE_XLOG_KEYS_H */

@ -88,6 +88,8 @@ static int tde_fe_error_level = 0;
#define FreeFile(file) fclose(file)
#define pg_fsync(fd) fsync(fd)
/*
 * No-op stand-in for pg_read_barrier(). It expands to a void expression so
 * that "pg_read_barrier();" is a valid statement that does not trigger
 * unused-value warnings (which a bare "NULL;" statement would).
 */
#define pg_read_barrier() ((void) 0)
#endif /* FRONTEND */
#endif /* PG_TDE_EREPORT_H */

@ -0,0 +1,83 @@
# Copyright (c) 2021-2024, PostgreSQL Global Development Group
# A copy of pg_rewind_databases with an added restart of the standby, which
# forces two WAL keys with the same LSN but different TLIs on the primary
# after pg_rewind.
use strict;
use warnings FATAL => 'all';
use PostgreSQL::Test::Utils;
use Test::More;
use FindBin;
use lib $FindBin::RealBin;
use RewindTest;
# Runs the whole rewind scenario once for the given pg_rewind test mode:
# build a primary and a standby, let them diverge after promotion, rewind the
# old primary and verify the resulting set of databases and file permissions.
sub run_test
{
	my ($test_mode) = @_;

	RewindTest::setup_cluster($test_mode, ['-g']);
	RewindTest::start_primary();

	# Seed the primary with a database and a table before the standby exists.
	primary_psql('CREATE DATABASE inprimary');
	primary_psql('CREATE TABLE inprimary_tab (a int)', 'inprimary');

	RewindTest::create_standby($test_mode);

	# The restart generates a new WAL key with the start LSN 0/300000. Once
	# pg_rewind has run, the primary ends up with that key plus another one
	# that has the same LSN 0/300000 but a different TLI.
	$node_standby->restart;

	# A second database and table, created early enough to be replicated to
	# the standby.
	primary_psql('CREATE DATABASE beforepromotion');
	primary_psql('CREATE TABLE beforepromotion_tab (a int)',
		'beforepromotion');

	RewindTest::promote_standby();

	# From here on the two nodes diverge: each one gets its own database.
	primary_psql('CREATE DATABASE primary_afterpromotion');
	primary_psql('CREATE TABLE primary_promotion_tab (a int)',
		'primary_afterpromotion');
	standby_psql('CREATE DATABASE standby_afterpromotion');
	standby_psql('CREATE TABLE standby_promotion_tab (a int)',
		'standby_afterpromotion');

	RewindTest::run_pg_rewind($test_mode);

	# After the rewind the database list must match this newline-terminated
	# listing.
	my $expected_databases = qq(beforepromotion
inprimary
postgres
standby_afterpromotion
template0
template1
);
	check_query('SELECT datname FROM pg_database ORDER BY 1',
		$expected_databases, 'database names');

	# PGDATA is expected to carry group permissions.
  SKIP:
	{
		my $no_unix_permissions =
		  $windows_os || $Config::Config{osname} eq 'cygwin';
		skip "unix-style permissions not supported on Windows", 1
		  if $no_unix_permissions;

		ok( check_mode_recursive($node_primary->data_dir(), 0750, 0640),
			'check PGDATA permissions');
	}

	RewindTest::clean_rewind_test();
	return;
}
# Only the 'remote' mode is exercised by this test.
run_test('remote');
done_testing();
Loading…
Cancel
Save