From ff8a389bfdb32ec4451c4be2f5a1431818cbdb5f Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Mon, 18 Aug 2025 08:11:49 +0100 Subject: [PATCH] PG-1604 fix: preallocate one more record for the cache There is at lesat one corner case scenario where we have to load the last record into the cache during a write: * replica crashes, receives last segment from primary * replica replays last segment, reaches end * replica activtes new key * replica replays prepared transaction, has to use old keys again * old key write function sees that we generated a new key, tries to load it In this scenario we could get away by detecting that we are in a write, and asserting if we tried to use the last key. But in a release build assertions are not fired, and we would end up writing some non encrypted data to disk, and later if we have to run recovery failing. It could be a FATAL, but that would still crash the server, and the next startup would crash again and again... Instead, to properly avoid this situation we preallocate memory for one more key in the cache during initialization. Since we can only add one extra key to the cache during the servers run, this means we no longer try to allocate in the critical section in any corner case. While this is not the nicest solution, it is simple and keeps the current cache and decrypt/encrypt logic the same as before. Any other solution would be more complex, and even more of a hack, as it would require dealing with a possibly out of date cache. --- contrib/pg_tde/src/access/pg_tde_xlog_keys.c | 38 ++++++++++++++++++- contrib/pg_tde/src/access/pg_tde_xlog_smgr.c | 11 +++--- .../src/include/access/pg_tde_xlog_keys.h | 1 + 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/contrib/pg_tde/src/access/pg_tde_xlog_keys.c b/contrib/pg_tde/src/access/pg_tde_xlog_keys.c index 460fced8026..9486dfe4ec9 100644 --- a/contrib/pg_tde/src/access/pg_tde_xlog_keys.c +++ b/contrib/pg_tde/src/access/pg_tde_xlog_keys.c @@ -16,6 +16,7 @@ #include "common/pg_tde_utils.h" #include "encryption/enc_aes.h" #include "encryption/enc_tde.h" +#include "utils/palloc.h" #ifdef FRONTEND #include "pg_tde_fe.h" @@ -43,7 +44,9 @@ typedef struct WalKeyFileEntry } WalKeyFileEntry; static WALKeyCacheRec *tde_wal_key_cache = NULL; +static WALKeyCacheRec *tde_wal_prealloc_record = NULL; static WALKeyCacheRec *tde_wal_key_last_rec = NULL; +static WalEncryptionKey *tde_wal_prealloc_key = NULL; static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(WalEncryptionKey *cached_key); static WalEncryptionKey *pg_tde_decrypt_wal_key(const TDEPrincipalKey *principal_key, WalKeyFileEntry *entry); @@ -326,6 +329,34 @@ pg_tde_fetch_wal_keys(WalLocation start) return return_wal_rec; } +/* + * In special cases, we have to add one more record to the WAL key cache during write (in the critical section, when we can't allocate). + * This method is a helper to deal with that: when adding a single key, we potentially allocate 2 records. + * These variables help preallocate them, so in the critical section we can just use the already allocated objects, and update the cache with them. + * + * While this is somewhat a hack, it is also simple, safe, reliable, and way easier to implement than to refactor the cache or the decryption/encryption loop. + */ +void +pg_tde_wal_cache_extra_palloc(void) +{ +#ifndef FRONTEND + MemoryContext oldCtx; + + oldCtx = MemoryContextSwitchTo(TopMemoryContext); +#endif + if (tde_wal_prealloc_record == NULL) + { + tde_wal_prealloc_record = palloc0_object(WALKeyCacheRec); + } + if (tde_wal_prealloc_key == NULL) + { + tde_wal_prealloc_key = palloc0_object(WalEncryptionKey); + } +#ifndef FRONTEND + MemoryContextSwitchTo(oldCtx); +#endif +} + static WALKeyCacheRec * pg_tde_add_wal_key_to_cache(WalEncryptionKey *key) { @@ -335,7 +366,8 @@ pg_tde_add_wal_key_to_cache(WalEncryptionKey *key) oldCtx = MemoryContextSwitchTo(TopMemoryContext); #endif - wal_rec = palloc0_object(WALKeyCacheRec); + wal_rec = tde_wal_prealloc_record == NULL ? palloc0_object(WALKeyCacheRec) : tde_wal_prealloc_record; + tde_wal_prealloc_record = NULL; #ifndef FRONTEND MemoryContextSwitchTo(oldCtx); #endif @@ -553,7 +585,9 @@ pg_tde_write_wal_key_file_entry(const WalEncryptionKey *rel_key_data, static WalEncryptionKey * pg_tde_decrypt_wal_key(const TDEPrincipalKey *principal_key, WalKeyFileEntry *entry) { - WalEncryptionKey *key = palloc_object(WalEncryptionKey); + WalEncryptionKey *key = tde_wal_prealloc_key == NULL ? palloc_object(WalEncryptionKey) : tde_wal_prealloc_key; + + tde_wal_prealloc_key = NULL; Assert(principal_key); diff --git a/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c b/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c index 2b25bcb7135..abf86c1ccba 100644 --- a/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c +++ b/contrib/pg_tde/src/access/pg_tde_xlog_smgr.c @@ -248,6 +248,7 @@ TDEXLogSmgrInitWrite(bool encrypt_xlog) /* cache is empty, prefetch keys from disk */ pg_tde_fetch_wal_keys(start); + pg_tde_wal_cache_extra_palloc(); } if (key) @@ -284,8 +285,8 @@ TDEXLogWriteEncryptedPagesOldKeys(int fd, const void *buf, size_t count, off_t o memcpy(enc_buff, buf, count); /* - * This method potentially allocates, but only in very early execution - * Shouldn't happen in a write, where we are in a critical section + * This method potentially allocates, but only in very early execution Can + * happen during a write, but we have one more cache entry preallocated. */ TDEXLogCryptBuffer(buf, enc_buff, count, offset, tli, segno, segSize); @@ -356,7 +357,7 @@ tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset, lastKeyUsable = (writeKeyLoc.lsn != 0); afterWriteKey = wal_location_cmp(writeKeyLoc, loc) <= 0; - if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && !lastKeyUsable) + if (EncryptionKey.type != WAL_KEY_TYPE_INVALID && !lastKeyUsable && afterWriteKey) { WALKeyCacheRec *last_key = pg_tde_get_last_wal_key(); @@ -442,10 +443,8 @@ TDEXLogCryptBuffer(const void *buf, void *out_buf, size_t count, off_t offset, WALKeyCacheRec *last_key = pg_tde_get_last_wal_key(); WalLocation write_loc = {.tli = TDEXLogGetEncKeyTli(),.lsn = write_key_lsn}; - Assert(last_key); - /* write has generated a new key, need to fetch it */ - if (wal_location_cmp(last_key->start, write_loc) < 0) + if (last_key != NULL && wal_location_cmp(last_key->start, write_loc) < 0) { pg_tde_fetch_wal_keys(write_loc); diff --git a/contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h b/contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h index e36e7e59a85..ca00cb45a03 100644 --- a/contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h +++ b/contrib/pg_tde/src/include/access/pg_tde_xlog_keys.h @@ -82,5 +82,6 @@ extern WalEncryptionKey *pg_tde_read_last_wal_key(void); extern void pg_tde_save_server_key(const TDEPrincipalKey *principal_key, bool write_xlog); extern void pg_tde_save_server_key_redo(const TDESignedPrincipalKeyInfo *signed_key_info); extern void pg_tde_wal_last_key_set_location(WalLocation loc); +extern void pg_tde_wal_cache_extra_palloc(void); #endif /* PG_TDE_XLOG_KEYS_H */