PG-1617 Remove the relation key cache

Since the relation keys are already cached in the SMGR cache (or arguably
the relation cache), the second layer of caching only complicated the code
and caused an issue with possible key re-use or even data corruption on
OID wraparound.

This will slow down some code paths like pg_tde_is_encrypted(), but the
code simplification and the fix for the OID wraparound bug make it worth
it, and some of that lost performance can be regained in future
commits.

This loses us the mlock() protection of the relation keys in the cache
but since the keys in the SMGR were not protected anyway this is not
a significant loss.
pull/230/head
Andreas Karlsson 4 months ago committed by Andreas Karlsson
parent a6d47dfb42
commit eafc34e335
  1. 213
      contrib/pg_tde/src/access/pg_tde_tdemap.c
  2. 40
      contrib/pg_tde/src/smgr/pg_tde_smgr.c

@ -32,7 +32,6 @@
#include <openssl/rand.h> #include <openssl/rand.h>
#include <openssl/err.h> #include <openssl/err.h>
#include <sys/mman.h>
#include <unistd.h> #include <unistd.h>
#include "pg_tde_defines.h" #include "pg_tde_defines.h"
@ -67,39 +66,6 @@ typedef struct TDEFileHeader
TDESignedPrincipalKeyInfo signed_key_info; TDESignedPrincipalKeyInfo signed_key_info;
} TDEFileHeader; } TDEFileHeader;
typedef struct RelKeyCacheRec
{
RelFileLocator locator;
InternalKey key;
} RelKeyCacheRec;
/*
* Relation keys cache.
*
* This is a slice backed by memory `*data`. Initially, we allocate one memory
* page (usually 4Kb). We reallocate it by adding another page when we run out
* of space. This memory is locked in the RAM so it won't be paged to the swap
* (we don't want decrypted keys on disk). We do allocations in mem pages as
* these are the units `mlock()` operations are performed in.
*
* Currently, the cache can only grow (no eviction). The data is located in
* TopMemoryContext hence being wiped when the process exits, as well as memory
* is being unlocked by OS.
*/
typedef struct RelKeyCache
{
RelKeyCacheRec *data; /* must be a multiple of a memory page
* (usually 4Kb) */
int len; /* num of RelKeyCacheRecs currenty in cache */
int cap; /* max amount of RelKeyCacheRec data can fit */
} RelKeyCache;
RelKeyCache tde_rel_key_cache = {
.data = NULL,
.len = 0,
.cap = 0,
};
typedef struct typedef struct
{ {
RelFileLocator rel; RelFileLocator rel;
@ -118,9 +84,6 @@ static HTAB *TempRelKeys = NULL;
#endif #endif
/*
* TODO: WAL should have its own RelKeyCache
*/
static WALKeyCacheRec *tde_wal_key_cache = NULL; static WALKeyCacheRec *tde_wal_key_cache = NULL;
static WALKeyCacheRec *tde_wal_key_last_rec = NULL; static WALKeyCacheRec *tde_wal_key_last_rec = NULL;
@ -132,9 +95,7 @@ static void pg_tde_file_header_read(const char *tde_filename, int fd, TDEFileHea
static bool pg_tde_read_one_map_entry(int fd, TDEMapEntry *map_entry, off_t *offset); static bool pg_tde_read_one_map_entry(int fd, TDEMapEntry *map_entry, off_t *offset);
static void pg_tde_read_one_map_entry2(int keydata_fd, int32 key_index, TDEMapEntry *map_entry, Oid databaseId); static void pg_tde_read_one_map_entry2(int keydata_fd, int32 key_index, TDEMapEntry *map_entry, Oid databaseId);
static int pg_tde_open_file_read(const char *tde_filename, bool ignore_missing, off_t *curr_pos); static int pg_tde_open_file_read(const char *tde_filename, bool ignore_missing, off_t *curr_pos);
static InternalKey *pg_tde_get_key_from_cache(const RelFileLocator *rlocator, uint32 key_type);
static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(InternalKey *cached_key, XLogRecPtr start_lsn); static WALKeyCacheRec *pg_tde_add_wal_key_to_cache(InternalKey *cached_key, XLogRecPtr start_lsn);
static InternalKey *pg_tde_put_key_into_cache(const RelFileLocator *locator, InternalKey *key);
#ifndef FRONTEND #ifndef FRONTEND
static InternalKey *pg_tde_create_smgr_key_temp(const RelFileLocator *newrlocator); static InternalKey *pg_tde_create_smgr_key_temp(const RelFileLocator *newrlocator);
@ -161,9 +122,12 @@ pg_tde_create_smgr_key(const RelFileLocatorBackend *newrlocator)
static InternalKey * static InternalKey *
pg_tde_create_smgr_key_temp(const RelFileLocator *newrlocator) pg_tde_create_smgr_key_temp(const RelFileLocator *newrlocator)
{ {
InternalKey *rel_key_data = palloc_object(InternalKey);
TempRelKeyEntry *entry; TempRelKeyEntry *entry;
bool found; bool found;
pg_tde_generate_internal_key(rel_key_data, TDE_KEY_TYPE_SMGR);
if (TempRelKeys == NULL) if (TempRelKeys == NULL)
{ {
HASHCTL ctl; HASHCTL ctl;
@ -181,22 +145,22 @@ pg_tde_create_smgr_key_temp(const RelFileLocator *newrlocator)
HASH_ENTER, &found); HASH_ENTER, &found);
Assert(!found); Assert(!found);
pg_tde_generate_internal_key(&entry->key, TDE_KEY_TYPE_SMGR); entry->key = *rel_key_data;
return &entry->key; return rel_key_data;
} }
static InternalKey * static InternalKey *
pg_tde_create_smgr_key_perm(const RelFileLocator *newrlocator) pg_tde_create_smgr_key_perm(const RelFileLocator *newrlocator)
{ {
InternalKey rel_key_data; InternalKey *rel_key_data = palloc_object(InternalKey);
TDEPrincipalKey *principal_key; TDEPrincipalKey *principal_key;
LWLock *lock_pk = tde_lwlock_enc_keys(); LWLock *lock_pk = tde_lwlock_enc_keys();
XLogRelKey xlrec = { XLogRelKey xlrec = {
.rlocator = *newrlocator, .rlocator = *newrlocator,
}; };
pg_tde_generate_internal_key(&rel_key_data, TDE_KEY_TYPE_SMGR); pg_tde_generate_internal_key(rel_key_data, TDE_KEY_TYPE_SMGR);
LWLockAcquire(lock_pk, LW_EXCLUSIVE); LWLockAcquire(lock_pk, LW_EXCLUSIVE);
principal_key = GetPrincipalKey(newrlocator->dbOid, LW_EXCLUSIVE); principal_key = GetPrincipalKey(newrlocator->dbOid, LW_EXCLUSIVE);
@ -207,7 +171,7 @@ pg_tde_create_smgr_key_perm(const RelFileLocator *newrlocator)
errhint("create one using pg_tde_set_key before using encrypted tables")); errhint("create one using pg_tde_set_key before using encrypted tables"));
} }
pg_tde_write_key_map_entry(newrlocator, &rel_key_data, principal_key); pg_tde_write_key_map_entry(newrlocator, rel_key_data, principal_key);
LWLockRelease(lock_pk); LWLockRelease(lock_pk);
/* /*
@ -218,7 +182,7 @@ pg_tde_create_smgr_key_perm(const RelFileLocator *newrlocator)
XLogRegisterData((char *) &xlrec, sizeof(xlrec)); XLogRegisterData((char *) &xlrec, sizeof(xlrec));
XLogInsert(RM_TDERMGR_ID, XLOG_TDE_ADD_RELATION_KEY); XLogInsert(RM_TDERMGR_ID, XLOG_TDE_ADD_RELATION_KEY);
return pg_tde_put_key_into_cache(newrlocator, &rel_key_data); return rel_key_data;
} }
void void
@ -281,13 +245,12 @@ tde_sprint_key(InternalKey *k)
} }
/* /*
* Generates a new internal key for WAL and adds it to the _dat file. It doesn't * Generates a new internal key for WAL and adds it to the _dat file.
* add unecnrypted key into cache but rather sets it in `rel_key_data`.
* *
* We have a special function for WAL as it is being called during recovery * We have a special function for WAL as it is being called during recovery
* (start) so there should be no XLog records, aquired locks, and reads from * start so there should be no XLog records and aquired locks. The key is
* cache. The key is always created with start_lsn = InvalidXLogRecPtr. Which * always created with start_lsn = InvalidXLogRecPtr. Which will be updated
* will be updated with the actual lsn by the first WAL write. * with the actual lsn by the first WAL write.
*/ */
void void
pg_tde_create_wal_key(InternalKey *rel_key_data, const RelFileLocator *newrlocator, uint32 entry_type) pg_tde_create_wal_key(InternalKey *rel_key_data, const RelFileLocator *newrlocator, uint32 entry_type)
@ -317,8 +280,8 @@ DeleteSMGRRelationKey(RelFileLocatorBackend rel)
{ {
if (RelFileLocatorBackendIsTemp(rel)) if (RelFileLocatorBackendIsTemp(rel))
{ {
if (TempRelKeys) Assert(TempRelKeys);
hash_search(TempRelKeys, &rel.locator, HASH_REMOVE, NULL); hash_search(TempRelKeys, &rel.locator, HASH_REMOVE, NULL);
} }
else else
pg_tde_free_key_map_entry(&rel.locator); pg_tde_free_key_map_entry(&rel.locator);
@ -1109,7 +1072,12 @@ pg_tde_get_temporary_rel_key(const RelFileLocator *rel)
entry = hash_search(TempRelKeys, rel, HASH_FIND, NULL); entry = hash_search(TempRelKeys, rel, HASH_FIND, NULL);
if (entry) if (entry)
return &entry->key; {
InternalKey *key = palloc_object(InternalKey);
*key = entry->key;
return key;
}
#endif #endif
return NULL; return NULL;
@ -1117,8 +1085,7 @@ pg_tde_get_temporary_rel_key(const RelFileLocator *rel)
/* /*
* Figures out whether a relation is encrypted or not, but without trying to * Figures out whether a relation is encrypted or not, but without trying to
* decrypt the key if it is. This also means that this function cannot push the * decrypt the key if it is.
* key to cache.
*/ */
bool bool
IsSMGRRelationEncrypted(RelFileLocatorBackend rel) IsSMGRRelationEncrypted(RelFileLocatorBackend rel)
@ -1130,9 +1097,11 @@ IsSMGRRelationEncrypted(RelFileLocatorBackend rel)
Assert(rel.locator.relNumber != InvalidRelFileNumber); Assert(rel.locator.relNumber != InvalidRelFileNumber);
if (RelFileLocatorBackendIsTemp(rel)) if (RelFileLocatorBackendIsTemp(rel))
return pg_tde_get_temporary_rel_key(&rel.locator) != NULL; #ifndef FRONTEND
else if (pg_tde_get_key_from_cache(&rel.locator, TDE_KEY_TYPE_SMGR)) return TempRelKeys && hash_search(TempRelKeys, &rel.locator, HASH_FIND, NULL);
return true; #else
return false;
#endif
pg_tde_set_db_file_path(rel.locator.dbOid, db_map_path); pg_tde_set_db_file_path(rel.locator.dbOid, db_map_path);
@ -1149,8 +1118,6 @@ IsSMGRRelationEncrypted(RelFileLocatorBackend rel)
/* /*
* Returns TDE key for a given relation. * Returns TDE key for a given relation.
* First it looks in a cache. If nothing found in the cache, it reads data from
* the tde fork file and populates cache.
*/ */
InternalKey * InternalKey *
GetSMGRRelationKey(RelFileLocatorBackend rel) GetSMGRRelationKey(RelFileLocatorBackend rel)
@ -1160,40 +1127,7 @@ GetSMGRRelationKey(RelFileLocatorBackend rel)
if (RelFileLocatorBackendIsTemp(rel)) if (RelFileLocatorBackendIsTemp(rel))
return pg_tde_get_temporary_rel_key(&rel.locator); return pg_tde_get_temporary_rel_key(&rel.locator);
else else
{ return pg_tde_get_key_from_file(&rel.locator, TDE_KEY_TYPE_SMGR);
InternalKey *key;
key = pg_tde_get_key_from_cache(&rel.locator, TDE_KEY_TYPE_SMGR);
if (key)
return key;
key = pg_tde_get_key_from_file(&rel.locator, TDE_KEY_TYPE_SMGR);
if (key)
{
InternalKey *cached_key = pg_tde_put_key_into_cache(&rel.locator, key);
pfree(key);
return cached_key;
}
return NULL;
}
}
static InternalKey *
pg_tde_get_key_from_cache(const RelFileLocator *rlocator, uint32 key_type)
{
for (int i = 0; i < tde_rel_key_cache.len; i++)
{
RelKeyCacheRec *rec = tde_rel_key_cache.data + i;
if (RelFileLocatorEquals(rec->locator, *rlocator) && rec->key.type & key_type)
{
return &rec->key;
}
}
return NULL;
} }
/* /*
@ -1363,94 +1297,3 @@ pg_tde_add_wal_key_to_cache(InternalKey *key, XLogRecPtr start_lsn)
return wal_rec; return wal_rec;
} }
/*
* Add key to cache. See comments on `RelKeyCache`.
*/
static InternalKey *
pg_tde_put_key_into_cache(const RelFileLocator *rlocator, InternalKey *key)
{
static long pageSize = 0;
RelKeyCacheRec *rec;
MemoryContext oldCtx;
if (pageSize == 0)
{
#ifndef _SC_PAGESIZE
pageSize = getpagesize();
#else
pageSize = sysconf(_SC_PAGESIZE);
#endif
}
if (tde_rel_key_cache.data == NULL)
{
#ifndef FRONTEND
oldCtx = MemoryContextSwitchTo(TopMemoryContext);
tde_rel_key_cache.data = palloc_aligned(pageSize, pageSize, MCXT_ALLOC_ZERO);
MemoryContextSwitchTo(oldCtx);
#else
tde_rel_key_cache.data = aligned_alloc(pageSize, pageSize);
memset(tde_rel_key_cache.data, 0, pageSize);
#endif
if (mlock(tde_rel_key_cache.data, pageSize) == -1)
elog(ERROR, "could not mlock internal key initial cache page: %m");
tde_rel_key_cache.len = 0;
tde_rel_key_cache.cap = (pageSize - 1) / sizeof(RelKeyCacheRec);
}
/*
* Add another mem page if there is no more room left for another key. We
* allocate `current_memory_size` + 1 page and copy data there.
*/
if (tde_rel_key_cache.len == tde_rel_key_cache.cap)
{
size_t size;
size_t old_size;
RelKeyCacheRec *cachePage;
old_size = TYPEALIGN(pageSize, tde_rel_key_cache.cap * sizeof(RelKeyCacheRec));
/*
* TODO: consider some formula for less allocations when caching a
* lot of objects. But on the other, hand it'll use more memory...
* E.g.: if (old_size < 0x8000) size = old_size * 2; else size =
* TYPEALIGN(pageSize, old_size + ((old_size + 3*256) >> 2));
*
*/
size = old_size + pageSize;
#ifndef FRONTEND
oldCtx = MemoryContextSwitchTo(TopMemoryContext);
cachePage = palloc_aligned(size, pageSize, MCXT_ALLOC_ZERO);
MemoryContextSwitchTo(oldCtx);
#else
cachePage = aligned_alloc(pageSize, size);
memset(cachePage, 0, size);
#endif
memcpy(cachePage, tde_rel_key_cache.data, old_size);
explicit_bzero(tde_rel_key_cache.data, old_size);
if (munlock(tde_rel_key_cache.data, old_size) == -1)
elog(WARNING, "could not munlock internal key cache pages: %m");
pfree(tde_rel_key_cache.data);
tde_rel_key_cache.data = cachePage;
if (mlock(tde_rel_key_cache.data, size) == -1)
elog(WARNING, "could not mlock internal key cache pages: %m");
tde_rel_key_cache.cap = (size - 1) / sizeof(RelKeyCacheRec);
}
rec = tde_rel_key_cache.data + tde_rel_key_cache.len;
rec->locator = *rlocator;
rec->key = *key;
tde_rel_key_cache.len++;
return &rec->key;
}

@ -86,8 +86,7 @@ tde_smgr_should_encrypt(const RelFileLocatorBackend *smgr_rlocator, RelFileLocat
.backend = smgr_rlocator->backend, .backend = smgr_rlocator->backend,
}; };
/* Actually get the key here to ensure result is cached. */ return IsSMGRRelationEncrypted(old_smgr_locator);
return GetSMGRRelationKey(old_smgr_locator) != 0;
} }
} }
@ -106,19 +105,19 @@ tde_mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
} }
else else
{ {
InternalKey *int_key;
unsigned char *local_blocks = palloc(BLCKSZ * (nblocks + 1)); unsigned char *local_blocks = palloc(BLCKSZ * (nblocks + 1));
unsigned char *local_blocks_aligned = (unsigned char *) TYPEALIGN(PG_IO_ALIGN_SIZE, local_blocks); unsigned char *local_blocks_aligned = (unsigned char *) TYPEALIGN(PG_IO_ALIGN_SIZE, local_blocks);
void **local_buffers = palloc_array(void *, nblocks); void **local_buffers = palloc_array(void *, nblocks);
if (tdereln->encryption_status == RELATION_KEY_NOT_AVAILABLE) if (tdereln->encryption_status == RELATION_KEY_NOT_AVAILABLE)
{ {
tdereln->relKey = *tde_smgr_get_key(&reln->smgr_rlocator); InternalKey *int_key = tde_smgr_get_key(&reln->smgr_rlocator);
tdereln->relKey = *int_key;
tdereln->encryption_status = RELATION_KEY_AVAILABLE; tdereln->encryption_status = RELATION_KEY_AVAILABLE;
pfree(int_key);
} }
int_key = &tdereln->relKey;
for (int i = 0; i < nblocks; ++i) for (int i = 0; i < nblocks; ++i)
{ {
BlockNumber bn = blocknum + i; BlockNumber bn = blocknum + i;
@ -126,9 +125,9 @@ tde_mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
local_buffers[i] = &local_blocks_aligned[i * BLCKSZ]; local_buffers[i] = &local_blocks_aligned[i * BLCKSZ];
CalcBlockIv(forknum, bn, int_key->base_iv, iv); CalcBlockIv(forknum, bn, tdereln->relKey.base_iv, iv);
AesEncrypt(int_key->key, iv, ((unsigned char **) buffers)[i], BLCKSZ, local_buffers[i]); AesEncrypt(tdereln->relKey.key, iv, ((unsigned char **) buffers)[i], BLCKSZ, local_buffers[i]);
} }
mdwritev(reln, forknum, blocknum, mdwritev(reln, forknum, blocknum,
@ -178,22 +177,22 @@ tde_mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
} }
else else
{ {
InternalKey *int_key;
unsigned char *local_blocks = palloc(BLCKSZ * (1 + 1)); unsigned char *local_blocks = palloc(BLCKSZ * (1 + 1));
unsigned char *local_blocks_aligned = (unsigned char *) TYPEALIGN(PG_IO_ALIGN_SIZE, local_blocks); unsigned char *local_blocks_aligned = (unsigned char *) TYPEALIGN(PG_IO_ALIGN_SIZE, local_blocks);
unsigned char iv[16]; unsigned char iv[16];
if (tdereln->encryption_status == RELATION_KEY_NOT_AVAILABLE) if (tdereln->encryption_status == RELATION_KEY_NOT_AVAILABLE)
{ {
tdereln->relKey = *tde_smgr_get_key(&reln->smgr_rlocator); InternalKey *int_key = tde_smgr_get_key(&reln->smgr_rlocator);
tdereln->relKey = *int_key;
tdereln->encryption_status = RELATION_KEY_AVAILABLE; tdereln->encryption_status = RELATION_KEY_AVAILABLE;
pfree(int_key);
} }
int_key = &tdereln->relKey; CalcBlockIv(forknum, blocknum, tdereln->relKey.base_iv, iv);
CalcBlockIv(forknum, blocknum, int_key->base_iv, iv); AesEncrypt(tdereln->relKey.key, iv, ((unsigned char *) buffer), BLCKSZ, local_blocks_aligned);
AesEncrypt(int_key->key, iv, ((unsigned char *) buffer), BLCKSZ, local_blocks_aligned);
mdextend(reln, forknum, blocknum, local_blocks_aligned, skipFsync); mdextend(reln, forknum, blocknum, local_blocks_aligned, skipFsync);
@ -206,7 +205,6 @@ tde_mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void **buffers, BlockNumber nblocks) void **buffers, BlockNumber nblocks)
{ {
TDESMgrRelation *tdereln = (TDESMgrRelation *) reln; TDESMgrRelation *tdereln = (TDESMgrRelation *) reln;
InternalKey *int_key;
mdreadv(reln, forknum, blocknum, buffers, nblocks); mdreadv(reln, forknum, blocknum, buffers, nblocks);
@ -214,12 +212,13 @@ tde_mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
return; return;
else if (tdereln->encryption_status == RELATION_KEY_NOT_AVAILABLE) else if (tdereln->encryption_status == RELATION_KEY_NOT_AVAILABLE)
{ {
tdereln->relKey = *tde_smgr_get_key(&reln->smgr_rlocator); InternalKey *int_key = tde_smgr_get_key(&reln->smgr_rlocator);
tdereln->relKey = *int_key;
tdereln->encryption_status = RELATION_KEY_AVAILABLE; tdereln->encryption_status = RELATION_KEY_AVAILABLE;
pfree(int_key);
} }
int_key = &tdereln->relKey;
for (int i = 0; i < nblocks; ++i) for (int i = 0; i < nblocks; ++i)
{ {
bool allZero = true; bool allZero = true;
@ -246,9 +245,9 @@ tde_mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
if (allZero) if (allZero)
continue; continue;
CalcBlockIv(forknum, bn, int_key->base_iv, iv); CalcBlockIv(forknum, bn, tdereln->relKey.base_iv, iv);
AesDecrypt(int_key->key, iv, ((unsigned char **) buffers)[i], BLCKSZ, ((unsigned char **) buffers)[i]); AesDecrypt(tdereln->relKey.key, iv, ((unsigned char **) buffers)[i], BLCKSZ, ((unsigned char **) buffers)[i]);
} }
} }
@ -292,6 +291,7 @@ tde_mdcreate(RelFileLocator relold, SMgrRelation reln, ForkNumber forknum, bool
{ {
tdereln->encryption_status = RELATION_KEY_AVAILABLE; tdereln->encryption_status = RELATION_KEY_AVAILABLE;
tdereln->relKey = *key; tdereln->relKey = *key;
pfree(key);
} }
else else
{ {

Loading…
Cancel
Save