diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index f94445bdd07..6aed998d001 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -84,7 +84,6 @@ #include "pg_trace.h" #include "pgstat.h" #include "postmaster/autovacuum.h" -#include "storage/condition_variable.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -276,12 +275,6 @@ typedef struct MultiXactStateData /* support for members anti-wraparound measures */ MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - /* - * This is used to sleep until a multixact offset is written when we want - * to create the next one. - */ - ConditionVariable nextoff_cv; - /* * Per-backend data starts here. We have two arrays stored in the area * immediately following the MultiXactStateData struct. Each is indexed by @@ -386,6 +379,9 @@ static MemoryContext MXactContext = NULL; #define debug_elog6(a,b,c,d,e,f) #endif +/* hack to deal with WAL generated with older minor versions */ +static int64 pre_initialized_offsets_page = -1; + /* internal MultiXactId management */ static void MultiXactIdSetOldestVisible(void); static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, @@ -922,13 +918,65 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int entryno; int slotno; MultiXactOffset *offptr; - int i; + MultiXactId next; + int64 next_pageno; + int next_entryno; + MultiXactOffset *next_offptr; LWLock *lock; LWLock *prevlock = NULL; + /* position of this multixid in the offsets SLRU area */ pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); + /* position of the next multixid */ + next = multi + 1; + if (next < FirstMultiXactId) + next = FirstMultiXactId; + next_pageno = MultiXactIdToOffsetPage(next); + next_entryno = MultiXactIdToOffsetEntry(next); + + /* + * Older minor versions didn't set the next multixid's offset in this + * function, and therefore didn't initialize the next page until the next + * multixid was assigned. If we're replaying WAL that was generated by + * such a version, the next page might not be initialized yet. Initialize + * it now. + */ + if (InRecovery && + next_pageno != pageno && + pg_atomic_read_u64(&MultiXactOffsetCtl->shared->latest_page_number) == pageno) + { + elog(DEBUG1, "next offsets page is not initialized, initializing it now"); + + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* Create and zero the page */ + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno); + + /* Make sure it's written out */ + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + + LWLockRelease(lock); + + /* + * Remember that we initialized the page, so that we don't zero it + * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record. + */ + pre_initialized_offsets_page = next_pageno; + } + + /* + * Set the starting offset of this multixid's members. + * + * In the common case, it was already be set by the previous + * RecordNewMultiXact call, as this was the next multixid of the previous + * multixid. But if multiple backends are generating multixids + * concurrently, we might race ahead and get called before the previous + * multixid. + */ lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); LWLockAcquire(lock, LW_EXCLUSIVE); @@ -943,22 +991,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - *offptr = offset; + if (*offptr != offset) + { + /* should already be set to the correct value, or not at all */ + Assert(*offptr == 0); + *offptr = offset; + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } + + /* + * Set the next multixid's offset to the end of this multixid's members. + */ + if (next_pageno == pageno) + { + next_offptr = offptr + 1; + } + else + { + /* must be the first entry on the page */ + Assert(next_entryno == 0 || next == FirstMultiXactId); + + /* Swap the lock for a lock on the next page */ + LWLockRelease(lock); + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next); + next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + next_offptr += next_entryno; + } - MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + if (*next_offptr != offset + nmembers) + { + /* should already be set to the correct value, or not at all */ + Assert(*next_offptr == 0); + *next_offptr = offset + nmembers; + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } /* Release MultiXactOffset SLRU lock. */ LWLockRelease(lock); - /* - * If anybody was waiting to know the offset of this multixact ID we just - * wrote, they can read it now, so wake them up. - */ - ConditionVariableBroadcast(&MultiXactState->nextoff_cv); - prev_pageno = -1; - for (i = 0; i < nmembers; i++, offset++) + for (int i = 0; i < nmembers; i++, offset++) { TransactionId *memberptr; uint32 *flagsptr; @@ -1148,8 +1224,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) result = FirstMultiXactId; } - /* Make sure there is room for the MXID in the file. */ - ExtendMultiXactOffset(result); + /* + * Make sure there is room for the next MXID in the file. Assigning this + * MXID sets the next MXID's offset already. + */ + ExtendMultiXactOffset(result + 1); /* * Reserve the members space, similarly to above. Also, be careful not to @@ -1314,7 +1393,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, MultiXactOffset nextOffset; MultiXactMember *ptr; LWLock *lock; - bool slept = false; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); @@ -1391,23 +1469,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * one's. However, there are some corner cases to worry about: * * 1. This multixact may be the latest one created, in which case there is - * no next one to look at. In this case the nextOffset value we just - * saved is the correct endpoint. + * no next one to look at. The next multixact's offset should be set + * already, as we set it in RecordNewMultiXact(), but we used to not do + * that in older minor versions. To cope with that case, if this + * multixact is the latest one created, use the nextOffset value we read + * above as the endpoint. * - * 2. The next multixact may still be in process of being filled in: that - * is, another process may have done GetNewMultiXactId but not yet written - * the offset entry for that ID. In that scenario, it is guaranteed that - * the offset entry for that multixact exists (because GetNewMultiXactId - * won't release MultiXactGenLock until it does) but contains zero - * (because we are careful to pre-zero offset pages). Because - * GetNewMultiXactId will never return zero as the starting offset for a - * multixact, when we read zero as the next multixact's offset, we know we - * have this case. We handle this by sleeping on the condition variable - * we have just for this; the process in charge will signal the CV as soon - * as it has finished writing the multixact offset. - * - * 3. Because GetNewMultiXactId increments offset zero to offset one to - * handle case #2, there is an ambiguity near the point of offset + * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero + * for to mean "unset", there is an ambiguity near the point of offset * wraparound. If we see next multixact's offset is one, is that our * multixact's actual endpoint, or did it end at zero with a subsequent * increment? We handle this using the knowledge that if the zero'th @@ -1419,7 +1488,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * cases, so it seems better than holding the MultiXactGenLock for a long * time on every multixact creation. */ -retry: pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); @@ -1482,18 +1550,10 @@ retry: nextMXOffset = *offptr; if (nextMXOffset == 0) - { - /* Corner case 2: next multixact is still being filled in */ - LWLockRelease(lock); - CHECK_FOR_INTERRUPTS(); - - INJECTION_POINT("multixact-get-members-cv-sleep", NULL); - - ConditionVariableSleep(&MultiXactState->nextoff_cv, - WAIT_EVENT_MULTIXACT_CREATION); - slept = true; - goto retry; - } + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has invalid next offset", + multi))); length = nextMXOffset - offset; } @@ -1501,12 +1561,6 @@ retry: LWLockRelease(lock); lock = NULL; - /* - * If we slept above, clean up state; it's no longer needed. - */ - if (slept) - ConditionVariableCancelSleep(); - ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); truelength = 0; @@ -1549,7 +1603,7 @@ retry: if (!TransactionIdIsValid(*xactptr)) { - /* Corner case 3: we must be looking at unused slot zero */ + /* Corner case 2: we must be looking at unused slot zero */ Assert(offset == 0); continue; } @@ -1996,7 +2050,6 @@ MultiXactShmemInit(void) /* Make sure we zero out the per-backend state */ MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); - ConditionVariableInit(&MultiXactState->nextoff_cv); } else Assert(found); @@ -2203,26 +2256,34 @@ TrimMultiXact(void) pageno); /* - * Zero out the remainder of the current offsets page. See notes in - * TrimCLOG() for background. Unlike CLOG, some WAL record covers every - * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL - * rule "write xlog before data," nextMXact successors may carry obsolete, - * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() - * operates normally. + * Set the offset of nextMXact on the offsets page. This is normally done + * in RecordNewMultiXact() of the previous multixact, but we used to not + * do that in older minor versions. To ensure that the next offset is set + * if the binary was just upgraded from an older minor version, do it now. + * + * Zero out the remainder of the page. See notes in TrimCLOG() for + * background. Unlike CLOG, some WAL record covers every pg_multixact + * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write + * xlog before data," nextMXact successors may carry obsolete, nonzero + * offset values. */ entryno = MultiXactIdToOffsetEntry(nextMXact); - if (entryno != 0) { int slotno; MultiXactOffset *offptr; LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); LWLockAcquire(lock, LW_EXCLUSIVE); - slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); + if (entryno == 0) + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + else + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + *offptr = offset; + if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ) + MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset)); MultiXactOffsetCtl->shared->page_dirty[slotno] = true; LWLockRelease(lock); @@ -3407,14 +3468,24 @@ multixact_redo(XLogReaderState *record) memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); + /* + * Skip the record if we already initialized the page at the previous + * XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact(). + */ + if (pre_initialized_offsets_page != pageno) + { + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); - slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - LWLockRelease(lock); + LWLockRelease(lock); + } + else + elog(DEBUG1, "skipping initialization of offsets page " INT64_FORMAT " because it was already initialized on multixid creation", pageno); + pre_initialized_offsets_page = -1; } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { @@ -3440,6 +3511,22 @@ multixact_redo(XLogReaderState *record) TransactionId max_xid; int i; + if (pre_initialized_offsets_page != -1) + { + /* + * If we implicitly initialized the next offsets page while + * replaying an XLOG_MULTIXACT_CREATE_ID record that was generated + * with an older minor version, we still expect to see an + * XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other + * XLOG_MULTIXACT_CREATE_ID records. Therefore this case should + * not happen. If it does, we'll continue with the replay, but + * log a message to note that something's funny. + */ + elog(LOG, "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page " INT64_FORMAT " that was implicitly initialized earlier", + pre_initialized_offsets_page); + pre_initialized_offsets_page = -1; + } + /* Store the data back into the SLRU files */ RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, xlrec->members); diff --git a/src/test/modules/test_slru/t/001_multixact.pl b/src/test/modules/test_slru/t/001_multixact.pl index e2b567a603d..7837eb810f0 100644 --- a/src/test/modules/test_slru/t/001_multixact.pl +++ b/src/test/modules/test_slru/t/001_multixact.pl @@ -1,10 +1,6 @@ # Copyright (c) 2024-2025, PostgreSQL Global Development Group -# This test verifies edge case of reading a multixact: -# when we have multixact that is followed by exactly one another multixact, -# and another multixact have no offset yet, we must wait until this offset -# becomes observable. Previously we used to wait for 1ms in a loop in this -# case, but now we use CV for this. This test is exercising such a sleep. +# Test multixid corner cases. use strict; use warnings FATAL => 'all'; @@ -19,9 +15,7 @@ if ($ENV{enable_injection_points} ne 'yes') plan skip_all => 'Injection points not supported by this build'; } -my ($node, $result); - -$node = PostgreSQL::Test::Cluster->new('mike'); +my $node = PostgreSQL::Test::Cluster->new('main'); $node->init; $node->append_conf('postgresql.conf', "shared_preload_libraries = 'test_slru,injection_points'"); @@ -29,95 +23,47 @@ $node->start; $node->safe_psql('postgres', q(CREATE EXTENSION injection_points)); $node->safe_psql('postgres', q(CREATE EXTENSION test_slru)); -# Test for Multixact generation edge case -$node->safe_psql('postgres', - q{select injection_points_attach('test-multixact-read','wait')}); -$node->safe_psql('postgres', - q{select injection_points_attach('multixact-get-members-cv-sleep','wait')} -); +# This test creates three multixacts. The middle one is never +# WAL-logged or recorded on the offsets page, because we pause the +# backend and crash the server before that. After restart, verify that +# the other multixacts are readable, despite the middle one being +# lost. -# This session must observe sleep on the condition variable while generating a -# multixact. To achieve this it first will create a multixact, then pause -# before reading it. -my $observer = $node->background_psql('postgres'); - -# This query will create a multixact, and hang just before reading it. -$observer->query_until( - qr/start/, - q{ - \echo start - SELECT test_read_multixact(test_create_multixact()); -}); -$node->wait_for_event('client backend', 'test-multixact-read'); - -# This session will create the next Multixact. This is necessary to avoid -# multixact.c's non-sleeping edge case 1. -my $creator = $node->background_psql('postgres'); +# Create the first multixact +my $bg_psql = $node->background_psql('postgres'); +my $multi1 = $bg_psql->query_safe(q(SELECT test_create_multixact();)); + +# Assign the middle multixact. Use an injection point to prevent it +# from being fully recorded. $node->safe_psql('postgres', q{SELECT injection_points_attach('multixact-create-from-members','wait');} ); -# We expect this query to hang in the critical section after generating new -# multixact, but before filling its offset into SLRU. -# Running an injection point inside a critical section requires it to be -# loaded beforehand. -$creator->query_until( - qr/start/, q{ - \echo start +$bg_psql->query_until( + qr/assigning lost multi/, q( +\echo assigning lost multi SELECT test_create_multixact(); -}); +)); $node->wait_for_event('client backend', 'multixact-create-from-members'); - -# Ensure we have the backends waiting that we expect -is( $node->safe_psql( - 'postgres', - q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event) - FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'} - ), - 'multixact-create-from-members, test-multixact-read', - "matching injection point waits"); - -# Now wake observer to get it to read the initial multixact. A subsequent -# multixact already exists, but that one doesn't have an offset assigned, so -# this will hit multixact.c's edge case 2. -$node->safe_psql('postgres', - q{SELECT injection_points_wakeup('test-multixact-read')}); -$node->wait_for_event('client backend', 'multixact-get-members-cv-sleep'); - -# Ensure we have the backends waiting that we expect -is( $node->safe_psql( - 'postgres', - q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event) - FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'} - ), - 'multixact-create-from-members, multixact-get-members-cv-sleep', - "matching injection point waits"); - -# Now we have two backends waiting in multixact-create-from-members and -# multixact-get-members-cv-sleep. Also we have 3 injections points set to wait. -# If we wakeup multixact-get-members-cv-sleep it will happen again, so we must -# detach it first. So let's detach all injection points, then wake up all -# backends. - -$node->safe_psql('postgres', - q{SELECT injection_points_detach('test-multixact-read')}); $node->safe_psql('postgres', q{SELECT injection_points_detach('multixact-create-from-members')}); -$node->safe_psql('postgres', - q{SELECT injection_points_detach('multixact-get-members-cv-sleep')}); -$node->safe_psql('postgres', - q{SELECT injection_points_wakeup('multixact-create-from-members')}); -$node->safe_psql('postgres', - q{SELECT injection_points_wakeup('multixact-get-members-cv-sleep')}); +# Create the third multixid +my $multi2 = $node->safe_psql('postgres', q{SELECT test_create_multixact();}); + +# All set and done, it's time for hard restart +$node->stop('immediate'); +$node->start; +$bg_psql->{run}->finish; -# Background psql will now be able to read the result and disconnect. -$observer->quit; -$creator->quit; +# Verify that the recorded multixids are readable +is( $node->safe_psql('postgres', qq{SELECT test_read_multixact('$multi1');}), + '', + 'first recorded multi is readable'); -$node->stop; +is( $node->safe_psql('postgres', qq{SELECT test_read_multixact('$multi2');}), + '', + 'second recorded multi is readable'); -# If we reached this point - everything is OK. -ok(1); done_testing(); diff --git a/src/test/modules/test_slru/test_multixact.c b/src/test/modules/test_slru/test_multixact.c index 6c9b0420717..8fb6c19d70f 100644 --- a/src/test/modules/test_slru/test_multixact.c +++ b/src/test/modules/test_slru/test_multixact.c @@ -17,7 +17,6 @@ #include "access/multixact.h" #include "access/xact.h" #include "fmgr.h" -#include "utils/injection_point.h" PG_FUNCTION_INFO_V1(test_create_multixact); PG_FUNCTION_INFO_V1(test_read_multixact); @@ -37,8 +36,7 @@ test_create_multixact(PG_FUNCTION_ARGS) } /* - * Reads given multixact after running an injection point. Discards local cache - * to make a real read. Tailored for multixact testing. + * Reads given multixact. Discards local cache to make a real read. */ Datum test_read_multixact(PG_FUNCTION_ARGS) @@ -46,7 +44,6 @@ test_read_multixact(PG_FUNCTION_ARGS) MultiXactId id = PG_GETARG_TRANSACTIONID(0); MultiXactMember *members; - INJECTION_POINT("test-multixact-read", NULL); /* discard caches */ AtEOXact_MultiXact();