@ -84,7 +84,6 @@
# include "pg_trace.h"
# include "pgstat.h"
# include "postmaster/autovacuum.h"
# include "storage/condition_variable.h"
# include "storage/pmsignal.h"
# include "storage/proc.h"
# include "storage/procarray.h"
@ -276,12 +275,6 @@ typedef struct MultiXactStateData
/* support for members anti-wraparound measures */
MultiXactOffset offsetStopLimit ; /* known if oldestOffsetKnown */
/*
* This is used to sleep until a multixact offset is written when we want
* to create the next one .
*/
ConditionVariable nextoff_cv ;
/*
* Per - backend data starts here . We have two arrays stored in the area
* immediately following the MultiXactStateData struct . Each is indexed by
@ -386,6 +379,9 @@ static MemoryContext MXactContext = NULL;
# define debug_elog6(a,b,c,d,e,f)
# endif
/*
 * Hack to deal with WAL generated with older minor versions.
 *
 * Holds the number of the offsets page that RecordNewMultiXact() zeroed on
 * its own during recovery, or -1 if there is no such page.  multixact_redo()
 * compares it against the page named in an XLOG_MULTIXACT_ZERO_OFF_PAGE
 * record so that the page is not zeroed a second time, and resets it to -1
 * once that record (or the next XLOG_MULTIXACT_CREATE_ID record) is replayed.
 */
static int64 pre_initialized_offsets_page = - 1 ;
/* internal MultiXactId management */
static void MultiXactIdSetOldestVisible ( void ) ;
static void RecordNewMultiXact ( MultiXactId multi , MultiXactOffset offset ,
@ -922,13 +918,65 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
int entryno ;
int slotno ;
MultiXactOffset * offptr ;
int i ;
MultiXactId next ;
int64 next_pageno ;
int next_entryno ;
MultiXactOffset * next_offptr ;
LWLock * lock ;
LWLock * prevlock = NULL ;
/* position of this multixid in the offsets SLRU area */
pageno = MultiXactIdToOffsetPage ( multi ) ;
entryno = MultiXactIdToOffsetEntry ( multi ) ;
/* position of the next multixid */
next = multi + 1 ;
if ( next < FirstMultiXactId )
next = FirstMultiXactId ;
next_pageno = MultiXactIdToOffsetPage ( next ) ;
next_entryno = MultiXactIdToOffsetEntry ( next ) ;
/*
* Older minor versions didn ' t set the next multixid ' s offset in this
* function , and therefore didn ' t initialize the next page until the next
* multixid was assigned . If we ' re replaying WAL that was generated by
* such a version , the next page might not be initialized yet . Initialize
* it now .
*/
if ( InRecovery & &
next_pageno ! = pageno & &
pg_atomic_read_u64 ( & MultiXactOffsetCtl - > shared - > latest_page_number ) = = pageno )
{
elog ( DEBUG1 , " next offsets page is not initialized, initializing it now " ) ;
lock = SimpleLruGetBankLock ( MultiXactOffsetCtl , next_pageno ) ;
LWLockAcquire ( lock , LW_EXCLUSIVE ) ;
/* Create and zero the page */
slotno = SimpleLruZeroPage ( MultiXactOffsetCtl , next_pageno ) ;
/* Make sure it's written out */
SimpleLruWritePage ( MultiXactOffsetCtl , slotno ) ;
Assert ( ! MultiXactOffsetCtl - > shared - > page_dirty [ slotno ] ) ;
LWLockRelease ( lock ) ;
/*
* Remember that we initialized the page , so that we don ' t zero it
* again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record .
*/
pre_initialized_offsets_page = next_pageno ;
}
/*
* Set the starting offset of this multixid ' s members .
*
* In the common case , it has already been set by the previous
* RecordNewMultiXact call , as this was the next multixid of the previous
* multixid . But if multiple backends are generating multixids
* concurrently , we might race ahead and get called before the previous
* multixid .
*/
lock = SimpleLruGetBankLock ( MultiXactOffsetCtl , pageno ) ;
LWLockAcquire ( lock , LW_EXCLUSIVE ) ;
@ -943,22 +991,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
offptr = ( MultiXactOffset * ) MultiXactOffsetCtl - > shared - > page_buffer [ slotno ] ;
offptr + = entryno ;
* offptr = offset ;
if ( * offptr ! = offset )
{
/* should already be set to the correct value, or not at all */
Assert ( * offptr = = 0 ) ;
* offptr = offset ;
MultiXactOffsetCtl - > shared - > page_dirty [ slotno ] = true ;
}
/*
* Set the next multixid ' s offset to the end of this multixid ' s members .
*/
if ( next_pageno = = pageno )
{
next_offptr = offptr + 1 ;
}
else
{
/* must be the first entry on the page */
Assert ( next_entryno = = 0 | | next = = FirstMultiXactId ) ;
/* Swap the lock for a lock on the next page */
LWLockRelease ( lock ) ;
lock = SimpleLruGetBankLock ( MultiXactOffsetCtl , next_pageno ) ;
LWLockAcquire ( lock , LW_EXCLUSIVE ) ;
slotno = SimpleLruReadPage ( MultiXactOffsetCtl , next_pageno , true , next ) ;
next_offptr = ( MultiXactOffset * ) MultiXactOffsetCtl - > shared - > page_buffer [ slotno ] ;
next_offptr + = next_entryno ;
}
MultiXactOffsetCtl - > shared - > page_dirty [ slotno ] = true ;
if ( * next_offptr ! = offset + nmembers )
{
/* should already be set to the correct value, or not at all */
Assert ( * next_offptr = = 0 ) ;
* next_offptr = offset + nmembers ;
MultiXactOffsetCtl - > shared - > page_dirty [ slotno ] = true ;
}
/* Release MultiXactOffset SLRU lock. */
LWLockRelease ( lock ) ;
/*
* If anybody was waiting to know the offset of this multixact ID we just
* wrote , they can read it now , so wake them up .
*/
ConditionVariableBroadcast ( & MultiXactState - > nextoff_cv ) ;
prev_pageno = - 1 ;
for ( i = 0 ; i < nmembers ; i + + , offset + + )
for ( int i = 0 ; i < nmembers ; i + + , offset + + )
{
TransactionId * memberptr ;
uint32 * flagsptr ;
@ -1148,8 +1224,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
result = FirstMultiXactId ;
}
/* Make sure there is room for the MXID in the file. */
ExtendMultiXactOffset ( result ) ;
/*
* Make sure there is room for the next MXID in the file . Assigning this
* MXID sets the next MXID ' s offset already .
*/
ExtendMultiXactOffset ( result + 1 ) ;
/*
* Reserve the members space , similarly to above . Also , be careful not to
@ -1314,7 +1393,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
MultiXactOffset nextOffset ;
MultiXactMember * ptr ;
LWLock * lock ;
bool slept = false ;
debug_elog3 ( DEBUG2 , " GetMembers: asked for %u " , multi ) ;
@ -1391,23 +1469,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
* one ' s . However , there are some corner cases to worry about :
*
* 1. This multixact may be the latest one created , in which case there is
* no next one to look at . In this case the nextOffset value we just
* saved is the correct endpoint .
* no next one to look at . The next multixact ' s offset should be set
* already , as we set it in RecordNewMultiXact ( ) , but we used to not do
* that in older minor versions . To cope with that case , if this
* multixact is the latest one created , use the nextOffset value we read
* above as the endpoint .
*
* 2. The next multixact may still be in process of being filled in : that
* is , another process may have done GetNewMultiXactId but not yet written
* the offset entry for that ID . In that scenario , it is guaranteed that
* the offset entry for that multixact exists ( because GetNewMultiXactId
* won ' t release MultiXactGenLock until it does ) but contains zero
* ( because we are careful to pre - zero offset pages ) . Because
* GetNewMultiXactId will never return zero as the starting offset for a
* multixact , when we read zero as the next multixact ' s offset , we know we
* have this case . We handle this by sleeping on the condition variable
* we have just for this ; the process in charge will signal the CV as soon
* as it has finished writing the multixact offset .
*
* 3. Because GetNewMultiXactId increments offset zero to offset one to
* handle case # 2 , there is an ambiguity near the point of offset
* 2. Because GetNewMultiXactId skips over offset zero , reserving zero
* to mean " unset " , there is an ambiguity near the point of offset
* wraparound . If we see next multixact ' s offset is one , is that our
* multixact ' s actual endpoint , or did it end at zero with a subsequent
* increment ? We handle this using the knowledge that if the zero ' th
@ -1419,7 +1488,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
* cases , so it seems better than holding the MultiXactGenLock for a long
* time on every multixact creation .
*/
retry :
pageno = MultiXactIdToOffsetPage ( multi ) ;
entryno = MultiXactIdToOffsetEntry ( multi ) ;
@ -1482,18 +1550,10 @@ retry:
nextMXOffset = * offptr ;
if ( nextMXOffset = = 0 )
{
/* Corner case 2: next multixact is still being filled in */
LWLockRelease ( lock ) ;
CHECK_FOR_INTERRUPTS ( ) ;
INJECTION_POINT ( " multixact-get-members-cv-sleep " , NULL ) ;
ConditionVariableSleep ( & MultiXactState - > nextoff_cv ,
WAIT_EVENT_MULTIXACT_CREATION ) ;
slept = true ;
goto retry ;
}
ereport ( ERROR ,
( errcode ( ERRCODE_DATA_CORRUPTED ) ,
errmsg ( " MultiXact %u has invalid next offset " ,
multi ) ) ) ;
length = nextMXOffset - offset ;
}
@ -1501,12 +1561,6 @@ retry:
LWLockRelease ( lock ) ;
lock = NULL ;
/*
* If we slept above , clean up state ; it ' s no longer needed .
*/
if ( slept )
ConditionVariableCancelSleep ( ) ;
ptr = ( MultiXactMember * ) palloc ( length * sizeof ( MultiXactMember ) ) ;
truelength = 0 ;
@ -1549,7 +1603,7 @@ retry:
if ( ! TransactionIdIsValid ( * xactptr ) )
{
/* Corner case 3 : we must be looking at unused slot zero */
/* Corner case 2 : we must be looking at unused slot zero */
Assert ( offset = = 0 ) ;
continue ;
}
@ -1996,7 +2050,6 @@ MultiXactShmemInit(void)
/* Make sure we zero out the per-backend state */
MemSet ( MultiXactState , 0 , SHARED_MULTIXACT_STATE_SIZE ) ;
ConditionVariableInit ( & MultiXactState - > nextoff_cv ) ;
}
else
Assert ( found ) ;
@ -2203,26 +2256,34 @@ TrimMultiXact(void)
pageno ) ;
/*
* Zero out the remainder of the current offsets page . See notes in
* TrimCLOG ( ) for background . Unlike CLOG , some WAL record covers every
* pg_multixact SLRU mutation . Since , also unlike CLOG , we ignore the WAL
* rule " write xlog before data, " nextMXact successors may carry obsolete ,
* nonzero offset values . Zero those so case 2 of GetMultiXactIdMembers ( )
* operates normally .
* Set the offset of nextMXact on the offsets page . This is normally done
* in RecordNewMultiXact ( ) of the previous multixact , but we used to not
* do that in older minor versions . To ensure that the next offset is set
* if the binary was just upgraded from an older minor version , do it now .
*
* Zero out the remainder of the page . See notes in TrimCLOG ( ) for
* background . Unlike CLOG , some WAL record covers every pg_multixact
* SLRU mutation . Since , also unlike CLOG , we ignore the WAL rule " write
* xlog before data , " nextMXact successors may carry obsolete, nonzero
* offset values .
*/
entryno = MultiXactIdToOffsetEntry ( nextMXact ) ;
if ( entryno ! = 0 )
{
int slotno ;
MultiXactOffset * offptr ;
LWLock * lock = SimpleLruGetBankLock ( MultiXactOffsetCtl , pageno ) ;
LWLockAcquire ( lock , LW_EXCLUSIVE ) ;
slotno = SimpleLruReadPage ( MultiXactOffsetCtl , pageno , true , nextMXact ) ;
if ( entryno = = 0 )
slotno = SimpleLruZeroPage ( MultiXactOffsetCtl , pageno ) ;
else
slotno = SimpleLruReadPage ( MultiXactOffsetCtl , pageno , true , nextMXact ) ;
offptr = ( MultiXactOffset * ) MultiXactOffsetCtl - > shared - > page_buffer [ slotno ] ;
offptr + = entryno ;
MemSet ( offptr , 0 , BLCKSZ - ( entryno * sizeof ( MultiXactOffset ) ) ) ;
* offptr = offset ;
if ( entryno ! = 0 & & ( entryno + 1 ) * sizeof ( MultiXactOffset ) ! = BLCKSZ )
MemSet ( offptr + 1 , 0 , BLCKSZ - ( entryno + 1 ) * sizeof ( MultiXactOffset ) ) ;
MultiXactOffsetCtl - > shared - > page_dirty [ slotno ] = true ;
LWLockRelease ( lock ) ;
@ -3407,14 +3468,24 @@ multixact_redo(XLogReaderState *record)
memcpy ( & pageno , XLogRecGetData ( record ) , sizeof ( pageno ) ) ;
lock = SimpleLruGetBankLock ( MultiXactOffsetCtl , pageno ) ;
LWLockAcquire ( lock , LW_EXCLUSIVE ) ;
/*
* Skip the record if we already initialized the page at the previous
* XLOG_MULTIXACT_CREATE_ID record . See RecordNewMultiXact ( ) .
*/
if ( pre_initialized_offsets_page ! = pageno )
{
lock = SimpleLruGetBankLock ( MultiXactOffsetCtl , pageno ) ;
LWLockAcquire ( lock , LW_EXCLUSIVE ) ;
slotno = ZeroMultiXactOffsetPage ( pageno , false ) ;
SimpleLruWritePage ( MultiXactOffsetCtl , slotno ) ;
Assert ( ! MultiXactOffsetCtl - > shared - > page_dirty [ slotno ] ) ;
slotno = ZeroMultiXactOffsetPage ( pageno , false ) ;
SimpleLruWritePage ( MultiXactOffsetCtl , slotno ) ;
Assert ( ! MultiXactOffsetCtl - > shared - > page_dirty [ slotno ] ) ;
LWLockRelease ( lock ) ;
LWLockRelease ( lock ) ;
}
else
elog ( DEBUG1 , " skipping initialization of offsets page " INT64_FORMAT " because it was already initialized on multixid creation " , pageno ) ;
pre_initialized_offsets_page = - 1 ;
}
else if ( info = = XLOG_MULTIXACT_ZERO_MEM_PAGE )
{
@ -3440,6 +3511,22 @@ multixact_redo(XLogReaderState *record)
TransactionId max_xid ;
int i ;
if ( pre_initialized_offsets_page ! = - 1 )
{
/*
* If we implicitly initialized the next offsets page while
* replaying an XLOG_MULTIXACT_CREATE_ID record that was generated
* with an older minor version , we still expect to see an
* XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other
* XLOG_MULTIXACT_CREATE_ID records . Therefore this case should
* not happen . If it does , we ' ll continue with the replay , but
* log a message to note that something ' s funny .
*/
elog ( LOG , " expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page " INT64_FORMAT " that was implicitly initialized earlier " ,
pre_initialized_offsets_page ) ;
pre_initialized_offsets_page = - 1 ;
}
/* Store the data back into the SLRU files */
RecordNewMultiXact ( xlrec - > mid , xlrec - > moff , xlrec - > nmembers ,
xlrec - > members ) ;