@ -559,6 +559,16 @@ typedef struct XLogCtlData
slock_t info_lck ; /* locks shared variables shown above */
} XLogCtlData ;
/*
 * WalInsertClass -- how XLogInsertRecord must treat a given record.
 *
 * XLogInsertRecord starts every record out as WALINSERT_NORMAL and only
 * switches to one of the special classes for certain RM_XLOG_ID records.
 */
typedef enum
{
	/* Any record not singled out below; uses the WALInsertLockAcquire() path */
	WALINSERT_NORMAL,

	/* XLOG_SWITCH record; inserted while holding all WAL insertion locks */
	WALINSERT_SPECIAL_SWITCH,

	/*
	 * XLOG_CHECKPOINT_REDO record; inserted while holding all WAL insertion
	 * locks, and its start position is stored as the new RedoRecPtr
	 */
	WALINSERT_SPECIAL_CHECKPOINT
} WalInsertClass;
static XLogCtlData * XLogCtl = NULL ;
/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
@ -739,13 +749,21 @@ XLogInsertRecord(XLogRecData *rdata,
bool inserted ;
XLogRecord * rechdr = ( XLogRecord * ) rdata - > data ;
uint8 info = rechdr - > xl_info & ~ XLR_INFO_MASK ;
bool isLogSwitch = ( rechdr - > xl_rmid = = RM_XLOG_ID & &
info = = XLOG_SWITCH ) ;
WalInsertClass class = WALINSERT_NORMAL ;
XLogRecPtr StartPos ;
XLogRecPtr EndPos ;
bool prevDoPageWrites = doPageWrites ;
TimeLineID insertTLI ;
/* Does this record type require special handling? */
if ( unlikely ( rechdr - > xl_rmid = = RM_XLOG_ID ) )
{
if ( info = = XLOG_SWITCH )
class = WALINSERT_SPECIAL_SWITCH ;
else if ( info = = XLOG_CHECKPOINT_REDO )
class = WALINSERT_SPECIAL_CHECKPOINT ;
}
/* we assume that all of the record header is in the first chunk */
Assert ( rdata - > len > = SizeOfXLogRecord ) ;
@ -793,7 +811,7 @@ XLogInsertRecord(XLogRecData *rdata,
*/
START_CRIT_SECTION ( ) ;
if ( likely ( ! isLogSwitch ) )
if ( likely ( class = = WALINSERT_NORMAL ) )
{
WALInsertLockAcquire ( ) ;
@ -843,7 +861,7 @@ XLogInsertRecord(XLogRecData *rdata,
/* Normal records are always inserted. */
inserted = true ;
}
else
else if ( class = = WALINSERT_SPECIAL_SWITCH )
{
/*
* In order to insert an XLOG_SWITCH record , we need to hold all of
@ -852,14 +870,32 @@ XLogInsertRecord(XLogRecData *rdata,
* remains in the current WAL segment and claimed all of it .
*
* Nonetheless , this case is simpler than the normal cases handled
* above , which must check for changes in doPageWrites and RedoRecPtr .
* Those checks are only needed for records that can contain
* full - page images , and an XLOG_SWITCH record never does .
* above , which must check for changes in doPageWrites and RedoRecPtr .
* Those checks are only needed for records that can contain buffer
* referenc es, and an XLOG_SWITCH record never does .
*/
Assert ( fpw_lsn = = InvalidXLogRecPtr ) ;
WALInsertLockAcquireExclusive ( ) ;
inserted = ReserveXLogSwitch ( & StartPos , & EndPos , & rechdr - > xl_prev ) ;
}
else
{
Assert ( class = = WALINSERT_SPECIAL_CHECKPOINT ) ;
/*
* We need to update both the local and shared copies of RedoRecPtr ,
* which means that we need to hold all the WAL insertion locks .
* However , there can ' t be any buffer references , so as above , we need
* not check RedoRecPtr before inserting the record ; we just need to
* update it afterwards .
*/
Assert ( fpw_lsn = = InvalidXLogRecPtr ) ;
WALInsertLockAcquireExclusive ( ) ;
ReserveXLogInsertLocation ( rechdr - > xl_tot_len , & StartPos , & EndPos ,
& rechdr - > xl_prev ) ;
RedoRecPtr = Insert - > RedoRecPtr = StartPos ;
inserted = true ;
}
if ( inserted )
{
@ -876,7 +912,8 @@ XLogInsertRecord(XLogRecData *rdata,
* All the record data , including the header , is now ready to be
* inserted . Copy the record in the space reserved .
*/
CopyXLogRecordToWAL ( rechdr - > xl_tot_len , isLogSwitch , rdata ,
CopyXLogRecordToWAL ( rechdr - > xl_tot_len ,
class = = WALINSERT_SPECIAL_SWITCH , rdata ,
StartPos , EndPos , insertTLI ) ;
/*
@ -935,7 +972,7 @@ XLogInsertRecord(XLogRecData *rdata,
* padding space that fills the rest of the segment , and perform
* end - of - segment actions ( eg , notifying archiver ) .
*/
if ( isLogSwitch )
if ( class = = WALINSERT_SPECIAL_SWITCH )
{
TRACE_POSTGRESQL_WAL_SWITCH ( ) ;
XLogFlush ( EndPos ) ;
@ -1054,8 +1091,12 @@ XLogInsertRecord(XLogRecData *rdata,
*
* NB : The space calculation here must match the code in CopyXLogRecordToWAL ,
* where we actually copy the record to the reserved space .
*
* NB : Testing shows that XLogInsertRecord runs faster if this code is inlined ;
* however , because there are two call sites , the compiler is reluctant to
* inline . We use pg_attribute_always_inline here to try to convince it .
*/
static void
static pg_attribute_always_inline void
ReserveXLogInsertLocation ( int size , XLogRecPtr * StartPos , XLogRecPtr * EndPos ,
XLogRecPtr * PrevPtr )
{
@ -6475,17 +6516,22 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset)
* In particular note that this routine is synchronous and does not pay
* attention to CHECKPOINT_WAIT .
*
* If ! shutdown then we are writing an online checkpoint . This is a very special
* kind of operation and WAL record because the checkpoint action occurs over
* a period of time yet logically occurs at just a single LSN . The logical
* position of the WAL record ( redo ptr ) is the same or earlier than the
* physical position . When we replay WAL we locate the checkpoint via its
* physical position then read the redo ptr and actually start replay at the
* earlier logical position . Note that we don ' t write * anything * to WAL at
* the logical position , so that location could be any other kind of WAL record .
* All of this mechanism allows us to continue working while we checkpoint .
* As a result , the timing of actions is critical here ; note that
* this function will likely take minutes to execute on a busy system .
* If ! shutdown then we are writing an online checkpoint . An XLOG_CHECKPOINT_REDO
* record is inserted into WAL at the logical location of the checkpoint , before
* flushing anything to disk . Once the checkpoint is eventually completed ,
* it is from this point that WAL replay will begin in the case of a recovery
* from this checkpoint . Once everything is written to disk , an
* XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint , and
* points back to the earlier XLOG_CHECKPOINT_REDO record . This mechanism allows
* other write - ahead log records to be written while the checkpoint is in
* progress , but we must be very careful about order of operations . This function
* may take many minutes to execute on a busy system .
*
* On the other hand , when shutdown is true , concurrent insertion into the
* write - ahead log is impossible , so there is no need for two separate records .
* In this case , we only insert an XLOG_CHECKPOINT_SHUTDOWN record , and it ' s
* both the record marking the completion of the checkpoint and the location
* from which WAL replay would begin if needed .
*/
void
CreateCheckPoint ( int flags )
@ -6497,7 +6543,6 @@ CreateCheckPoint(int flags)
XLogCtlInsert * Insert = & XLogCtl - > Insert ;
uint32 freespace ;
XLogRecPtr PriorRedoPtr ;
XLogRecPtr curInsert ;
XLogRecPtr last_important_lsn ;
VirtualTransactionId * vxids ;
int nvxids ;
@ -6567,13 +6612,6 @@ CreateCheckPoint(int flags)
*/
last_important_lsn = GetLastImportantRecPtr ( ) ;
/*
* We must block concurrent insertions while examining insert state to
* determine the checkpoint REDO pointer .
*/
WALInsertLockAcquireExclusive ( ) ;
curInsert = XLogBytePosToRecPtr ( Insert - > CurrBytePos ) ;
/*
* If this isn ' t a shutdown or forced checkpoint , and if there has been no
* WAL activity requiring a checkpoint , skip it . The idea here is to
@ -6584,7 +6622,6 @@ CreateCheckPoint(int flags)
{
if ( last_important_lsn = = ControlFile - > checkPoint )
{
WALInsertLockRelease ( ) ;
END_CRIT_SECTION ( ) ;
ereport ( DEBUG1 ,
( errmsg_internal ( " checkpoint skipped because system is idle " ) ) ) ;
@ -6606,38 +6643,47 @@ CreateCheckPoint(int flags)
else
checkPoint . PrevTimeLineID = checkPoint . ThisTimeLineID ;
checkPoint . fullPageWrites = Insert - > fullPageWrites ;
/*
* Compute new REDO record ptr = location of next XLOG record .
*
* NB : this is NOT necessarily where the checkpoint record itself will be ,
* since other backends may insert more XLOG records while we ' re off doing
* the buffer flush work . Those XLOG records are logically after the
* checkpoint , even though physically before it . Got that ?
* We must block concurrent insertions while examining insert state .
*/
freespace = INSERT_FREESPACE ( curInsert ) ;
if ( freespace = = 0 )
WALInsertLockAcquireExclusive ( ) ;
checkPoint . fullPageWrites = Insert - > fullPageWrites ;
if ( shutdown )
{
if ( XLogSegmentOffset ( curInsert , wal_segment_size ) = = 0 )
curInsert + = SizeOfXLogLongPHD ;
else
curInsert + = SizeOfXLogShortPHD ;
}
checkPoint . redo = curInsert ;
XLogRecPtr curInsert = XLogBytePosToRecPtr ( Insert - > CurrBytePos ) ;
/*
* Here we update the shared RedoRecPtr for future XLogInsert calls ; this
* must be done while holding all the insertion locks .
*
* Note : if we fail to complete the checkpoint , RedoRecPtr will be left
* pointing past where it really needs to point . This is okay ; the only
* consequence is that XLogInsert might back up whole buffers that it
* didn ' t really need to . We can ' t postpone advancing RedoRecPtr because
* XLogInserts that happen while we are dumping buffers must assume that
* their buffer changes are not included in the checkpoint .
*/
RedoRecPtr = XLogCtl - > Insert . RedoRecPtr = checkPoint . redo ;
/*
* Compute new REDO record ptr = location of next XLOG record .
*
* Since this is a shutdown checkpoint , there can ' t be any concurrent
* WAL insertion .
*/
freespace = INSERT_FREESPACE ( curInsert ) ;
if ( freespace = = 0 )
{
if ( XLogSegmentOffset ( curInsert , wal_segment_size ) = = 0 )
curInsert + = SizeOfXLogLongPHD ;
else
curInsert + = SizeOfXLogShortPHD ;
}
checkPoint . redo = curInsert ;
/*
* Here we update the shared RedoRecPtr for future XLogInsert calls ;
* this must be done while holding all the insertion locks .
*
* Note : if we fail to complete the checkpoint , RedoRecPtr will be
* left pointing past where it really needs to point . This is okay ;
* the only consequence is that XLogInsert might back up whole buffers
* that it didn ' t really need to . We can ' t postpone advancing
* RedoRecPtr because XLogInserts that happen while we are dumping
* buffers must assume that their buffer changes are not included in
* the checkpoint .
*/
RedoRecPtr = XLogCtl - > Insert . RedoRecPtr = checkPoint . redo ;
}
/*
* Now we can release the WAL insertion locks , allowing other xacts to
@ -6645,6 +6691,33 @@ CreateCheckPoint(int flags)
*/
WALInsertLockRelease ( ) ;
/*
* If this is an online checkpoint , we have not yet determined the redo
* point . We do so now by inserting the special XLOG_CHECKPOINT_REDO
* record ; the LSN at which it starts becomes the new redo pointer . We
* don ' t do this for a shutdown checkpoint , because in that case no WAL
* can be written between the redo point and the insertion of the
* checkpoint record itself , so the checkpoint record itself serves to
* mark the redo point .
*/
if ( ! shutdown )
{
int dummy = 0 ;
/* Record must have payload to avoid assertion failure. */
XLogBeginInsert ( ) ;
XLogRegisterData ( ( char * ) & dummy , sizeof ( dummy ) ) ;
( void ) XLogInsert ( RM_XLOG_ID , XLOG_CHECKPOINT_REDO ) ;
/*
* XLogInsertRecord will have updated XLogCtl - > Insert . RedoRecPtr in
* shared memory and RedoRecPtr in backend - local memory , but we need
* to copy that into the record that will be inserted when the
* checkpoint is complete .
*/
checkPoint . redo = RedoRecPtr ;
}
/* Update the info_lck-protected copy of RedoRecPtr as well */
SpinLockAcquire ( & XLogCtl - > info_lck ) ;
XLogCtl - > RedoRecPtr = checkPoint . redo ;
@ -8105,6 +8178,10 @@ xlog_redo(XLogReaderState *record)
/* Keep track of full_page_writes */
lastFullPageWrites = fpw ;
}
else if ( info = = XLOG_CHECKPOINT_REDO )
{
/* nothing to do here, just for informational purposes */
}
}
/*