@@ -418,11 +418,11 @@ typedef struct XLogCtlInsert
 	slock_t	insertpos_lck;	/* protects CurrBytePos and PrevBytePos */
 
 	/*
-	 * CurrBytePos is the end of reserved WAL. The next record will be inserted
-	 * at that position. PrevBytePos is the start position of the previously
-	 * inserted (or rather, reserved) record - it is copied to the prev-link
-	 * of the next record. These are stored as "usable byte positions" rather
-	 * than XLogRecPtrs (see XLogBytePosToRecPtr()).
+	 * CurrBytePos is the end of reserved WAL. The next record will be
+	 * inserted at that position. PrevBytePos is the start position of the
+	 * previously inserted (or rather, reserved) record - it is copied to the
+	 * prev-link of the next record. These are stored as "usable byte
+	 * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
 	 */
 	uint64	CurrBytePos;
 	uint64	PrevBytePos;
@@ -504,10 +504,11 @@ typedef struct XLogCtlData
 	 * Latest initialized page in the cache (last byte position + 1).
 	 *
 	 * To change the identity of a buffer (and InitializedUpTo), you need to
-	 * hold WALBufMappingLock. To change the identity of a buffer that's still
-	 * dirty, the old page needs to be written out first, and for that you
-	 * need WALWriteLock, and you need to ensure that there are no in-progress
-	 * insertions to the page by calling WaitXLogInsertionsToFinish().
+	 * hold WALBufMappingLock. To change the identity of a buffer that's
+	 * still dirty, the old page needs to be written out first, and for that
+	 * you need WALWriteLock, and you need to ensure that there are no
+	 * in-progress insertions to the page by calling
+	 * WaitXLogInsertionsToFinish().
 	 */
 	XLogRecPtr	InitializedUpTo;
 
@@ -860,6 +861,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	if (rechdr == NULL)
 	{
 		static char rechdrbuf[SizeOfXLogRecord + MAXIMUM_ALIGNOF];
+
 		rechdr = (XLogRecord *) MAXALIGN(&rechdrbuf);
 		MemSet(rechdr, 0, SizeOfXLogRecord);
 	}
@@ -1232,6 +1234,7 @@ begin:;
 	{
 		TRACE_POSTGRESQL_XLOG_SWITCH();
 		XLogFlush(EndPos);
+
 		/*
 		 * Even though we reserved the rest of the segment for us, which is
 		 * reflected in EndPos, we return a pointer to just the end of the
@@ -1514,8 +1517,8 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
 
 	/*
 	 * If this was an xlog-switch, it's not enough to write the switch record,
-	 * we also have to consume all the remaining space in the WAL segment.
-	 * We have already reserved it for us, but we still need to make sure it's
+	 * we also have to consume all the remaining space in the WAL segment. We
+	 * have already reserved it for us, but we still need to make sure it's
 	 * allocated and zeroed in the WAL buffers so that when the caller (or
 	 * someone else) does XLogWrite(), it can really write out all the zeros.
 	 */
@@ -1556,14 +1559,14 @@ WALInsertLockAcquire(void)
 
 	/*
 	 * It doesn't matter which of the WAL insertion locks we acquire, so try
-	 * the one we used last time. If the system isn't particularly busy,
-	 * it's a good bet that it's still available, and it's good to have some
+	 * the one we used last time. If the system isn't particularly busy, it's
+	 * a good bet that it's still available, and it's good to have some
 	 * affinity to a particular lock so that you don't unnecessarily bounce
 	 * cache lines between processes when there's no contention.
 	 *
 	 * If this is the first time through in this backend, pick a lock
-	 * (semi-)randomly. This allows the locks to be used evenly if you have
-	 * a lot of very short connections.
+	 * (semi-)randomly. This allows the locks to be used evenly if you have a
+	 * lot of very short connections.
 	 */
 	static int	lockToTry = -1;
 
@@ -1583,10 +1586,10 @@ WALInsertLockAcquire(void)
 		/*
 		 * If we couldn't get the lock immediately, try another lock next
 		 * time. On a system with more insertion locks than concurrent
-		 * inserters, this causes all the inserters to eventually migrate
-		 * to a lock that no-one else is using. On a system with more
-		 * inserters than locks, it still helps to distribute the inserters
-		 * evenly across the locks.
+		 * inserters, this causes all the inserters to eventually migrate to a
+		 * lock that no-one else is using. On a system with more inserters
+		 * than locks, it still helps to distribute the inserters evenly
+		 * across the locks.
 		 */
 		lockToTry = (lockToTry + 1) % num_xloginsert_locks;
 	}
@@ -1604,8 +1607,8 @@ WALInsertLockAcquireExclusive(void)
 	/*
 	 * When holding all the locks, we only update the last lock's insertingAt
 	 * indicator. The others are set to 0xFFFFFFFFFFFFFFFF, which is higher
-	 * than any real XLogRecPtr value, to make sure that no-one blocks
-	 * waiting on those.
+	 * than any real XLogRecPtr value, to make sure that no-one blocks waiting
+	 * on those.
 	 */
 	for (i = 0; i < num_xloginsert_locks - 1; i++)
 	{
@@ -1716,15 +1719,16 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto)
 	 * Loop through all the locks, sleeping on any in-progress insert older
 	 * than 'upto'.
 	 *
-	 * finishedUpto is our return value, indicating the point upto which
-	 * all the WAL insertions have been finished. Initialize it to the head
-	 * of reserved WAL, and as we iterate through the insertion locks, back it
+	 * finishedUpto is our return value, indicating the point upto which all
+	 * the WAL insertions have been finished. Initialize it to the head of
+	 * reserved WAL, and as we iterate through the insertion locks, back it
 	 * out for any insertion that's still in progress.
 	 */
 	finishedUpto = reservedUpto;
 	for (i = 0; i < num_xloginsert_locks; i++)
 	{
 		XLogRecPtr	insertingat = InvalidXLogRecPtr;
+
 		do
 		{
 			/*
@@ -1797,9 +1801,9 @@ GetXLogBuffer(XLogRecPtr ptr)
 	}
 
 	/*
-	 * The XLog buffer cache is organized so that a page is always loaded
-	 * to a particular buffer. That way we can easily calculate the buffer
-	 * a given page must be loaded into, from the XLogRecPtr alone.
+	 * The XLog buffer cache is organized so that a page is always loaded to a
+	 * particular buffer. That way we can easily calculate the buffer a given
+	 * page must be loaded into, from the XLogRecPtr alone.
 	 */
 	idx = XLogRecPtrToBufIdx(ptr);
 
@@ -1827,8 +1831,8 @@ GetXLogBuffer(XLogRecPtr ptr)
 	if (expectedEndPtr != endptr)
 	{
 		/*
-		 * Let others know that we're finished inserting the record up
-		 * to the page boundary.
+		 * Let others know that we're finished inserting the record up to the
+		 * page boundary.
 		 */
 		WALInsertLockUpdateInsertingAt(expectedEndPtr - XLOG_BLCKSZ);
 
@@ -2170,8 +2174,8 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
 		}
 
 		/*
-		 * Now the next buffer slot is free and we can set it up to be the next
-		 * output page.
+		 * Now the next buffer slot is free and we can set it up to be the
+		 * next output page.
 		 */
 		NewPageBeginPtr = XLogCtl->InitializedUpTo;
 		NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
@@ -2194,6 +2198,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
 		/* NewPage->xlp_info = 0; */	/* done by memset */
 		NewPage ->xlp_tli = ThisTimeLineID;
 		NewPage ->xlp_pageaddr = NewPageBeginPtr;
+
 		/* NewPage->xlp_rem_len = 0; */	/* done by memset */
 
 		/*
@@ -2202,12 +2207,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
 		 * blocks. This allows the WAL archiver to know whether it is safe to
 		 * compress archived WAL data by transforming full-block records into
 		 * the non-full-block format. It is sufficient to record this at the
-		 * page level because we force a page switch (in fact a segment switch)
-		 * when starting a backup, so the flag will be off before any records
-		 * can be written during the backup. At the end of a backup, the last
-		 * page will be marked as all unsafe when perhaps only part is unsafe,
-		 * but at worst the archiver would miss the opportunity to compress a
-		 * few records.
+		 * page level because we force a page switch (in fact a segment
+		 * switch) when starting a backup, so the flag will be off before any
+		 * records can be written during the backup. At the end of a backup,
+		 * the last page will be marked as all unsafe when perhaps only part
+		 * is unsafe, but at worst the archiver would miss the opportunity to
+		 * compress a few records.
 		 */
 		if (!Insert->forcePageWrites)
 			NewPage ->xlp_info |= XLP_BKP_REMOVABLE;
@@ -2330,6 +2335,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 		 * last page that's been initialized by AdvanceXLInsertBuffer.
 		 */
 		XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];
+
 		if (LogwrtResult.Write >= EndPtr)
 			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
 				 (uint32) (LogwrtResult.Write >> 32),
@@ -2617,6 +2623,7 @@ XLogGetReplicationSlotMinimumLSN(void)
 	/* use volatile pointer to prevent code rearrangement */
 	volatile XLogCtlData *xlogctl = XLogCtl;
 	XLogRecPtr	retval;
+
 	SpinLockAcquire(&xlogctl->info_lck);
 	retval = xlogctl->replicationSlotMinLSN;
 	SpinLockRelease(&xlogctl->info_lck);
@@ -3828,6 +3835,7 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
 							xlde->d_name)));
 
 #ifdef WIN32
+
 					/*
 					 * On Windows, if another process (e.g another backend)
 					 * holds the file open in FILE_SHARE_DELETE mode, unlink
@@ -4836,8 +4844,8 @@ XLOGShmemInit(void)
 
 	/*
 	 * Align the start of the page buffers to a full xlog block size boundary.
-	 * This simplifies some calculations in XLOG insertion. It is also required
-	 * for O_DIRECT.
+	 * This simplifies some calculations in XLOG insertion. It is also
+	 * required for O_DIRECT.
 	 */
 	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
 	XLogCtl->pages = allocptr;
@@ -5464,8 +5472,8 @@ recoveryStopsBefore(XLogRecord *record)
 		 *
 		 * when testing for an xid, we MUST test for equality only, since
 		 * transactions are numbered in the order they start, not the order
-		 * they complete. A higher numbered xid will complete before you
-		 * about 50% of the time...
+		 * they complete. A higher numbered xid will complete before you about
+		 * 50% of the time...
 		 */
 		stopsHere = (record->xl_xid == recoveryTargetXid);
 	}
@@ -5525,8 +5533,8 @@ recoveryStopsAfter(XLogRecord *record)
 	record_info = record->xl_info & ~XLR_INFO_MASK;
 
 	/*
-	 * There can be many restore points that share the same name; we stop
-	 * at the first one.
+	 * There can be many restore points that share the same name; we stop at
+	 * the first one.
 	 */
 	if (recoveryTarget == RECOVERY_TARGET_NAME &&
 		record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
@@ -5688,10 +5696,10 @@ recoveryApplyDelay(XLogRecord *record)
 	/*
 	 * Is it a COMMIT record?
 	 *
-	 * We deliberately choose not to delay aborts since they have no effect
-	 * on MVCC. We already allow replay of records that don't have a
-	 * timestamp, so there is already opportunity for issues caused by early
-	 * conflicts on standbys.
+	 * We deliberately choose not to delay aborts since they have no effect on
+	 * MVCC. We already allow replay of records that don't have a timestamp,
+	 * so there is already opportunity for issues caused by early conflicts on
+	 * standbys.
 	 */
 	record_info = record->xl_info & ~XLR_INFO_MASK;
 	if (!(record->xl_rmid == RM_XACT_ID &&
@@ -6261,9 +6269,9 @@ StartupXLOG(void)
 	StartupReorderBuffer();
 
 	/*
-	 * Startup MultiXact. We need to do this early for two reasons: one
-	 * is that we might try to access multixacts when we do tuple freezing,
-	 * and the other is we need its state initialized because we attempt
+	 * Startup MultiXact. We need to do this early for two reasons: one is
+	 * that we might try to access multixacts when we do tuple freezing, and
+	 * the other is we need its state initialized because we attempt
 	 * truncation during restartpoints.
 	 */
 	StartupMultiXact();
@@ -6517,9 +6525,9 @@ StartupXLOG(void)
 		}
 
 		/*
-		 * Initialize shared variables for tracking progress of WAL replay,
-		 * as if we had just replayed the record before the REDO location
-		 * (or the checkpoint record itself, if it's a shutdown checkpoint).
+		 * Initialize shared variables for tracking progress of WAL replay, as
+		 * if we had just replayed the record before the REDO location (or the
+		 * checkpoint record itself, if it's a shutdown checkpoint).
 		 */
 		SpinLockAcquire(&xlogctl->info_lck);
 		if (checkPoint.redo < RecPtr)
@@ -6646,17 +6654,17 @@ StartupXLOG(void)
 				}
 
 				/*
-				 * If we've been asked to lag the master, wait on
-				 * latch until enough time has passed.
+				 * If we've been asked to lag the master, wait on latch until
+				 * enough time has passed.
 				 */
 				if (recoveryApplyDelay(record))
 				{
 					/*
-					 * We test for paused recovery again here. If
-					 * user sets delayed apply, it may be because
-					 * they expect to pause recovery in case of
-					 * problems, so we must test again here otherwise
-					 * pausing during the delay-wait wouldn't work.
+					 * We test for paused recovery again here. If user sets
+					 * delayed apply, it may be because they expect to pause
+					 * recovery in case of problems, so we must test again
+					 * here otherwise pausing during the delay-wait wouldn't
+					 * work.
 					 */
 					if (xlogctl->recoveryPause)
 						recoveryPausesHere();
@@ -6996,9 +7004,9 @@ StartupXLOG(void)
 	else
 	{
 		/*
-		 * There is no partial block to copy. Just set InitializedUpTo,
-		 * and let the first attempt to insert a log record to initialize
-		 * the next buffer.
+		 * There is no partial block to copy. Just set InitializedUpTo, and
+		 * let the first attempt to insert a log record to initialize the next
+		 * buffer.
 		 */
 		XLogCtl->InitializedUpTo = EndOfLog;
 	}
@@ -7335,6 +7343,7 @@ RecoveryInProgress(void)
 			pg_memory_barrier();
 			InitXLOGAccess();
 		}
+
 		/*
 		 * Note: We don't need a memory barrier when we're still in recovery.
 		 * We might exit recovery immediately after return, so the caller
@@ -8131,9 +8140,8 @@ CreateCheckPoint(int flags)
 	 * fuzzy: it is possible that we will wait for xacts we didn't really need
 	 * to wait for. But the delay should be short and it seems better to make
 	 * checkpoint take a bit longer than to hold off insertions longer than
-	 * necessary.
-	 * (In fact, the whole reason we have this issue is that xact.c does
-	 * commit record XLOG insertion and clog update as two separate steps
+	 * necessary. (In fact, the whole reason we have this issue is that xact.c
+	 * does commit record XLOG insertion and clog update as two separate steps
 	 * protected by different locks, but again that seems best on grounds of
 	 * minimizing lock contention.)
 	 *
@@ -8600,11 +8608,11 @@ CreateRestartPoint(int flags)
 		_logSegNo--;
 
 		/*
-		 * Try to recycle segments on a useful timeline. If we've been promoted
-		 * since the beginning of this restartpoint, use the new timeline
-		 * chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
-		 * in that case). If we're still in recovery, use the timeline we're
-		 * currently replaying.
+		 * Try to recycle segments on a useful timeline. If we've been
+		 * promoted since the beginning of this restartpoint, use the new
+		 * timeline chosen at end of recovery (RecoveryInProgress() sets
+		 * ThisTimeLineID in that case). If we're still in recovery, use the
+		 * timeline we're currently replaying.
 		 *
 		 * There is no guarantee that the WAL segments will be useful on the
 		 * current timeline; if recovery proceeds to a new timeline right
@@ -8859,8 +8867,9 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
 		 * lsn updates. We assume pd_lower/upper cannot be changed without an
 		 * exclusive lock, so the contents bkp are not racy.
 		 *
-		 * With buffer_std set to false, XLogCheckBuffer() sets hole_length and
-		 * hole_offset to 0; so the following code is safe for either case.
+		 * With buffer_std set to false, XLogCheckBuffer() sets hole_length
+		 * and hole_offset to 0; so the following code is safe for either
+		 * case.
 		 */
 		memcpy(copied_buffer, origdata, bkpb.hole_offset);
 		memcpy(copied_buffer + bkpb.hole_offset,
@@ -9262,10 +9271,10 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 		BkpBlock	bkpb;
 
 		/*
-		 * Full-page image (FPI) records contain a backup block stored "inline"
-		 * in the normal data since the locking when writing hint records isn't
-		 * sufficient to use the normal backup block mechanism, which assumes
-		 * exclusive lock on the buffer supplied.
+		 * Full-page image (FPI) records contain a backup block stored
+		 * "inline" in the normal data since the locking when writing hint
+		 * records isn't sufficient to use the normal backup block mechanism,
+		 * which assumes exclusive lock on the buffer supplied.
 		 *
 		 * Since the only change in these backup block are hint bits, there
 		 * are no recovery conflicts generated.