diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 7d894522526..c0c223b2e32 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -17,6 +17,7 @@
#include "storage/aio.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
+#include "storage/proclist.h"
BufferDescPadded *BufferDescriptors;
char *BufferBlocks;
@@ -128,9 +129,7 @@ BufferManagerShmemInit(void)
pgaio_wref_clear(&buf->io_wref);
- LWLockInitialize(BufferDescriptorGetContentLock(buf),
- LWTRANCHE_BUFFER_CONTENT);
-
+ proclist_init(&buf->lock_waiters);
ConditionVariableInit(BufferDescriptorGetIOCV(buf));
}
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index b0de8e45d4d..f30cf93b04f 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -58,6 +58,7 @@
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
+#include "storage/proclist.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
@@ -100,6 +101,12 @@ typedef struct PrivateRefCountData
* How many times has the buffer been pinned by this backend.
*/
int32 refcount;
+
+ /*
+	 * Mode in which this backend holds the buffer's content lock.
+	 * BUFFER_LOCK_UNLOCK indicates that the buffer is not locked.
+ */
+ BufferLockMode lockmode;
} PrivateRefCountData;
typedef struct PrivateRefCountEntry
@@ -210,8 +217,10 @@ static BufferDesc *PinCountWaitBuf = NULL;
* Each buffer also has a private refcount that keeps track of the number of
* times the buffer is pinned in the current process. This is so that the
* shared refcount needs to be modified only once if a buffer is pinned more
- * than once by an individual backend. It's also used to check that no buffers
- * are still pinned at the end of transactions and when exiting.
+ * than once by an individual backend. It's also used to check that no
+ * buffers are still pinned at the end of transactions and when exiting. We
+ * also use this mechanism to track whether this backend has a buffer locked,
+ * and, if so, in what mode.
*
*
* To avoid - as we used to - requiring an array with NBuffers entries to keep
@@ -254,8 +263,8 @@ static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
static void ResOwnerReleaseBufferIO(Datum res);
static char *ResOwnerPrintBufferIO(Datum res);
-static void ResOwnerReleaseBufferPin(Datum res);
-static char *ResOwnerPrintBufferPin(Datum res);
+static void ResOwnerReleaseBuffer(Datum res);
+static char *ResOwnerPrintBuffer(Datum res);
const ResourceOwnerDesc buffer_io_resowner_desc =
{
@@ -266,13 +275,13 @@ const ResourceOwnerDesc buffer_io_resowner_desc =
.DebugPrint = ResOwnerPrintBufferIO
};
-const ResourceOwnerDesc buffer_pin_resowner_desc =
+const ResourceOwnerDesc buffer_resowner_desc =
{
- .name = "buffer pin",
+ .name = "buffer",
.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
.release_priority = RELEASE_PRIO_BUFFER_PINS,
- .ReleaseResource = ResOwnerReleaseBufferPin,
- .DebugPrint = ResOwnerPrintBufferPin
+ .ReleaseResource = ResOwnerReleaseBuffer,
+ .DebugPrint = ResOwnerPrintBuffer
};
/*
@@ -351,6 +360,7 @@ ReservePrivateRefCountEntry(void)
/* clear the whole data member, just for future proofing */
memset(&victim_entry->data, 0, sizeof(victim_entry->data));
victim_entry->data.refcount = 0;
+ victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
PrivateRefCountOverflowed++;
}
@@ -374,6 +384,7 @@ NewPrivateRefCountEntry(Buffer buffer)
PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
res->buffer = buffer;
res->data.refcount = 0;
+ res->data.lockmode = BUFFER_LOCK_UNLOCK;
/* update cache for the next lookup */
PrivateRefCountEntryLast = ReservedRefCountSlot;
@@ -540,6 +551,7 @@ static void
ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
{
Assert(ref->data.refcount == 0);
+ Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
if (ref >= &PrivateRefCountArray[0] &&
ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
@@ -641,14 +653,27 @@ static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
static void AtProcExit_Buffers(int code, Datum arg);
static void CheckForBufferLeaks(void);
#ifdef USE_ASSERT_CHECKING
-static void AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
- void *unused_context);
+static void AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode);
#endif
static int rlocator_comparator(const void *p1, const void *p2);
static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
+static inline void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
+static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr);
+static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
+static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode);
+static bool BufferLockHeldByMe(BufferDesc *buf_hdr);
+static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
+static inline int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr);
+static inline bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode);
+static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode);
+static void BufferLockDequeueSelf(BufferDesc *buf_hdr);
+static void BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive);
+static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate);
+static inline uint64 BufferLockReleaseSub(BufferLockMode mode);
+
/*
* Implementation of PrefetchBuffer() for shared buffers.
@@ -2306,6 +2331,12 @@ retry:
goto retry;
}
+ /*
+ * An invalidated buffer should not have any backends waiting to lock the
+ * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
+ */
+ Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
+
/*
* Clear out the buffer's tag and flags. We must do this to ensure that
* linear scans of the buffer array don't think the buffer is valid.
@@ -2382,6 +2413,12 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr)
return false;
}
+ /*
+ * An invalidated buffer should not have any backends waiting to lock the
+ * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
+ */
+ Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
+
/*
* Clear out the buffer's tag and flags and usagecount. This is not
* strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
@@ -2449,8 +2486,6 @@ again:
*/
if (buf_state & BM_DIRTY)
{
- LWLock *content_lock;
-
Assert(buf_state & BM_TAG_VALID);
Assert(buf_state & BM_VALID);
@@ -2468,8 +2503,7 @@ again:
* one just happens to be trying to split the page the first one got
* from StrategyGetBuffer.)
*/
- content_lock = BufferDescriptorGetContentLock(buf_hdr);
- if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+ if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE))
{
/*
* Someone else has locked the buffer, so give it up and loop back
@@ -2498,7 +2532,7 @@ again:
if (XLogNeedsFlush(lsn)
&& StrategyRejectBuffer(strategy, buf_hdr, from_ring))
{
- LWLockRelease(content_lock);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
UnpinBuffer(buf_hdr);
goto again;
}
@@ -2506,7 +2540,7 @@ again:
/* OK, do the I/O */
FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
- LWLockRelease(content_lock);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
&buf_hdr->tag);
@@ -2948,7 +2982,7 @@ BufferIsLockedByMe(Buffer buffer)
else
{
bufHdr = GetBufferDescriptor(buffer - 1);
- return LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr));
+ return BufferLockHeldByMe(bufHdr);
}
}
@@ -2973,23 +3007,8 @@ BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
}
else
{
- LWLockMode lw_mode;
-
- switch (mode)
- {
- case BUFFER_LOCK_EXCLUSIVE:
- lw_mode = LW_EXCLUSIVE;
- break;
- case BUFFER_LOCK_SHARE:
- lw_mode = LW_SHARED;
- break;
- default:
- pg_unreachable();
- }
-
bufHdr = GetBufferDescriptor(buffer - 1);
- return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
- lw_mode);
+ return BufferLockHeldByMeInMode(bufHdr, mode);
}
}
@@ -3376,7 +3395,7 @@ UnpinBufferNoOwner(BufferDesc *buf)
* I'd better not still hold the buffer content lock. Can't use
* BufferIsLockedByMe(), as that asserts the buffer is pinned.
*/
- Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+ Assert(!BufferLockHeldByMe(buf));
/* decrement the shared reference count */
old_buf_state = pg_atomic_fetch_sub_u64(&buf->state, BUF_REFCOUNT_ONE);
@@ -4198,9 +4217,9 @@ CheckForBufferLeaks(void)
* Check for exclusive-locked catalog buffers. This is the core of
* AssertCouldGetRelation().
*
- * A backend would self-deadlock on LWLocks if the catalog scan read the
- * exclusive-locked buffer. The main threat is exclusive-locked buffers of
- * catalogs used in relcache, because a catcache search on any catalog may
+ * A backend would self-deadlock on the content lock if the catalog scan read
+ * the exclusive-locked buffer. The main threat is exclusive-locked buffers
+ * of catalogs used in relcache, because a catcache search on any catalog may
* build that catalog's relcache entry. We don't have an inventory of
* catalogs relcache uses, so just check buffers of most catalogs.
*
@@ -4214,26 +4233,45 @@ CheckForBufferLeaks(void)
void
AssertBufferLocksPermitCatalogRead(void)
{
- ForEachLWLockHeldByMe(AssertNotCatalogBufferLock, NULL);
+ PrivateRefCountEntry *res;
+
+ /* check the array */
+ for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+ {
+ if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
+ {
+ res = &PrivateRefCountArray[i];
+
+ if (res->buffer == InvalidBuffer)
+ continue;
+
+ AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
+ }
+ }
+
+ /* if necessary search the hash */
+ if (PrivateRefCountOverflowed)
+ {
+ HASH_SEQ_STATUS hstat;
+
+ hash_seq_init(&hstat, PrivateRefCountHash);
+ while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
+ }
+ }
}
static void
-AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
- void *unused_context)
+AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode)
{
- BufferDesc *bufHdr;
+ BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
BufferTag tag;
Oid relid;
- if (mode != LW_EXCLUSIVE)
+ if (mode != BUFFER_LOCK_EXCLUSIVE)
return;
- if (!((BufferDescPadded *) lock > BufferDescriptors &&
- (BufferDescPadded *) lock < BufferDescriptors + NBuffers))
- return; /* not a buffer lock */
-
- bufHdr = (BufferDesc *)
- ((char *) lock - offsetof(BufferDesc, content_lock));
tag = bufHdr->tag;
/*
@@ -4515,9 +4553,11 @@ static void
FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
IOObject io_object, IOContext io_context)
{
- LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
+ Buffer buffer = BufferDescriptorGetBuffer(buf);
+
+ BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE);
FlushBuffer(buf, reln, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
- LWLockRelease(BufferDescriptorGetContentLock(buf));
+ BufferLockUnlock(buffer, buf);
}
/*
@@ -5660,9 +5700,10 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
*
* Used to clean up after errors.
*
- * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
- * of releasing buffer content locks per se; the only thing we need to deal
- * with here is clearing any PIN_COUNT request that was in progress.
+ * Currently, we can expect that resource owner cleanup, via
+ * ResOwnerReleaseBuffer(), took care of releasing buffer content locks per
+ * se; the only thing we need to deal with here is clearing any PIN_COUNT
+ * request that was in progress.
*/
void
UnlockBuffers(void)
@@ -5693,25 +5734,723 @@ UnlockBuffers(void)
}
/*
- * Acquire or release the content_lock for the buffer.
+ * Acquire the buffer content lock in the specified mode
+ *
+ * If the lock is not available, sleep until it is.
+ *
+ * Side effect: cancel/die interrupts are held off until lock release.
+ *
+ * This uses almost the same locking approach as lwlock.c's
+ * LWLockAcquire(). See documentation at the top of lwlock.c for a more
+ * detailed discussion.
+ *
+ * The reason that this, and most of the other BufferLock* functions, get both
+ * the Buffer and BufferDesc* as parameters, is that looking up one from the
+ * other repeatedly shows up noticeably in profiles.
+ *
+ * Callers should provide a constant for mode, for more efficient code
+ * generation.
+ */
+static inline void
+BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
+{
+ PrivateRefCountEntry *entry;
+ int extraWaits = 0;
+
+ /*
+ * Look up the private refcount entry before acquiring the lock, so that
+ * the lookup doesn't lengthen the time the lock is held.
+ */
+ entry = GetPrivateRefCountEntry(buffer, true);
+
+ /*
+ * We'd better not already hold a lock on the buffer.
+ */
+ Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK);
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the content lock. This ensures that interrupts will not interfere
+ * with manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ for (;;)
+ {
+ bool mustwait;
+ uint32 wait_event;
+
+ /*
+ * Try to grab the lock the first time, we're not in the waitqueue
+ * yet/anymore.
+ */
+ mustwait = BufferLockAttempt(buf_hdr, mode);
+
+ if (likely(!mustwait))
+ {
+ break;
+ }
+
+ /*
+ * Ok, at this point we couldn't grab the lock on the first try. We
+ * cannot simply queue ourselves to the end of the list and wait to be
+ * woken up because by now the lock could long have been released.
+ * Instead add us to the queue and try to grab the lock again. If we
+ * succeed we need to revert the queuing and be happy, otherwise we
+ * recheck the lock. If we still couldn't grab it, we know that the
+ * other locker will see our queue entries when releasing since they
+ * existed before we checked for the lock.
+ */
+
+ /* add to the queue */
+ BufferLockQueueSelf(buf_hdr, mode);
+
+ /* we're now guaranteed to be woken up if necessary */
+ mustwait = BufferLockAttempt(buf_hdr, mode);
+
+ /* ok, grabbed the lock the second time round, need to undo queueing */
+ if (!mustwait)
+ {
+ BufferLockDequeueSelf(buf_hdr);
+ break;
+ }
+
+ switch (mode)
+ {
+ case BUFFER_LOCK_EXCLUSIVE:
+ wait_event = WAIT_EVENT_BUFFER_EXCLUSIVE;
+ break;
+ case BUFFER_LOCK_SHARE_EXCLUSIVE:
+ wait_event = WAIT_EVENT_BUFFER_SHARE_EXCLUSIVE;
+ break;
+ case BUFFER_LOCK_SHARE:
+ wait_event = WAIT_EVENT_BUFFER_SHARED;
+ break;
+ case BUFFER_LOCK_UNLOCK:
+ pg_unreachable();
+ }
+ pgstat_report_wait_start(wait_event);
+
+ /*
+ * Wait until awakened.
+ *
+ * It is possible that we get awakened for a reason other than being
+ * signaled by BufferLockWakeup(). If so, loop back and wait again.
+ * Once we've gotten the lock, re-increment the sema by the number of
+ * additional signals received.
+ */
+ for (;;)
+ {
+ PGSemaphoreLock(MyProc->sem);
+ if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
+ break;
+ extraWaits++;
+ }
+
+ pgstat_report_wait_end();
+
+ /* Retrying, allow BufferLockProcessRelease() to wake up waiters again. */
+ pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
+ }
+
+ /* Remember that we now hold this lock */
+ entry->data.lockmode = mode;
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (unlikely(extraWaits-- > 0))
+ PGSemaphoreUnlock(MyProc->sem);
+}
+
+/*
+ * Release a previously acquired buffer content lock.
+ */
+static void
+BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
+{
+ BufferLockMode mode;
+ uint64 oldstate;
+ uint64 sub;
+
+ mode = BufferLockDisownInternal(buffer, buf_hdr);
+
+ /*
+ * Release my hold on lock, after that it can immediately be acquired by
+ * others, even if we still have to wakeup other waiters.
+ */
+ sub = BufferLockReleaseSub(mode);
+
+ oldstate = pg_atomic_sub_fetch_u64(&buf_hdr->state, sub);
+
+ BufferLockProcessRelease(buf_hdr, mode, oldstate);
+
+ /*
+ * Now okay to allow cancel/die interrupts.
+ */
+ RESUME_INTERRUPTS();
+}
+
+
+/*
+ * Acquire the content lock for the buffer, but only if we don't have to wait.
+ */
+static bool
+BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
+{
+ PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true);
+ bool mustwait;
+
+ /*
+ * We'd better not already hold a lock on the buffer.
+ */
+ Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK);
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the content lock. This ensures that interrupts will not interfere
+ * with manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /* Check for the lock */
+ mustwait = BufferLockAttempt(buf_hdr, mode);
+
+ if (mustwait)
+ {
+ /* Failed to get lock, so release interrupt holdoff */
+ RESUME_INTERRUPTS();
+ }
+ else
+ {
+ entry->data.lockmode = mode;
+ }
+
+ return !mustwait;
+}
+
+/*
+ * Internal function that tries to atomically acquire the content lock in the
+ * passed in mode.
+ *
+ * This function will not block waiting for a lock to become free - that's the
+ * caller's job.
+ *
+ * Similar to LWLockAttemptLock().
+ */
+static inline bool
+BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
+{
+ uint64 old_state;
+
+ /*
+ * Read once outside the loop, later iterations will get the newer value
+ * via compare & exchange.
+ */
+ old_state = pg_atomic_read_u64(&buf_hdr->state);
+
+ /* loop until we've determined whether we could acquire the lock or not */
+ while (true)
+ {
+ uint64 desired_state;
+ bool lock_free;
+
+ desired_state = old_state;
+
+ if (mode == BUFFER_LOCK_EXCLUSIVE)
+ {
+ lock_free = (old_state & BM_LOCK_MASK) == 0;
+ if (lock_free)
+ desired_state += BM_LOCK_VAL_EXCLUSIVE;
+ }
+ else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
+ {
+ lock_free = (old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) == 0;
+ if (lock_free)
+ desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE;
+ }
+ else
+ {
+ lock_free = (old_state & BM_LOCK_VAL_EXCLUSIVE) == 0;
+ if (lock_free)
+ desired_state += BM_LOCK_VAL_SHARED;
+ }
+
+ /*
+ * Attempt to swap in the state we are expecting. If we didn't see the
+ * lock as free, that's just the old value. If we saw it as free,
+ * we'll attempt to mark it acquired. The reason that we always swap
+ * in the value is that this doubles as a memory barrier. We could try
+ * to be smarter and only swap in values if we saw the lock as free,
+ * but benchmarks haven't shown it as beneficial so far.
+ *
+ * Retry if the value changed since we last looked at it.
+ */
+ if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state,
+ &old_state, desired_state)))
+ {
+ if (lock_free)
+ {
+ /* Great! Got the lock. */
+ return false;
+ }
+ else
+ return true; /* somebody else has the lock */
+ }
+ }
+
+ pg_unreachable();
+}
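+
+/*
+ * For reference, the per-mode acquisition conditions checked above are:
+ *
+ *   BUFFER_LOCK_EXCLUSIVE:        (state & BM_LOCK_MASK) == 0
+ *   BUFFER_LOCK_SHARE_EXCLUSIVE:  (state & (BM_LOCK_VAL_EXCLUSIVE |
+ *                                           BM_LOCK_VAL_SHARE_EXCLUSIVE)) == 0
+ *   BUFFER_LOCK_SHARE:            (state & BM_LOCK_VAL_EXCLUSIVE) == 0
+ */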
+
+/*
+ * Add ourselves to the end of the content lock's wait queue.
+ */
+static void
+BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
+{
+ /*
+ * If we don't have a PGPROC structure, there's no way to wait. This
+ * should never occur, since MyProc should only be null during shared
+ * memory initialization.
+ */
+ if (MyProc == NULL)
+ elog(PANIC, "cannot wait without a PGPROC structure");
+
+ if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
+ elog(PANIC, "queueing for lock while waiting on another one");
+
+ LockBufHdr(buf_hdr);
+
+ /* setting the flag is protected by the spinlock */
+ pg_atomic_fetch_or_u64(&buf_hdr->state, BM_LOCK_HAS_WAITERS);
+
+ /*
+ * These are currently used both for lwlocks and buffer content locks,
+ * which is acceptable, although not pretty, because a backend can't wait
+ * for both types of locks at the same time.
+ */
+ MyProc->lwWaiting = LW_WS_WAITING;
+ MyProc->lwWaitMode = mode;
+
+ proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
+
+ /* Can release the mutex now */
+ UnlockBufHdr(buf_hdr);
+}
+
+/*
+ * Remove ourselves from the waitlist.
+ *
+ * This is used if we queued ourselves because we thought we needed to sleep
+ * but, after further checking, we discovered that we don't actually need to
+ * do so.
+ */
+static void
+BufferLockDequeueSelf(BufferDesc *buf_hdr)
+{
+ bool on_waitlist;
+
+ LockBufHdr(buf_hdr);
+
+ on_waitlist = MyProc->lwWaiting == LW_WS_WAITING;
+ if (on_waitlist)
+ proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
+
+ if (proclist_is_empty(&buf_hdr->lock_waiters) &&
+ (pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS) != 0)
+ {
+ pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_HAS_WAITERS);
+ }
+
+ /* XXX: combine with fetch_and above? */
+ UnlockBufHdr(buf_hdr);
+
+ /* clear waiting state again, nice for debugging */
+ if (on_waitlist)
+ MyProc->lwWaiting = LW_WS_NOT_WAITING;
+ else
+ {
+ int extraWaits = 0;
+
+ /*
+ * Somebody else dequeued us and has or will wake us up. Deal with the
+ * superfluous absorption of a wakeup.
+ */
+
+ /*
+ * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
+ * removed ourselves - they'll have set it.
+ */
+ pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
+
+ /*
+ * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
+ * get reset at some inconvenient point later. Most of the time this
+ * will immediately return.
+ */
+ for (;;)
+ {
+ PGSemaphoreLock(MyProc->sem);
+ if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
+ break;
+ extraWaits++;
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(MyProc->sem);
+ }
+}
+
+/*
+ * Stop treating lock as held by current backend.
+ *
+ * After calling this function it's the caller's responsibility to ensure
+ * that the lock gets released, even in case of an error. This is only
+ * desirable if the lock is going to be released in a different process
+ * than the one that acquired it.
+ */
+static inline void
+BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
+{
+ BufferLockDisownInternal(buffer, buf_hdr);
+ RESUME_INTERRUPTS();
+}
+
+/*
+ * Stop treating lock as held by current backend.
+ *
+ * This is the code that can be shared between actually releasing a lock
+ * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
+ * without releasing the lock (BufferLockDisown()).
+ */
+static inline int
+BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
+{
+ BufferLockMode mode;
+ PrivateRefCountEntry *ref;
+
+ ref = GetPrivateRefCountEntry(buffer, false);
+ if (ref == NULL)
+ elog(ERROR, "lock %d is not held", buffer);
+ mode = ref->data.lockmode;
+ ref->data.lockmode = BUFFER_LOCK_UNLOCK;
+
+ return mode;
+}
+
+/*
+ * Wake up all the waiters that currently have a chance to acquire the lock.
+ *
+ * wake_exclusive indicates whether exclusive lock waiters should be woken up.
+ */
+static void
+BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive)
+{
+ bool new_wake_in_progress = false;
+ bool wake_share_exclusive = true;
+ proclist_head wakeup;
+ proclist_mutable_iter iter;
+
+ proclist_init(&wakeup);
+
+ /* lock wait list while collecting backends to wake up */
+ LockBufHdr(buf_hdr);
+
+ proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ /*
+ * We already woke a waiter whose lock mode conflicts with this one, so
+ * skip over this wait list entry.
+ */
+ if (!wake_exclusive && waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
+ continue;
+ if (!wake_share_exclusive && waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
+ continue;
+
+ proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
+ proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+
+ /*
+ * Prevent additional wakeups until a woken waiter gets to run and
+ * clears BM_LOCK_WAKE_IN_PROGRESS.
+ */
+ new_wake_in_progress = true;
+
+ /*
+ * Signal that the process isn't on the wait list anymore. This allows
+ * BufferLockDequeueSelf() to remove itself from the waitlist with a
+ * proclist_delete(), rather than having to check if it has been
+ * removed from the list.
+ */
+ Assert(waiter->lwWaiting == LW_WS_WAITING);
+ waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
+
+ /*
+ * Don't wake further waiters whose mode conflicts with the waiter just woken.
+ */
+ if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
+ {
+ /*
+ * Share locks conflict with exclusive locks.
+ */
+ wake_exclusive = false;
+ }
+ else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
+ {
+ /*
+ * Share-exclusive locks conflict with share-exclusive and
+ * exclusive locks.
+ */
+ wake_exclusive = false;
+ wake_share_exclusive = false;
+ }
+ else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
+ {
+ /*
+ * Exclusive locks conflict with all other locks, there's no point
+ * in waking up anybody else.
+ */
+ break;
+ }
+ }
+
+ Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS);
+
+ /* unset required flags, and release lock, in one fell swoop */
+ {
+ uint64 old_state;
+ uint64 desired_state;
+
+ old_state = pg_atomic_read_u64(&buf_hdr->state);
+ while (true)
+ {
+ desired_state = old_state;
+
+ /* compute desired flags */
+
+ if (new_wake_in_progress)
+ desired_state |= BM_LOCK_WAKE_IN_PROGRESS;
+ else
+ desired_state &= ~BM_LOCK_WAKE_IN_PROGRESS;
+
+ if (proclist_is_empty(&buf_hdr->lock_waiters))
+ desired_state &= ~BM_LOCK_HAS_WAITERS;
+
+ desired_state &= ~BM_LOCKED; /* release lock */
+
+ if (pg_atomic_compare_exchange_u64(&buf_hdr->state, &old_state,
+ desired_state))
+ break;
+ }
+ }
+
+ /* Awaken any waiters I removed from the queue. */
+ proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ proclist_delete(&wakeup, iter.cur, lwWaitLink);
+
+ /*
+ * Guarantee that lwWaiting being unset only becomes visible once the
+ * unlink from the list has completed. Otherwise the target backend
+ * could be woken up for some other reason and enqueue for a new lock - if
+ * that happens before the list unlink happens, the list would end up
+ * being corrupted.
+ *
+ * The barrier pairs with the LockBufHdr() when enqueuing for another
+ * lock.
+ */
+ pg_write_barrier();
+ waiter->lwWaiting = LW_WS_NOT_WAITING;
+ PGSemaphoreUnlock(waiter->sem);
+ }
+}
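+
+/*
+ * To summarize the policy above: after waking a SHARE waiter we stop waking
+ * EXCLUSIVE waiters; after waking a SHARE_EXCLUSIVE waiter we additionally
+ * stop waking further SHARE_EXCLUSIVE waiters; after waking an EXCLUSIVE
+ * waiter we stop entirely.
+ */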
+
+/*
+ * Compute subtraction from buffer state for a release of a held lock in
+ * `mode`.
+ *
+ * This is separated from BufferLockUnlock() as we want to combine the lock
+ * release with other atomic operations when possible, leading to the lock
+ * release being done in multiple places, each needing to compute what to
+ * subtract from the lock state.
+ */
+static inline uint64
+BufferLockReleaseSub(BufferLockMode mode)
+{
+ /*
+ * Turns out that a switch() leads gcc to generate sufficiently worse code
+ * for this to show up in profiles...
+ */
+ if (mode == BUFFER_LOCK_EXCLUSIVE)
+ return BM_LOCK_VAL_EXCLUSIVE;
+ else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
+ return BM_LOCK_VAL_SHARE_EXCLUSIVE;
+ else
+ {
+ Assert(mode == BUFFER_LOCK_SHARE);
+ return BM_LOCK_VAL_SHARED;
+ }
+
+ return 0; /* keep compiler quiet */
+}
+
+/*
+ * Handle work that needs to be done after releasing a lock that was held in
+ * `mode`, where `lockstate` is the result of the atomic operation modifying
+ * the state variable.
+ *
+ * This is separated from BufferLockUnlock() as we want to combine the lock
+ * release with other atomic operations when possible, leading to the lock
+ * release being done in multiple places.
+ */
+static void
+BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
+{
+ bool check_waiters = false;
+ bool wake_exclusive = false;
+
+ /* nobody else can have that kind of lock */
+ Assert(!(lockstate & BM_LOCK_VAL_EXCLUSIVE));
+
+ /*
+ * If we're still waiting for backends to get scheduled, don't wake them
+ * up again. Otherwise check if we need to look through the waitqueue to
+ * wake other backends.
+ */
+ if ((lockstate & BM_LOCK_HAS_WAITERS) &&
+ !(lockstate & BM_LOCK_WAKE_IN_PROGRESS))
+ {
+ if ((lockstate & BM_LOCK_MASK) == 0)
+ {
+ /*
+ * We released a lock and the lock was, in that moment, free. We
+ * therefore can wake waiters for any kind of lock.
+ */
+ check_waiters = true;
+ wake_exclusive = true;
+ }
+ else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
+ {
+ /*
+ * We released the lock, but another backend still holds a lock.
+ * We can't have released an exclusive lock, as there couldn't
+ * have been other lock holders. If we released a share lock, no
+ * waiters need to be woken up, as there must be other share
+ * lockers. However, if we held a share-exclusive lock, another
+ * backend now could acquire a share-exclusive lock.
+ */
+ check_waiters = true;
+ wake_exclusive = false;
+ }
+ }
+
+ /*
+ * As waking up waiters requires the spinlock to be acquired, only do so
+ * if necessary.
+ */
+ if (check_waiters)
+ BufferLockWakeup(buf_hdr, wake_exclusive);
+}
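+
+/*
+ * Keeping the release arithmetic (BufferLockReleaseSub()) separate from the
+ * post-release work (BufferLockProcessRelease()) is what would allow, for
+ * example, a combined unlock-and-unpin in a single atomic operation. A
+ * hypothetical sketch, not implemented here:
+ *
+ *		sub = BufferLockReleaseSub(mode) + BUF_REFCOUNT_ONE;
+ *		state = pg_atomic_sub_fetch_u64(&buf_hdr->state, sub);
+ *		BufferLockProcessRelease(buf_hdr, mode, state);
+ */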
+
+/*
+ * BufferLockHeldByMeInMode - test whether my process holds the content lock
+ * in the specified mode
+ *
+ * This is meant as debug support only.
+ */
+static bool
+BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
+{
+ PrivateRefCountEntry *entry =
+ GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
+
+ if (!entry)
+ return false;
+ else
+ return entry->data.lockmode == mode;
+}
+
+/*
+ * BufferLockHeldByMe - test whether my process holds the content lock in any
+ * mode
+ *
+ * This is meant as debug support only.
+ */
+static bool
+BufferLockHeldByMe(BufferDesc *buf_hdr)
+{
+ PrivateRefCountEntry *entry =
+ GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
+
+ if (!entry)
+ return false;
+ else
+ return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
+}
+
+/*
+ * Release the content lock for the buffer.
*/
void
-LockBuffer(Buffer buffer, BufferLockMode mode)
+UnlockBuffer(Buffer buffer)
{
- BufferDesc *buf;
+ BufferDesc *buf_hdr;
Assert(BufferIsPinned(buffer));
if (BufferIsLocal(buffer))
return; /* local buffers need no lock */
- buf = GetBufferDescriptor(buffer - 1);
+ buf_hdr = GetBufferDescriptor(buffer - 1);
+ BufferLockUnlock(buffer, buf_hdr);
+}
+
+/*
+ * Acquire the content lock for the buffer.
+ */
+void
+LockBufferInternal(Buffer buffer, BufferLockMode mode)
+{
+ BufferDesc *buf_hdr;
+
+ /*
+ * We can't wait if we haven't got a PGPROC. This should only occur
+ * during bootstrap or shared memory initialization. Put an Assert here
+ * to catch unsafe coding practices.
+ */
+ Assert(!(MyProc == NULL && IsUnderPostmaster));
+
+ /* handled in LockBuffer() wrapper */
+ Assert(mode != BUFFER_LOCK_UNLOCK);
- if (mode == BUFFER_LOCK_UNLOCK)
- LWLockRelease(BufferDescriptorGetContentLock(buf));
- else if (mode == BUFFER_LOCK_SHARE)
- LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
+ Assert(BufferIsPinned(buffer));
+ if (BufferIsLocal(buffer))
+ return; /* local buffers need no lock */
+
+ buf_hdr = GetBufferDescriptor(buffer - 1);
+
+ /*
+ * Test the most frequent lock modes first. While a switch (mode) would be
+ * nice, at least gcc generates considerably worse code for it.
+ *
+ * Call BufferLockAcquire() with a constant argument for mode, to generate
+ * more efficient code for the different lock modes.
+ */
+ if (mode == BUFFER_LOCK_SHARE)
+ BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE);
else if (mode == BUFFER_LOCK_EXCLUSIVE)
- LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
+ BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_EXCLUSIVE);
+ else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
+ BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE);
else
elog(ERROR, "unrecognized buffer lock mode: %d", mode);
}
@@ -5732,8 +6471,7 @@ ConditionalLockBuffer(Buffer buffer)
buf = GetBufferDescriptor(buffer - 1);
- return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
- LW_EXCLUSIVE);
+ return BufferLockConditional(buffer, buf, BUFFER_LOCK_EXCLUSIVE);
}
/*
@@ -6247,8 +6985,8 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits,
/*
* AbortBufferIO: Clean up active buffer I/O after an error.
*
- * All LWLocks we might have held have been released,
- * but we haven't yet released buffer pins, so the buffer is still pinned.
+ * All LWLocks & content locks we might have held have been released, but we
+ * haven't yet released buffer pins, so the buffer is still pinned.
*
* If I/O was in progress, we always set BM_IO_ERROR, even though it's
* possible the error condition wasn't related to the I/O.
@@ -6676,8 +7414,14 @@ ResOwnerPrintBufferIO(Datum res)
return psprintf("lost track of buffer IO on buffer %d", buffer);
}
+/*
+ * Release buffer as part of resource owner cleanup. This will only be called
+ * if the buffer is pinned. If this backend held the content lock at the time
+ * of the error we also need to release that (note that it is not possible to
+ * hold a content lock without a pin).
+ */
static void
-ResOwnerReleaseBufferPin(Datum res)
+ResOwnerReleaseBuffer(Datum res)
{
Buffer buffer = DatumGetInt32(res);
@@ -6688,11 +7432,32 @@ ResOwnerReleaseBufferPin(Datum res)
if (BufferIsLocal(buffer))
UnpinLocalBufferNoOwner(buffer);
else
+ {
+ PrivateRefCountEntry *ref;
+
+ ref = GetPrivateRefCountEntry(buffer, false);
+
+ /* not having a private refcount would imply resowner corruption */
+ Assert(ref != NULL);
+
+ /*
+ * If the buffer was locked at the time of the resowner release,
+ * release the lock now. This should only happen after errors.
+ */
+ if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
+ {
+ BufferDesc *buf = GetBufferDescriptor(buffer - 1);
+
+ HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
+ BufferLockUnlock(buffer, buf);
+ }
+
UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
+ }
}
static char *
-ResOwnerPrintBufferPin(Datum res)
+ResOwnerPrintBuffer(Datum res)
{
return DebugPrintBufferRefcount(DatumGetInt32(res));
}
@@ -6924,10 +7689,10 @@ MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc,
/* If it was not already dirty, mark it as dirty. */
if (!(buf_state & BM_DIRTY))
{
- LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_EXCLUSIVE);
+ BufferLockAcquire(buf, desc, BUFFER_LOCK_EXCLUSIVE);
MarkBufferDirty(buf);
result = true;
- LWLockRelease(BufferDescriptorGetContentLock(desc));
+ BufferLockUnlock(buf, desc);
}
else
*buffer_already_dirty = true;
@@ -7178,16 +7943,12 @@ buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
*/
if (is_write && !is_temp)
{
- LWLock *content_lock;
-
- content_lock = BufferDescriptorGetContentLock(buf_hdr);
-
- Assert(LWLockHeldByMe(content_lock));
+ Assert(BufferLockHeldByMe(buf_hdr));
/*
* Lock is now owned by AIO subsystem.
*/
- LWLockDisown(content_lock);
+ BufferLockDisown(buffer, buf_hdr);
}
/*
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 7194aee3532..5537a2d2530 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -287,6 +287,9 @@ ABI_compatibility:
Section: ClassName - WaitEventBuffer
BUFFER_CLEANUP "Waiting to acquire an exclusive pin on a buffer. Buffer pin waits can be protracted if another process holds an open cursor that last read data from the buffer in question."
+BUFFER_SHARED "Waiting to acquire a shared lock on a buffer."
+BUFFER_SHARE_EXCLUSIVE "Waiting to acquire a share exclusive lock on a buffer."
+BUFFER_EXCLUSIVE "Waiting to acquire a exclusive lock on a buffer."
ABI_compatibility:
@@ -375,7 +378,6 @@ NotifyBuffer "Waiting for I/O on a NOTIFY message SLRU buffer
NotifyChannelHash "Waiting to access the NOTIFY channel hash table."
SerialBuffer "Waiting for I/O on a serializable transaction conflict SLRU buffer."
WALInsert "Waiting to insert WAL data into a memory buffer."
-BufferContent "Waiting to access a data page in memory."
ReplicationOriginState "Waiting to read or update the progress of one replication origin."
ReplicationSlotIO "Waiting for I/O on a replication slot."
LockFastPath "Waiting to read or update a process' fast-path lock information."
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index e6e788224f5..27f12502d19 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -23,6 +23,7 @@
#include "storage/condition_variable.h"
#include "storage/lwlock.h"
#include "storage/procnumber.h"
+#include "storage/proclist_types.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "storage/spin.h"
@@ -35,22 +36,23 @@
* State of the buffer itself (in order):
* - 18 bits refcount
* - 4 bits usage count
- * - 10 bits of flags
+ * - 12 bits of flags
+ * - 18 bits share-lock count
+ * - 1 bit share-exclusive locked
+ * - 1 bit exclusive locked
*
* Combining these values allows to perform some operations without locking
* the buffer header, by modifying them together with a CAS loop.
*
- * NB: A future commit will use a significant portion of the remaining bits to
- * implement buffer locking as part of the state variable.
- *
* The definition of buffer state components is below.
*/
#define BUF_REFCOUNT_BITS 18
#define BUF_USAGECOUNT_BITS 4
-#define BUF_FLAG_BITS 10
+#define BUF_FLAG_BITS 12
+#define BUF_LOCK_BITS (18+2)
-StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32,
- "parts of buffer state space need to equal 32");
+StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS + BUF_LOCK_BITS <= 64,
+ "parts of buffer state space need to be <= 64");
/* refcount related definitions */
#define BUF_REFCOUNT_ONE 1
@@ -71,6 +73,19 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32,
#define BUF_FLAG_MASK \
(((UINT64CONST(1) << BUF_FLAG_BITS) - 1) << BUF_FLAG_SHIFT)
+/* lock state related definitions */
+#define BM_LOCK_SHIFT \
+ (BUF_FLAG_SHIFT + BUF_FLAG_BITS)
+#define BM_LOCK_VAL_SHARED \
+ (UINT64CONST(1) << (BM_LOCK_SHIFT))
+#define BM_LOCK_VAL_SHARE_EXCLUSIVE \
+ (UINT64CONST(1) << (BM_LOCK_SHIFT + MAX_BACKENDS_BITS))
+#define BM_LOCK_VAL_EXCLUSIVE \
+ (UINT64CONST(1) << (BM_LOCK_SHIFT + MAX_BACKENDS_BITS + 1))
+#define BM_LOCK_MASK \
+ ((((uint64) MAX_BACKENDS) << BM_LOCK_SHIFT) | BM_LOCK_VAL_SHARE_EXCLUSIVE | BM_LOCK_VAL_EXCLUSIVE)
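+
+/*
+ * With the bit counts above (and the least-significant-bits-first packing
+ * implied by the "in order" list at the top of this file), the 64-bit state
+ * word is laid out roughly as follows:
+ *
+ *   bits  0-17   refcount
+ *   bits 18-21   usage count
+ *   bits 22-33   flags
+ *   bits 34-51   share-lock count (MAX_BACKENDS_BITS wide)
+ *   bit     52   share-exclusive locked
+ *   bit     53   exclusive locked
+ *   bits 54-63   currently unused
+ */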
+
+
/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) \
((uint32)((state) & BUF_REFCOUNT_MASK))
@@ -107,6 +122,17 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32,
#define BM_CHECKPOINT_NEEDED BUF_DEFINE_FLAG( 8)
/* permanent buffer (not unlogged, or init fork) */
#define BM_PERMANENT BUF_DEFINE_FLAG( 9)
+/* content lock has waiters */
+#define BM_LOCK_HAS_WAITERS BUF_DEFINE_FLAG(10)
+/* waiter for content lock has been signalled but not yet run */
+#define BM_LOCK_WAKE_IN_PROGRESS BUF_DEFINE_FLAG(11)
+
+
+StaticAssertDecl(MAX_BACKENDS_BITS <= BUF_REFCOUNT_BITS,
+ "MAX_BACKENDS_BITS needs to be <= BUF_REFCOUNT_BITS");
+StaticAssertDecl(MAX_BACKENDS_BITS <= (BUF_LOCK_BITS - 2),
+ "MAX_BACKENDS_BITS needs to be <= BUF_LOCK_BITS - 2");
+
/*
* The maximum allowed value of usage_count represents a tradeoff between
@@ -120,8 +146,6 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32,
StaticAssertDecl(BM_MAX_USAGE_COUNT < (UINT64CONST(1) << BUF_USAGECOUNT_BITS),
"BM_MAX_USAGE_COUNT doesn't fit in BUF_USAGECOUNT_BITS bits");
-StaticAssertDecl(MAX_BACKENDS_BITS <= BUF_REFCOUNT_BITS,
- "MAX_BACKENDS_BITS needs to be <= BUF_REFCOUNT_BITS");
/*
* Buffer tag identifies which disk block the buffer contains.
@@ -265,9 +289,6 @@ BufMappingPartitionLockByIndex(uint32 index)
* it is held. However, existing buffer pins may be released while the buffer
* header spinlock is held, using an atomic subtraction.
*
- * The LWLock can take care of itself. The buffer header lock is *not* used
- * to control access to the data in the buffer!
- *
* If we have the buffer pinned, its tag can't change underneath us, so we can
* examine the tag without locking the buffer header. Also, in places we do
* one-time reads of the flags without bothering to lock the buffer header;
@@ -280,6 +301,15 @@ BufMappingPartitionLockByIndex(uint32 index)
* wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present,
* there can be only one such waiter per buffer.
*
+ * The content of buffers is protected via the buffer content lock,
+ * implemented as part of the buffer state. Note that the buffer header lock
+ * is *not* used to control access to the data in the buffer! We used to use
+ * an LWLock to implement the content lock, but having a dedicated
+ * implementation of content locks allows us to implement some otherwise hard
+ * things (e.g. race-freely checking if AIO is in progress before locking a
+ * buffer exclusively) and enables otherwise impossible optimizations
+ * (e.g. unlocking and unpinning a buffer in one atomic operation).
+ *
* We use this same struct for local buffer headers, but the locks are not
* used and not all of the flag bits are useful either. To avoid unnecessary
* overhead, manipulations of the state field should be done without actual
@@ -321,7 +351,12 @@ typedef struct BufferDesc
int wait_backend_pgprocno;
PgAioWaitRef io_wref; /* set iff AIO is in progress */
- LWLock content_lock; /* to lock access to buffer contents */
+
+ /*
+ * List of PGPROCs waiting for the buffer content lock. Protected by the
+ * buffer header spinlock.
+ */
+ proclist_head lock_waiters;
} BufferDesc;
/*
@@ -408,12 +443,6 @@ BufferDescriptorGetIOCV(const BufferDesc *bdesc)
return &(BufferIOCVArray[bdesc->buf_id]).cv;
}
-static inline LWLock *
-BufferDescriptorGetContentLock(const BufferDesc *bdesc)
-{
- return (LWLock *) (&bdesc->content_lock);
-}
-
/*
* Functions for acquiring/releasing a shared buffer header's spinlock. Do
* not apply these to local buffers!
@@ -491,18 +520,18 @@ extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
/* ResourceOwner callbacks to hold buffer I/Os and pins */
extern PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc;
-extern PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc;
+extern PGDLLIMPORT const ResourceOwnerDesc buffer_resowner_desc;
/* Convenience wrappers over ResourceOwnerRemember/Forget */
static inline void
ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
{
- ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
+ ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_resowner_desc);
}
static inline void
ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
{
- ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
+ ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_resowner_desc);
}
static inline void
ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 715ae96f0f0..a40adf6b2a8 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -203,7 +203,20 @@ extern PGDLLIMPORT int32 *LocalRefCount;
typedef enum BufferLockMode
{
BUFFER_LOCK_UNLOCK,
+
+ /*
+ * A share lock conflicts with exclusive locks.
+ */
BUFFER_LOCK_SHARE,
+
+ /*
+ * A share-exclusive lock conflicts with itself and exclusive locks.
+ */
+ BUFFER_LOCK_SHARE_EXCLUSIVE,
+
+ /*
+ * An exclusive lock conflicts with every other lock type.
+ */
BUFFER_LOCK_EXCLUSIVE,
} BufferLockMode;
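+
+/*
+ * Conflict matrix implied by the comments above ("X" = conflicts):
+ *
+ *                      SHARE   SHARE_EXCLUSIVE   EXCLUSIVE
+ *  SHARE                 -            -              X
+ *  SHARE_EXCLUSIVE       -            X              X
+ *  EXCLUSIVE             X            X              X
+ */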
@@ -302,7 +315,24 @@ extern void BufferGetTag(Buffer buffer, RelFileLocator *rlocator,
extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std);
extern void UnlockBuffers(void);
-extern void LockBuffer(Buffer buffer, BufferLockMode mode);
+extern void UnlockBuffer(Buffer buffer);
+extern void LockBufferInternal(Buffer buffer, BufferLockMode mode);
+
+/*
+ * Handling BUFFER_LOCK_UNLOCK in bufmgr.c leads to sufficiently worse branch
+ * prediction to impact performance. Therefore handle that case here, where
+ * most of the time `mode` will be a constant and the branch can thus be
+ * optimized out by the compiler.
+ */
+static inline void
+LockBuffer(Buffer buffer, BufferLockMode mode)
+{
+ if (mode == BUFFER_LOCK_UNLOCK)
+ UnlockBuffer(buffer);
+ else
+ LockBufferInternal(buffer, mode);
+}
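+
+/*
+ * Typical usage (a constant mode lets the compiler fold the branch above):
+ *
+ *		LockBuffer(buffer, BUFFER_LOCK_SHARE);
+ *		... examine the page ...
+ *		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ */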
+
extern bool ConditionalLockBuffer(Buffer buffer);
extern void LockBufferForCleanup(Buffer buffer);
extern bool ConditionalLockBufferForCleanup(Buffer buffer);
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index e6b32daff99..e94ebce95b9 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -105,7 +105,6 @@ PG_LWLOCKTRANCHE(NOTIFY_BUFFER, NotifyBuffer)
PG_LWLOCKTRANCHE(NOTIFY_CHANNEL_HASH, NotifyChannelHash)
PG_LWLOCKTRANCHE(SERIAL_BUFFER, SerialBuffer)
PG_LWLOCKTRANCHE(WAL_INSERT, WALInsert)
-PG_LWLOCKTRANCHE(BUFFER_CONTENT, BufferContent)
PG_LWLOCKTRANCHE(REPLICATION_ORIGIN_STATE, ReplicationOriginState)
PG_LWLOCKTRANCHE(REPLICATION_SLOT_IO, ReplicationSlotIO)
PG_LWLOCKTRANCHE(LOCK_FASTPATH, LockFastPath)
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index de7b2e0bd2c..039bc8353be 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -242,7 +242,13 @@ struct PGPROC
*/
bool recoveryConflictPending;
- /* Info about LWLock the process is currently waiting for, if any. */
+ /*
+ * Info about LWLock the process is currently waiting for, if any.
+ *
+ * This is currently used both for lwlocks and buffer content locks, which
+ * is acceptable, although not pretty, because a backend can't wait for
+ * both types of locks at the same time.
+ */
uint8 lwWaiting; /* see LWLockWaitState */
uint8 lwWaitMode; /* lwlock mode being waited for */
proclist_node lwWaitLink; /* position in LW lock wait list */