Optimize nbtree backwards scans.

Make nbtree backwards scans optimistically access the next page to be
read to the left by following a prevPage block number that's now stashed
in currPos when the leaf page is first read.  This approach matches the
one taken during forward scans, which follow a symmetric nextPage block
number from currPos.  We stash both a prevPage and a nextPage, since the
scan direction might change (when fetching from a scrollable cursor).

Backwards scans will no longer need to lock the same page twice, except
in rare cases where the scan detects a concurrent page split (or page
deletion).  Testing has shown this optimization to be particularly
effective during parallel index-only backwards scans: ~12% reductions in
query execution time are quite possible.

We're much better off being optimistic; concurrent left sibling page
splits are rare in general.  It's possible that we'll need to lock more
pages than the pessimistic approach would have, but only when there are
_multiple_ concurrent splits of the left sibling page we now start at.
If there's just a single concurrent left sibling page split, the new
approach to scanning backwards will at least break even relative to the
old one (we'll acquire the same number of leaf page locks as before).

The optimization from this commit has long been contemplated by comments
added by commit 2ed5b87f96, which changed the rules for locking/pinning
during nbtree index scans.  The approach that that commit introduced to
leaf level link traversal when scanning forwards is now more or less
applied all the time, regardless of the direction we're scanning in.

Following uniform conventions around sibling link traversal is simpler.
The only real remaining difference between our forward and backwards
handling is that our backwards handling must still detect and recover
from any concurrent left sibling splits (and concurrent page deletions),
as documented in the nbtree README.  That is structured as a single,
isolated extra step that takes place in _bt_readnextpage.

Also use this opportunity to further simplify the functions that deal
with reading pages and traversing sibling links on the leaf level, and
to document their preconditions and postconditions (with respect to
things like buffer locks, buffer pins, and seizing the parallel scan).

This enhancement completely supersedes the one recently added by commit
3f44959f.

Author: Matthias van de Meent <boekewurm+postgres@gmail.com>
Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAEze2WgpBGRgTTxTWVPXc9+PB6fc1a7t+VyGXHzfnrFXcQVxnA@mail.gmail.com
Discussion: https://postgr.es/m/CAH2-WzkBTuFv7W2+84jJT8mWZLXVL0GHq2hMUTn6c9Vw=eYrCw@mail.gmail.com
pull/182/head
Peter Geoghegan 8 months ago
parent 9e2d813d59
commit 1bd4bc85ca
 src/backend/access/nbtree/nbtree.c    |  75 +++++----
 src/backend/access/nbtree/nbtsearch.c | 715 ++++++++++++++++----------------
 src/backend/access/nbtree/nbtutils.c  |   2 +-
 src/include/access/nbtree.h           |  62 ++++----
 4 files changed

@@ -66,7 +66,9 @@ typedef enum
  */
 typedef struct BTParallelScanDescData
 {
-    BlockNumber btps_scanPage;      /* latest or next page to be scanned */
+    BlockNumber btps_nextScanPage;  /* next page to be scanned */
+    BlockNumber btps_lastCurrPage;  /* page whose sibling link was copied into
+                                     * btps_nextScanPage */
     BTPS_State  btps_pageStatus;    /* indicates whether next page is
                                      * available for scan. see above for
                                      * possible states of parallel scan. */
@@ -550,7 +552,8 @@ btinitparallelscan(void *target)
     BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
 
     SpinLockInit(&bt_target->btps_mutex);
-    bt_target->btps_scanPage = InvalidBlockNumber;
+    bt_target->btps_nextScanPage = InvalidBlockNumber;
+    bt_target->btps_lastCurrPage = InvalidBlockNumber;
     bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     ConditionVariableInit(&bt_target->btps_cv);
 }
@@ -575,7 +578,8 @@ btparallelrescan(IndexScanDesc scan)
      * consistency.
      */
     SpinLockAcquire(&btscan->btps_mutex);
-    btscan->btps_scanPage = InvalidBlockNumber;
+    btscan->btps_nextScanPage = InvalidBlockNumber;
+    btscan->btps_lastCurrPage = InvalidBlockNumber;
     btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     SpinLockRelease(&btscan->btps_mutex);
 }
@@ -591,18 +595,20 @@ btparallelrescan(IndexScanDesc scan)
  * start just yet (only backends that call from _bt_first are capable of
  * starting primitive index scans, which they indicate by passing first=true).
  *
- * If the return value is true, *pageno returns the next or current page
- * of the scan (depending on the scan direction).  An invalid block number
- * means the scan hasn't yet started, or that caller needs to start the next
- * primitive index scan (if it's the latter case we'll set so.needPrimScan).
- * The first time a participating process reaches the last page, it will return
- * true and set *pageno to P_NONE; after that, further attempts to seize the
- * scan will return false.
+ * If the return value is true, *next_scan_page returns the next page of the
+ * scan, and *last_curr_page returns the page that *next_scan_page came from.
+ * An invalid *next_scan_page means the scan hasn't yet started, or that
+ * caller needs to start the next primitive index scan (if it's the latter
+ * case we'll set so.needPrimScan).  The first time a participating process
+ * reaches the last page, it will return true and set *next_scan_page to
+ * P_NONE; after that, further attempts to seize the scan will return false.
  *
- * Callers should ignore the value of pageno if the return value is false.
+ * Callers should ignore the value of *next_scan_page and *last_curr_page if
+ * the return value is false.
  */
 bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
+                   BlockNumber *last_curr_page, bool first)
 {
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     bool        exit_loop = false;
@@ -610,7 +616,17 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     BTParallelScanDesc btscan;
 
-    *pageno = P_NONE;
+    *next_scan_page = P_NONE;
+    *last_curr_page = InvalidBlockNumber;
+
+    /*
+     * Reset so->currPos, and initialize moreLeft/moreRight such that the next
+     * call to _bt_readnextpage treats this backend similarly to a serial
+     * backend that steps from *last_curr_page to *next_scan_page (unless this
+     * backend's so->currPos is initialized by _bt_readfirstpage before then).
+     */
+    BTScanPosInvalidate(so->currPos);
+    so->currPos.moreLeft = so->currPos.moreRight = true;
 
     if (first)
     {
@@ -660,7 +676,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
                 array->cur_elem = btscan->btps_arrElems[i];
                 skey->sk_argument = array->elem_values[array->cur_elem];
             }
-            *pageno = InvalidBlockNumber;
+            *next_scan_page = InvalidBlockNumber;
             exit_loop = true;
         }
         else
@@ -688,7 +704,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
              * of advancing it to a new page!
              */
             btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
-            *pageno = btscan->btps_scanPage;
+            *next_scan_page = btscan->btps_nextScanPage;
+            *last_curr_page = btscan->btps_lastCurrPage;
             exit_loop = true;
         }
         SpinLockRelease(&btscan->btps_mutex);
@@ -703,17 +720,21 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 /*
  * _bt_parallel_release() -- Complete the process of advancing the scan to a
- *      new page.  We now have the new value btps_scanPage; some other backend
+ *      new page.  We now have the new value btps_nextScanPage; another backend
  *      can now begin advancing the scan.
  *
- * Callers whose scan uses array keys must save their scan_page argument so
+ * Callers whose scan uses array keys must save their curr_page argument so
  * that it can be passed to _bt_parallel_primscan_schedule, should caller
- * determine that another primitive index scan is required.  If that happens,
- * scan_page won't be scanned by any backend (unless the next primitive index
- * scan lands on scan_page).
+ * determine that another primitive index scan is required.
+ *
+ * Note: unlike the serial case, parallel scans don't need to remember both
+ * sibling links.  next_scan_page is whichever link is next given the scan's
+ * direction.  That's all we'll ever need, since the direction of a parallel
+ * scan can never change.
  */
 void
-_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
+_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
+                     BlockNumber curr_page)
 {
     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     BTParallelScanDesc btscan;
@@ -722,7 +743,8 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
                                               parallel_scan->ps_offset);
 
     SpinLockAcquire(&btscan->btps_mutex);
-    btscan->btps_scanPage = scan_page;
+    btscan->btps_nextScanPage = next_scan_page;
+    btscan->btps_lastCurrPage = curr_page;
     btscan->btps_pageStatus = BTPARALLEL_IDLE;
     SpinLockRelease(&btscan->btps_mutex);
     ConditionVariableSignal(&btscan->btps_cv);
@@ -778,13 +800,13 @@ _bt_parallel_done(IndexScanDesc scan)
 /*
  * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
  *
- * Caller passes the block number most recently passed to _bt_parallel_release
+ * Caller passes the curr_page most recently passed to _bt_parallel_release
  * by its backend.  Caller successfully schedules the next primitive index scan
  * if the shared parallel state hasn't been seized since caller's backend last
  * advanced the scan.
  */
 void
-_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
+_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
 {
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
@@ -796,10 +818,11 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
                                               parallel_scan->ps_offset);
 
     SpinLockAcquire(&btscan->btps_mutex);
-    if (btscan->btps_scanPage == prev_scan_page &&
+    if (btscan->btps_lastCurrPage == curr_page &&
         btscan->btps_pageStatus == BTPARALLEL_IDLE)
     {
-        btscan->btps_scanPage = InvalidBlockNumber;
+        btscan->btps_nextScanPage = InvalidBlockNumber;
+        btscan->btps_lastCurrPage = InvalidBlockNumber;
         btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
 
         /* Serialize scan's current array keys */

File diff suppressed because it is too large Load Diff

@@ -2407,7 +2407,7 @@ new_prim_scan:
     so->needPrimScan = true;    /* ...but call _bt_first again */
 
     if (scan->parallel_scan)
-        _bt_parallel_primscan_schedule(scan, pstate->prev_scan_page);
+        _bt_parallel_primscan_schedule(scan, so->currPos.currPage);
 
     /* Caller's tuple doesn't match the new qual */
     return false;

@@ -925,13 +925,13 @@ typedef BTVacuumPostingData *BTVacuumPosting;
  * Index scans work a page at a time: we pin and read-lock the page, identify
  * all the matching items on the page and save them in BTScanPosData, then
  * release the read-lock while returning the items to the caller for
- * processing.  This approach minimizes lock/unlock traffic.  Note that we
- * keep the pin on the index page until the caller is done with all the items
- * (this is needed for VACUUM synchronization, see nbtree/README).  When we
- * are ready to step to the next page, if the caller has told us any of the
- * items were killed, we re-lock the page to mark them killed, then unlock.
- * Finally we drop the pin and step to the next page in the appropriate
- * direction.
+ * processing.  This approach minimizes lock/unlock traffic.  We must always
+ * drop the lock to make it okay for caller to process the returned items.
+ * Whether or not we can also release the pin during this window will vary.
+ * We drop the pin eagerly (when safe) to avoid blocking progress by VACUUM
+ * (see nbtree/README section about making concurrent TID recycling safe).
+ * We'll always release both the lock and the pin on the current page before
+ * moving on to its sibling page.
  *
  * If we are doing an index-only scan, we save the entire IndexTuple for each
  * matched item, otherwise only its heap TID and offset.  The IndexTuples go
@@ -950,28 +950,15 @@ typedef struct BTScanPosItem    /* what we remember about each match */
 
 typedef struct BTScanPosData
 {
-    Buffer      buf;            /* if valid, the buffer is pinned */
+    Buffer      buf;            /* currPage buf (invalid means unpinned) */
 
-    XLogRecPtr  lsn;            /* pos in the WAL stream when page was read */
+    /* page details as of the saved position's call to _bt_readpage */
     BlockNumber currPage;       /* page referenced by items array */
-    BlockNumber nextPage;       /* page's right link when we scanned it */
+    BlockNumber prevPage;       /* currPage's left link */
+    BlockNumber nextPage;       /* currPage's right link */
+    XLogRecPtr  lsn;            /* currPage's LSN */
 
-    /*
-     * moreLeft and moreRight track whether we think there may be matching
-     * index entries to the left and right of the current page, respectively.
-     * We can clear the appropriate one of these flags when _bt_checkkeys()
-     * sets BTReadPageState.continuescan = false.
-     */
-    bool        moreLeft;
-    bool        moreRight;
-
-    /*
-     * Direction of the scan at the time that _bt_readpage was called.
-     *
-     * Used by btrestrpos to "restore" the scan's array keys by resetting each
-     * array to its first element's value (first in this scan direction). This
-     * avoids the need to directly track the array keys in btmarkpos.
-     */
+    /* scan direction for the saved position's call to _bt_readpage */
     ScanDirection dir;
 
     /*
@@ -980,6 +967,13 @@ typedef struct BTScanPosData
      */
     int         nextTupleOffset;
 
+    /*
+     * moreLeft and moreRight track whether we think there may be matching
+     * index entries to the left and right of the current page, respectively.
+     */
+    bool        moreLeft;
+    bool        moreRight;
+
     /*
      * The items array is always ordered in index order (ie, increasing
      * indexoffset).  When scanning backwards it is convenient to fill the
@@ -1021,11 +1015,8 @@ typedef BTScanPosData *BTScanPos;
 )
 #define BTScanPosInvalidate(scanpos) \
     do { \
-        (scanpos).currPage = InvalidBlockNumber; \
-        (scanpos).nextPage = InvalidBlockNumber; \
         (scanpos).buf = InvalidBuffer; \
-        (scanpos).lsn = InvalidXLogRecPtr; \
-        (scanpos).nextTupleOffset = 0; \
+        (scanpos).currPage = InvalidBlockNumber; \
     } while (0)
 
 /* We need one of these for each equality-type SK_SEARCHARRAY scan key */
@@ -1091,7 +1082,6 @@ typedef struct BTReadPageState
     OffsetNumber minoff;        /* Lowest non-pivot tuple's offset */
     OffsetNumber maxoff;        /* Highest non-pivot tuple's offset */
     IndexTuple  finaltup;       /* Needed by scans with array keys */
-    BlockNumber prev_scan_page; /* previous _bt_parallel_release block */
     Page        page;           /* Page being read */
 
     /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */
@@ -1192,12 +1182,14 @@ extern int btgettreeheight(Relation rel);
 /*
  * prototypes for internal functions in nbtree.c
  */
-extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
-                               bool first);
-extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
+extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
+                               BlockNumber *last_curr_page, bool first);
+extern void _bt_parallel_release(IndexScanDesc scan,
+                                 BlockNumber next_scan_page,
+                                 BlockNumber curr_page);
 extern void _bt_parallel_done(IndexScanDesc scan);
 extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
-                                           BlockNumber prev_scan_page);
+                                           BlockNumber curr_page);
 
 /*
  * prototypes for functions in nbtdedup.c

Loading…
Cancel
Save