Optimize nbtree backwards scans.

Make nbtree backwards scans optimistically access the next page to be
read to the left by following a prevPage block number that's now stashed
in currPos when the leaf page is first read.  This approach matches the
one taken during forward scans, which follow a symmetric nextPage block
number from currPos.  We stash both a prevPage and a nextPage, since the
scan direction might change (when fetching from a scrollable cursor).

Backwards scans will no longer need to lock the same page twice, except
in rare cases where the scan detects a concurrent page split (or page
deletion).  Testing has shown this optimization to be particularly
effective during parallel index-only backwards scans: ~12% reductions in
query execution time are quite possible.

We're much better off being optimistic; concurrent left sibling page
splits are rare in general.  It's possible that we'll need to lock more
pages than the pessimistic approach would have, but only when there are
_multiple_ concurrent splits of the left sibling page we now start at.
If there's just a single concurrent left sibling page split, the new
approach to scanning backwards will at least break even relative to the
old one (we'll acquire the same number of leaf page locks as before).

The optimization from this commit has long been contemplated by comments
added by commit 2ed5b87f96, which changed the rules for locking/pinning
during nbtree index scans.  The approach that that commit introduced to
leaf level link traversal when scanning forwards is now more or less
applied all the time, regardless of the direction we're scanning in.

Following uniform conventions around sibling link traversal is simpler.
The only real remaining difference between our forward and backwards
handling is that our backwards handling must still detect and recover
from any concurrent left sibling splits (and concurrent page deletions),
as documented in the nbtree README.  That is structured as a single,
isolated extra step that takes place in _bt_readnextpage.

Also use this opportunity to further simplify the functions that deal
with reading pages and traversing sibling links on the leaf level, and
to document their preconditions and postconditions (with respect to
things like buffer locks, buffer pins, and seizing the parallel scan).

This enhancement completely supersedes the one recently added by commit
3f44959f.

Author: Matthias van de Meent <boekewurm+postgres@gmail.com>
Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAEze2WgpBGRgTTxTWVPXc9+PB6fc1a7t+VyGXHzfnrFXcQVxnA@mail.gmail.com
Discussion: https://postgr.es/m/CAH2-WzkBTuFv7W2+84jJT8mWZLXVL0GHq2hMUTn6c9Vw=eYrCw@mail.gmail.com
pull/182/head
Peter Geoghegan 8 months ago
parent 9e2d813d59
commit 1bd4bc85ca
  1. 75
      src/backend/access/nbtree/nbtree.c
  2. 715
      src/backend/access/nbtree/nbtsearch.c
  3. 2
      src/backend/access/nbtree/nbtutils.c
  4. 62
      src/include/access/nbtree.h

@@ -66,7 +66,9 @@ typedef enum
*/
typedef struct BTParallelScanDescData
{
BlockNumber btps_scanPage; /* latest or next page to be scanned */
BlockNumber btps_nextScanPage; /* next page to be scanned */
BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
* btps_nextScanPage */
BTPS_State btps_pageStatus; /* indicates whether next page is
* available for scan. see above for
* possible states of parallel scan. */
@@ -550,7 +552,8 @@ btinitparallelscan(void *target)
BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
SpinLockInit(&bt_target->btps_mutex);
bt_target->btps_scanPage = InvalidBlockNumber;
bt_target->btps_nextScanPage = InvalidBlockNumber;
bt_target->btps_lastCurrPage = InvalidBlockNumber;
bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
ConditionVariableInit(&bt_target->btps_cv);
}
@@ -575,7 +578,8 @@ btparallelrescan(IndexScanDesc scan)
* consistency.
*/
SpinLockAcquire(&btscan->btps_mutex);
btscan->btps_scanPage = InvalidBlockNumber;
btscan->btps_nextScanPage = InvalidBlockNumber;
btscan->btps_lastCurrPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
SpinLockRelease(&btscan->btps_mutex);
}
@@ -591,18 +595,20 @@ btparallelrescan(IndexScanDesc scan)
* start just yet (only backends that call from _bt_first are capable of
* starting primitive index scans, which they indicate by passing first=true).
*
* If the return value is true, *pageno returns the next or current page
* of the scan (depending on the scan direction). An invalid block number
* means the scan hasn't yet started, or that caller needs to start the next
* primitive index scan (if it's the latter case we'll set so.needPrimScan).
* The first time a participating process reaches the last page, it will return
* true and set *pageno to P_NONE; after that, further attempts to seize the
* scan will return false.
* If the return value is true, *next_scan_page returns the next page of the
* scan, and *last_curr_page returns the page that *next_scan_page came from.
* An invalid *next_scan_page means the scan hasn't yet started, or that
* caller needs to start the next primitive index scan (if it's the latter
* case we'll set so.needPrimScan). The first time a participating process
* reaches the last page, it will return true and set *next_scan_page to
* P_NONE; after that, further attempts to seize the scan will return false.
*
* Callers should ignore the value of pageno if the return value is false.
* Callers should ignore the value of *next_scan_page and *last_curr_page if
* the return value is false.
*/
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
BlockNumber *last_curr_page, bool first)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
bool exit_loop = false;
@@ -610,7 +616,17 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
*pageno = P_NONE;
*next_scan_page = P_NONE;
*last_curr_page = InvalidBlockNumber;
/*
* Reset so->currPos, and initialize moreLeft/moreRight such that the next
* call to _bt_readnextpage treats this backend similarly to a serial
* backend that steps from *last_curr_page to *next_scan_page (unless this
* backend's so->currPos is initialized by _bt_readfirstpage before then).
*/
BTScanPosInvalidate(so->currPos);
so->currPos.moreLeft = so->currPos.moreRight = true;
if (first)
{
@@ -660,7 +676,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
array->cur_elem = btscan->btps_arrElems[i];
skey->sk_argument = array->elem_values[array->cur_elem];
}
*pageno = InvalidBlockNumber;
*next_scan_page = InvalidBlockNumber;
exit_loop = true;
}
else
@@ -688,7 +704,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
* of advancing it to a new page!
*/
btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
*pageno = btscan->btps_scanPage;
*next_scan_page = btscan->btps_nextScanPage;
*last_curr_page = btscan->btps_lastCurrPage;
exit_loop = true;
}
SpinLockRelease(&btscan->btps_mutex);
@@ -703,17 +720,21 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
/*
* _bt_parallel_release() -- Complete the process of advancing the scan to a
* new page. We now have the new value btps_scanPage; some other backend
* new page. We now have the new value btps_nextScanPage; another backend
* can now begin advancing the scan.
*
* Callers whose scan uses array keys must save their scan_page argument so
* Callers whose scan uses array keys must save their curr_page argument so
* that it can be passed to _bt_parallel_primscan_schedule, should caller
* determine that another primitive index scan is required. If that happens,
* scan_page won't be scanned by any backend (unless the next primitive index
* scan lands on scan_page).
* determine that another primitive index scan is required.
*
* Note: unlike the serial case, parallel scans don't need to remember both
* sibling links. next_scan_page is whichever link is next given the scan's
* direction. That's all we'll ever need, since the direction of a parallel
* scan can never change.
*/
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
BlockNumber curr_page)
{
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
@@ -722,7 +743,8 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
parallel_scan->ps_offset);
SpinLockAcquire(&btscan->btps_mutex);
btscan->btps_scanPage = scan_page;
btscan->btps_nextScanPage = next_scan_page;
btscan->btps_lastCurrPage = curr_page;
btscan->btps_pageStatus = BTPARALLEL_IDLE;
SpinLockRelease(&btscan->btps_mutex);
ConditionVariableSignal(&btscan->btps_cv);
@@ -778,13 +800,13 @@ _bt_parallel_done(IndexScanDesc scan)
/*
* _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
*
* Caller passes the block number most recently passed to _bt_parallel_release
* Caller passes the curr_page most recently passed to _bt_parallel_release
* by its backend. Caller successfully schedules the next primitive index scan
* if the shared parallel state hasn't been seized since caller's backend last
* advanced the scan.
*/
void
_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
@@ -796,10 +818,11 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
parallel_scan->ps_offset);
SpinLockAcquire(&btscan->btps_mutex);
if (btscan->btps_scanPage == prev_scan_page &&
if (btscan->btps_lastCurrPage == curr_page &&
btscan->btps_pageStatus == BTPARALLEL_IDLE)
{
btscan->btps_scanPage = InvalidBlockNumber;
btscan->btps_nextScanPage = InvalidBlockNumber;
btscan->btps_lastCurrPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
/* Serialize scan's current array keys */

File diff suppressed because it is too large Load Diff

@@ -2407,7 +2407,7 @@ new_prim_scan:
so->needPrimScan = true; /* ...but call _bt_first again */
if (scan->parallel_scan)
_bt_parallel_primscan_schedule(scan, pstate->prev_scan_page);
_bt_parallel_primscan_schedule(scan, so->currPos.currPage);
/* Caller's tuple doesn't match the new qual */
return false;

@@ -925,13 +925,13 @@ typedef BTVacuumPostingData *BTVacuumPosting;
* Index scans work a page at a time: we pin and read-lock the page, identify
* all the matching items on the page and save them in BTScanPosData, then
* release the read-lock while returning the items to the caller for
* processing. This approach minimizes lock/unlock traffic. Note that we
* keep the pin on the index page until the caller is done with all the items
* (this is needed for VACUUM synchronization, see nbtree/README). When we
* are ready to step to the next page, if the caller has told us any of the
* items were killed, we re-lock the page to mark them killed, then unlock.
* Finally we drop the pin and step to the next page in the appropriate
* direction.
* processing. This approach minimizes lock/unlock traffic. We must always
* drop the lock to make it okay for caller to process the returned items.
* Whether or not we can also release the pin during this window will vary.
* We drop the pin eagerly (when safe) to avoid blocking progress by VACUUM
* (see nbtree/README section about making concurrent TID recycling safe).
* We'll always release both the lock and the pin on the current page before
* moving on to its sibling page.
*
* If we are doing an index-only scan, we save the entire IndexTuple for each
* matched item, otherwise only its heap TID and offset. The IndexTuples go
@@ -950,28 +950,15 @@ typedef struct BTScanPosItem /* what we remember about each match */
typedef struct BTScanPosData
{
Buffer buf; /* if valid, the buffer is pinned */
Buffer buf; /* currPage buf (invalid means unpinned) */
XLogRecPtr lsn; /* pos in the WAL stream when page was read */
/* page details as of the saved position's call to _bt_readpage */
BlockNumber currPage; /* page referenced by items array */
BlockNumber nextPage; /* page's right link when we scanned it */
BlockNumber prevPage; /* currPage's left link */
BlockNumber nextPage; /* currPage's right link */
XLogRecPtr lsn; /* currPage's LSN */
/*
* moreLeft and moreRight track whether we think there may be matching
* index entries to the left and right of the current page, respectively.
* We can clear the appropriate one of these flags when _bt_checkkeys()
* sets BTReadPageState.continuescan = false.
*/
bool moreLeft;
bool moreRight;
/*
* Direction of the scan at the time that _bt_readpage was called.
*
* Used by btrestrpos to "restore" the scan's array keys by resetting each
* array to its first element's value (first in this scan direction). This
* avoids the need to directly track the array keys in btmarkpos.
*/
/* scan direction for the saved position's call to _bt_readpage */
ScanDirection dir;
/*
@@ -980,6 +967,13 @@ typedef struct BTScanPosData
*/
int nextTupleOffset;
/*
* moreLeft and moreRight track whether we think there may be matching
* index entries to the left and right of the current page, respectively.
*/
bool moreLeft;
bool moreRight;
/*
* The items array is always ordered in index order (ie, increasing
* indexoffset). When scanning backwards it is convenient to fill the
@@ -1021,11 +1015,8 @@ typedef BTScanPosData *BTScanPos;
)
#define BTScanPosInvalidate(scanpos) \
do { \
(scanpos).currPage = InvalidBlockNumber; \
(scanpos).nextPage = InvalidBlockNumber; \
(scanpos).buf = InvalidBuffer; \
(scanpos).lsn = InvalidXLogRecPtr; \
(scanpos).nextTupleOffset = 0; \
(scanpos).currPage = InvalidBlockNumber; \
} while (0)
/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
@@ -1091,7 +1082,6 @@ typedef struct BTReadPageState
OffsetNumber minoff; /* Lowest non-pivot tuple's offset */
OffsetNumber maxoff; /* Highest non-pivot tuple's offset */
IndexTuple finaltup; /* Needed by scans with array keys */
BlockNumber prev_scan_page; /* previous _bt_parallel_release block */
Page page; /* Page being read */
/* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */
@@ -1192,12 +1182,14 @@ extern int btgettreeheight(Relation rel);
/*
* prototypes for internal functions in nbtree.c
*/
extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
bool first);
extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
BlockNumber *last_curr_page, bool first);
extern void _bt_parallel_release(IndexScanDesc scan,
BlockNumber next_scan_page,
BlockNumber curr_page);
extern void _bt_parallel_done(IndexScanDesc scan);
extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
BlockNumber prev_scan_page);
BlockNumber curr_page);
/*
* prototypes for functions in nbtdedup.c

Loading…
Cancel
Save