@@ -32,16 +32,6 @@ static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int	_bt_binsrch_posting(BTScanInsert key, Page page,
								OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
						 OffsetNumber offnum, bool firstpage);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
						 OffsetNumber offnum, IndexTuple itup);
static int	_bt_setuppostingitems(BTScanOpaque so, int itemIndex,
								  OffsetNumber offnum, const ItemPointerData *heapTid,
								  IndexTuple itup);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
									   OffsetNumber offnum,
									   ItemPointer heapTid, int tupleOffset);
static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
@@ -1623,517 +1613,6 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
	return true;
}
/*
 *	_bt_readpage() -- Load data from current index page into so->currPos
 *
 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
 * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
 * they are updated as appropriate.  All other fields of so->currPos are
 * initialized from scratch here.
 *
 * We scan the current page starting at offnum and moving in the indicated
 * direction.  All items matching the scan keys are loaded into currPos.items.
 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
 * that there can be no more matching tuples in the current scan direction
 * (could just be for the current primitive index scan when scan has arrays).
 *
 * In the case of a parallel scan, caller must have called _bt_parallel_seize
 * prior to calling this function; this function will invoke
 * _bt_parallel_release before returning.
 *
 * Returns true if any matching items found on the page, false if none.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
			 bool firstpage)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	BTReadPageState pstate;
	bool		arrayKeys;
	int			itemIndex,
				indnatts;

	/* save the page/buffer block number, along with its sibling links */
	page = BufferGetPage(so->currPos.buf);
	opaque = BTPageGetOpaque(page);
	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
	so->currPos.prevPage = opaque->btpo_prev;
	so->currPos.nextPage = opaque->btpo_next;
	/* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
	so->currPos.dir = dir;
	so->currPos.nextTupleOffset = 0;

	/* either moreRight or moreLeft should be set now (may be unset later) */
	Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
		   so->currPos.moreLeft);
	Assert(!P_IGNORE(opaque));
	Assert(BTScanPosIsPinned(so->currPos));
	Assert(!so->needPrimScan);

	if (scan->parallel_scan)
	{
		/* allow next/prev page to be read by other worker without delay */
		if (ScanDirectionIsForward(dir))
			_bt_parallel_release(scan, so->currPos.nextPage,
								 so->currPos.currPage);
		else
			_bt_parallel_release(scan, so->currPos.prevPage,
								 so->currPos.currPage);
	}

	PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);

	/* initialize local variables */
	indnatts = IndexRelationGetNumberOfAttributes(rel);
	arrayKeys = so->numArrayKeys != 0;
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/* initialize page-level state that we'll pass to _bt_checkkeys */
	pstate.minoff = minoff;
	pstate.maxoff = maxoff;
	pstate.finaltup = NULL;
	pstate.page = page;
	pstate.firstpage = firstpage;
	pstate.forcenonrequired = false;
	pstate.startikey = 0;
	pstate.offnum = InvalidOffsetNumber;
	pstate.skip = InvalidOffsetNumber;
	pstate.continuescan = true; /* default assumption */
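
	/*
	 * Counters below are private _bt_checkkeys state, used to manage its
	 * "look ahead" optimization (only matters for scans with array keys)
	 */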
	pstate.rechecks = 0;
	pstate.targetdistance = 0;
	pstate.nskipadvances = 0;

	if (ScanDirectionIsForward(dir))
	{
		/* SK_SEARCHARRAY forward scans must provide high key up front */
		if (arrayKeys)
		{
			if (!P_RIGHTMOST(opaque))
			{
				ItemId		iid = PageGetItemId(page, P_HIKEY);

				pstate.finaltup = (IndexTuple) PageGetItem(page, iid);

				if (so->scanBehind &&
					!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
				{
					/* Schedule another primitive index scan after all */
					so->currPos.moreRight = false;
					so->needPrimScan = true;
					if (scan->parallel_scan)
						_bt_parallel_primscan_schedule(scan,
													   so->currPos.currPage);
					return false;
				}
			}

			so->scanBehind = so->oppositeDirCheck = false; /* reset */
		}

		/*
		 * Consider pstate.startikey optimization once the ongoing primitive
		 * index scan has already read at least one page
		 */
		if (!pstate.firstpage && minoff < maxoff)
			_bt_set_startikey(scan, &pstate);

		/* load items[] in ascending order */
		itemIndex = 0;

		offnum = Max(offnum, minoff);

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				offnum = OffsetNumberNext(offnum);
				continue;
			}

			itup = (IndexTuple) PageGetItem(page, iid);
			Assert(!BTreeTupleIsPivot(itup));

			pstate.offnum = offnum;
			passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
										 itup, indnatts);

			/*
			 * Check if we need to skip ahead to a later tuple (only possible
			 * when the scan uses array keys)
			 */
			if (arrayKeys && OffsetNumberIsValid(pstate.skip))
			{
				Assert(!passes_quals && pstate.continuescan);
				Assert(offnum < pstate.skip);
				Assert(!pstate.forcenonrequired);

				offnum = pstate.skip;
				pstate.skip = InvalidOffsetNumber;
				continue;
			}

			if (passes_quals)
			{
				/* tuple passes all scan key conditions */
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					_bt_saveitem(so, itemIndex, offnum, itup);
					itemIndex++;
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID
					 */
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					itemIndex++;
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
						itemIndex++;
					}
				}
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!pstate.continuescan)
				break;

			offnum = OffsetNumberNext(offnum);
		}

		/*
		 * We don't need to visit page to the right when the high key
		 * indicates that no more matches will be found there.
		 *
		 * Checking the high key like this works out more often than you might
		 * think.  Leaf page splits pick a split point between the two most
		 * dissimilar tuples (this is weighed against the need to evenly share
		 * free space).  Leaf pages with high key attribute values that can
		 * only appear on non-pivot tuples on the right sibling page are
		 * common.
		 */
		if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, P_HIKEY);
			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
			int			truncatt;

			/* Reset arrays, per _bt_set_startikey contract */
			if (pstate.forcenonrequired)
				_bt_start_array_keys(scan, dir);
			pstate.forcenonrequired = false;
			pstate.startikey = 0;	/* _bt_set_startikey ignores P_HIKEY */

			truncatt = BTreeTupleGetNAtts(itup, rel);
			_bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
		}

		if (!pstate.continuescan)
			so->currPos.moreRight = false;

		Assert(itemIndex <= MaxTIDsPerBTreePage);
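
		/* items[] was filled in ascending order, starting at slot 0 */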
		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		/* SK_SEARCHARRAY backward scans must provide final tuple up front */
		if (arrayKeys)
		{
			if (minoff <= maxoff && !P_LEFTMOST(opaque))
			{
				ItemId		iid = PageGetItemId(page, minoff);

				pstate.finaltup = (IndexTuple) PageGetItem(page, iid);

				if (so->scanBehind &&
					!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
				{
					/* Schedule another primitive index scan after all */
					so->currPos.moreLeft = false;
					so->needPrimScan = true;
					if (scan->parallel_scan)
						_bt_parallel_primscan_schedule(scan,
													   so->currPos.currPage);
					return false;
				}
			}

			so->scanBehind = so->oppositeDirCheck = false; /* reset */
		}

		/*
		 * Consider pstate.startikey optimization once the ongoing primitive
		 * index scan has already read at least one page
		 */
		if (!pstate.firstpage && minoff < maxoff)
			_bt_set_startikey(scan, &pstate);

		/* load items[] in descending order */
		itemIndex = MaxTIDsPerBTreePage;

		offnum = Min(offnum, maxoff);

		while (offnum >= minoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		tuple_alive;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual.  Most of the
			 * time, it's a win to not bother examining the tuple's index
			 * keys, but just skip to the next tuple (previous, actually,
			 * since we're scanning backwards).  However, if this is the first
			 * tuple on the page, we do check the index keys, to prevent
			 * uselessly advancing to the page to the left.  This is similar
			 * to the high key optimization used by forward scans.
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				if (offnum > minoff)
				{
					offnum = OffsetNumberPrev(offnum);
					continue;
				}

				tuple_alive = false;
			}
			else
				tuple_alive = true;

			itup = (IndexTuple) PageGetItem(page, iid);
			Assert(!BTreeTupleIsPivot(itup));

			pstate.offnum = offnum;
			if (arrayKeys && offnum == minoff && pstate.forcenonrequired)
			{
				/* Reset arrays, per _bt_set_startikey contract */
				pstate.forcenonrequired = false;
				pstate.startikey = 0;
				_bt_start_array_keys(scan, dir);
			}
			passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
										 itup, indnatts);

			if (arrayKeys && so->scanBehind)
			{
				/*
				 * Done scanning this page, but not done with the current
				 * primscan.
				 *
				 * Note: Forward scans don't check this explicitly, since they
				 * prefer to reuse pstate.skip for this instead.
				 */
				Assert(!passes_quals && pstate.continuescan);
				Assert(!pstate.forcenonrequired);

				break;
			}

			/*
			 * Check if we need to skip ahead to a later tuple (only possible
			 * when the scan uses array keys)
			 */
			if (arrayKeys && OffsetNumberIsValid(pstate.skip))
			{
				Assert(!passes_quals && pstate.continuescan);
				Assert(offnum > pstate.skip);
				Assert(!pstate.forcenonrequired);

				offnum = pstate.skip;
				pstate.skip = InvalidOffsetNumber;
				continue;
			}

			if (passes_quals && tuple_alive)
			{
				/* tuple passes all scan key conditions */
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					itemIndex--;
					_bt_saveitem(so, itemIndex, offnum, itup);
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID.
					 *
					 * Note that we deliberately save/return items from
					 * posting lists in ascending heap TID order for backwards
					 * scans.  This allows _bt_killitems() to make a
					 * consistent assumption about the order of items
					 * associated with the same posting list tuple.
					 */
					itemIndex--;
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						itemIndex--;
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
					}
				}
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!pstate.continuescan)
				break;

			offnum = OffsetNumberPrev(offnum);
		}

		/*
		 * We don't need to visit page to the left when no more matches will
		 * be found there
		 */
		if (!pstate.continuescan)
			so->currPos.moreLeft = false;

		Assert(itemIndex >= 0);
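
		/*
		 * items[] was filled from the top slot downward (with each posting
		 * list's TIDs still saved in ascending heap TID order); returning
		 * starts at the highest slot, so tuples come back in reverse order
		 */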
		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
		so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
	}

	/*
	 * If _bt_set_startikey told us to temporarily treat the scan's keys as
	 * nonrequired (possible only during scans with array keys), there must be
	 * no lasting consequences for the scan's array keys.  The scan's arrays
	 * should now have exactly the same elements as they would have had if the
	 * nonrequired behavior had never been used.  (In general, a scan's arrays
	 * are expected to track its progress through the index's key space.)
	 *
	 * We are required (by _bt_set_startikey) to call _bt_checkkeys against
	 * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's
	 * arrays to recover.  Assert that that step hasn't been missed.
	 */
	Assert(!pstate.forcenonrequired);

	return (so->currPos.firstItem <= so->currPos.lastItem);
}

/* Save an index item into so->currPos.items[itemIndex] */
static void
_bt_saveitem(BTScanOpaque so, int itemIndex,
			 OffsetNumber offnum, IndexTuple itup)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));

	currItem->heapTid = itup->t_tid;
	currItem->indexOffset = offnum;
	if (so->currTuples)
	{
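		/* copy the whole tuple into the scan's workspace (scan needs index tuples) */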
		Size		itupsz = IndexTupleSize(itup);

		currItem->tupleOffset = so->currPos.nextTupleOffset;
		memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
		so->currPos.nextTupleOffset += MAXALIGN(itupsz);
	}
}

/*
 * Setup state to save TIDs/items from a single posting list tuple.
 *
 * Saves an index item into so->currPos.items[itemIndex] for TID that is
 * returned to scan first.  Second or subsequent TIDs for posting list should
 * be saved by calling _bt_savepostingitem().
 *
 * Returns an offset into tuple storage space that main tuple is stored at if
 * needed.
 */
static int
_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
					  const ItemPointerData *heapTid, IndexTuple itup)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	Assert(BTreeTupleIsPosting(itup));

	currItem->heapTid = *heapTid;
	currItem->indexOffset = offnum;
	if (so->currTuples)
	{
		/* Save base IndexTuple (truncate posting list) */
		IndexTuple	base;
		Size		itupsz = BTreeTupleGetPostingOffset(itup);

		itupsz = MAXALIGN(itupsz);
		currItem->tupleOffset = so->currPos.nextTupleOffset;
		base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
		memcpy(base, itup, itupsz);
		/* Defensively reduce work area index tuple header size */
		base->t_info &= ~INDEX_SIZE_MASK;
		base->t_info |= itupsz;
		so->currPos.nextTupleOffset += itupsz;

		return currItem->tupleOffset;
	}

	return 0;
}

/*
 * Save an index item into so->currPos.items[itemIndex] for current posting
 * tuple.
 *
 * Assumes that _bt_setuppostingitems() has already been called for current
 * posting list tuple.  Caller passes its return value as tupleOffset.
 */
static inline void
_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
					ItemPointer heapTid, int tupleOffset)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	currItem->heapTid = *heapTid;
	currItem->indexOffset = offnum;

	/*
	 * Have index-only scans return the same base IndexTuple for every TID
	 * that originates from the same posting list
	 */
	if (so->currTuples)
		currItem->tupleOffset = tupleOffset;
}

/*
 * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
 * index scan by setting the relevant fields in caller's index scan descriptor