@ -1,14 +1,14 @@
/*-------------------------------------------------------------------------
/*-------------------------------------------------------------------------
*
*
* nbtsearch . c
* nbtsearch . c
* s earch code for postgres btrees .
* S earch code for postgres btrees .
*
*
*
*
* Portions Copyright ( c ) 1996 - 2002 , PostgreSQL Global Development Group
* Portions Copyright ( c ) 1996 - 2002 , PostgreSQL Global Development Group
* Portions Copyright ( c ) 1994 , Regents of the University of California
* Portions Copyright ( c ) 1994 , Regents of the University of California
*
*
* IDENTIFICATION
* IDENTIFICATION
* $ Header : / cvsroot / pgsql / src / backend / access / nbtree / nbtsearch . c , v 1.73 2003 / 02 / 21 00 : 06 : 21 tgl Exp $
* $ Header : / cvsroot / pgsql / src / backend / access / nbtree / nbtsearch . c , v 1.74 2003 / 02 / 22 00 : 45 : 04 tgl Exp $
*
*
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
*/
@ -19,6 +19,7 @@
# include "access/nbtree.h"
# include "access/nbtree.h"
static Buffer _bt_walk_left ( Relation rel , Buffer buf ) ;
static bool _bt_endpoint ( IndexScanDesc scan , ScanDirection dir ) ;
static bool _bt_endpoint ( IndexScanDesc scan , ScanDirection dir ) ;
@ -79,10 +80,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey,
par_blkno = BufferGetBlockNumber ( * bufP ) ;
par_blkno = BufferGetBlockNumber ( * bufP ) ;
/*
/*
* We need to save the bit image of the index entry we chose in
* We need to save the location of the index entry we chose in
* the parent page on a stack . In case we split the tree , we ' ll
* the parent page on a stack . In case we split the tree , we ' ll
* use this bit image to figure out what our real parent page is ,
* use the stack to work back up to the parent page . We also save
* in case the parent splits while we ' re working lower in the
* the actual downlink ( TID ) to uniquely identify the index entry ,
* in case it moves right while we ' re working lower in the
* tree . See the paper by Lehman and Yao for how this is detected
* tree . See the paper by Lehman and Yao for how this is detected
* and handled . ( We use the child link to disambiguate duplicate
* and handled . ( We use the child link to disambiguate duplicate
* keys in the index - - Lehman and Yao disallow duplicate keys . )
* keys in the index - - Lehman and Yao disallow duplicate keys . )
@ -114,7 +116,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey,
/*
/*
* _bt_moveright ( ) - - move right in the btree if necessary .
* _bt_moveright ( ) - - move right in the btree if necessary .
*
*
* When we drop and reacquire a pointer to a page , it is possible that
* When we follow a pointer to reach a page , it is possible that
* the page has changed in the meanwhile . If this happens , we ' re
* the page has changed in the meanwhile . If this happens , we ' re
* guaranteed that the page has " split right " - - that is , that any
* guaranteed that the page has " split right " - - that is , that any
* data that appeared on the page originally is either on the page
* data that appeared on the page originally is either on the page
@ -148,9 +150,13 @@ _bt_moveright(Relation rel,
* right . ( If the scan key is equal to the high key , we might or
* right . ( If the scan key is equal to the high key , we might or
* might not need to move right ; have to scan the page first anyway . )
* might not need to move right ; have to scan the page first anyway . )
* It could even have split more than once , so scan as far as needed .
* It could even have split more than once , so scan as far as needed .
*
* We also have to move right if we followed a link that brought us to
* a dead page .
*/
*/
while ( ! P_RIGHTMOST ( opaque ) & &
while ( ! P_RIGHTMOST ( opaque ) & &
_bt_compare ( rel , keysz , scankey , page , P_HIKEY ) > 0 )
( P_IGNORE ( opaque ) | |
_bt_compare ( rel , keysz , scankey , page , P_HIKEY ) > 0 ) )
{
{
/* step right one page */
/* step right one page */
BlockNumber rblkno = opaque - > btpo_next ;
BlockNumber rblkno = opaque - > btpo_next ;
@ -161,6 +167,10 @@ _bt_moveright(Relation rel,
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
}
}
if ( P_IGNORE ( opaque ) )
elog ( ERROR , " _bt_moveright: fell off the end of %s " ,
RelationGetRelationName ( rel ) ) ;
return buf ;
return buf ;
}
}
@ -796,7 +806,6 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
OffsetNumber offnum ,
OffsetNumber offnum ,
maxoff ;
maxoff ;
BlockNumber blkno ;
BlockNumber blkno ;
BlockNumber obknum ;
/*
/*
* Don ' t use ItemPointerGetOffsetNumber or you risk to get assertion
* Don ' t use ItemPointerGetOffsetNumber or you risk to get assertion
@ -814,7 +823,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
offnum = OffsetNumberNext ( offnum ) ;
offnum = OffsetNumberNext ( offnum ) ;
else
else
{
{
/* w alk right to the next page with data */
/* W alk right to the next page with data */
for ( ; ; )
for ( ; ; )
{
{
/* if we're at end of scan, release the buffer and return */
/* if we're at end of scan, release the buffer and return */
@ -831,6 +840,8 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
* bufP = _bt_getbuf ( rel , blkno , BT_READ ) ;
* bufP = _bt_getbuf ( rel , blkno , BT_READ ) ;
page = BufferGetPage ( * bufP ) ;
page = BufferGetPage ( * bufP ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
if ( ! P_IGNORE ( opaque ) )
{
maxoff = PageGetMaxOffsetNumber ( page ) ;
maxoff = PageGetMaxOffsetNumber ( page ) ;
/* done if it's not empty */
/* done if it's not empty */
offnum = P_FIRSTDATAKEY ( opaque ) ;
offnum = P_FIRSTDATAKEY ( opaque ) ;
@ -839,53 +850,49 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
}
}
}
}
}
}
else
}
else /* backwards scan */
{
{
if ( offnum > P_FIRSTDATAKEY ( opaque ) )
if ( offnum > P_FIRSTDATAKEY ( opaque ) )
offnum = OffsetNumberPrev ( offnum ) ;
offnum = OffsetNumberPrev ( offnum ) ;
else
else
{
{
/* walk left to the next page with data */
/*
* Walk left to the next page with data . This is much more
* complex than the walk - right case because of the possibility
* that the page to our left splits while we are in flight to it ,
* plus the possibility that the page we were on gets deleted
* after we leave it . See nbtree / README for details .
*/
for ( ; ; )
for ( ; ; )
{
{
/* if we're at end of scan, release the buffer and return */
* bufP = _bt_walk_left ( rel , * bufP ) ;
if ( P_LEFTMOST ( opaque ) )
/* if we're at end of scan, return failure */
if ( * bufP = = InvalidBuffer )
{
{
_bt_relbuf ( rel , * bufP ) ;
ItemPointerSetInvalid ( current ) ;
ItemPointerSetInvalid ( current ) ;
* bufP = so - > btso_curbuf = InvalidBuffer ;
so - > btso_curbuf = InvalidBuffer ;
return false ;
return false ;
}
}
/* step left */
obknum = BufferGetBlockNumber ( * bufP ) ;
blkno = opaque - > btpo_prev ;
_bt_relbuf ( rel , * bufP ) ;
* bufP = _bt_getbuf ( rel , blkno , BT_READ ) ;
page = BufferGetPage ( * bufP ) ;
page = BufferGetPage ( * bufP ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
/*
/*
* If the adjacent page just split , then we have to walk
* Okay , we managed to move left to a non - deleted page .
* right to find the block that ' s now adjacent to where we
* Done if it ' s not half - dead and not empty . Else loop back
* were . Because pages only split right , we don ' t have to
* and do it all again .
* worry about this failing to terminate .
*/
*/
while ( opaque - > btpo_next ! = obknum )
if ( ! P_IGNORE ( opaque ) )
{
{
blkno = opaque - > btpo_next ;
_bt_relbuf ( rel , * bufP ) ;
* bufP = _bt_getbuf ( rel , blkno , BT_READ ) ;
page = BufferGetPage ( * bufP ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
}
/* done if it's not empty */
maxoff = PageGetMaxOffsetNumber ( page ) ;
maxoff = PageGetMaxOffsetNumber ( page ) ;
offnum = maxoff ;
offnum = maxoff ;
if ( ! PageIsEmpty ( page ) & & maxoff > = P_FIRSTDATAKEY ( opaque ) )
if ( ! PageIsEmpty ( page ) & &
maxoff > = P_FIRSTDATAKEY ( opaque ) )
break ;
break ;
}
}
}
}
}
}
}
/* Update scan state */
/* Update scan state */
so - > btso_curbuf = * bufP ;
so - > btso_curbuf = * bufP ;
@ -895,11 +902,133 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
return true ;
return true ;
}
}
/*
* _bt_walk_left ( ) - - step left one page , if possible
*
* The given buffer must be pinned and read - locked . This will be dropped
* before stepping left . On return , we have pin and read lock on the
* returned page , instead .
*
* Returns InvalidBuffer if there is no page to the left ( no lock is held
* in that case ) .
*
* When working on a non - leaf level , it is possible for the returned page
* to be half - dead ; the caller should check that condition and step left
* again if it ' s important .
*/
static Buffer
_bt_walk_left ( Relation rel , Buffer buf )
{
Page page ;
BTPageOpaque opaque ;
page = BufferGetPage ( buf ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
for ( ; ; )
{
BlockNumber obknum ;
BlockNumber lblkno ;
BlockNumber blkno ;
int tries ;
/* if we're at end of tree, release buf and return failure */
if ( P_LEFTMOST ( opaque ) )
{
_bt_relbuf ( rel , buf ) ;
break ;
}
/* remember original page we are stepping left from */
obknum = BufferGetBlockNumber ( buf ) ;
/* step left */
blkno = lblkno = opaque - > btpo_prev ;
_bt_relbuf ( rel , buf ) ;
buf = _bt_getbuf ( rel , blkno , BT_READ ) ;
page = BufferGetPage ( buf ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
/*
* If this isn ' t the page we want , walk right till we find
* what we want - - - but go no more than four hops ( an
* arbitrary limit ) . If we don ' t find the correct page by then ,
* the most likely bet is that the original page got deleted
* and isn ' t in the sibling chain at all anymore , not that its
* left sibling got split more than four times .
*
* Note that it is correct to test P_ISDELETED not P_IGNORE
* here , because half - dead pages are still in the sibling
* chain . Caller must reject half - dead pages if wanted .
*/
tries = 0 ;
for ( ; ; )
{
if ( ! P_ISDELETED ( opaque ) & & opaque - > btpo_next = = obknum )
{
/* Found desired page, return it */
return buf ;
}
if ( P_RIGHTMOST ( opaque ) | | + + tries > 4 )
break ;
blkno = opaque - > btpo_next ;
_bt_relbuf ( rel , buf ) ;
buf = _bt_getbuf ( rel , blkno , BT_READ ) ;
page = BufferGetPage ( buf ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
}
/* Return to the original page to see what's up */
_bt_relbuf ( rel , buf ) ;
buf = _bt_getbuf ( rel , obknum , BT_READ ) ;
page = BufferGetPage ( buf ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
if ( P_ISDELETED ( opaque ) )
{
/*
* It was deleted . Move right to first nondeleted page
* ( there must be one ) ; that is the page that has acquired the
* deleted one ' s keyspace , so stepping left from it will take
* us where we want to be .
*/
for ( ; ; )
{
if ( P_RIGHTMOST ( opaque ) )
elog ( ERROR , " _bt_walk_left: fell off the end of %s " ,
RelationGetRelationName ( rel ) ) ;
blkno = opaque - > btpo_next ;
_bt_relbuf ( rel , buf ) ;
buf = _bt_getbuf ( rel , blkno , BT_READ ) ;
page = BufferGetPage ( buf ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
if ( ! P_ISDELETED ( opaque ) )
break ;
}
/*
* Now return to top of loop , resetting obknum to
* point to this nondeleted page , and try again .
*/
}
else
{
/*
* It wasn ' t deleted ; the explanation had better be
* that the page to the left got split or deleted .
* Without this check , we ' d go into an infinite loop
* if there ' s anything wrong .
*/
if ( opaque - > btpo_prev = = lblkno )
elog ( ERROR , " _bt_walk_left: can't find left sibling in %s " ,
RelationGetRelationName ( rel ) ) ;
/* Okay to try again with new lblkno value */
}
}
return InvalidBuffer ;
}
/*
/*
* _bt_get_endpoint ( ) - - Find the first or last page on a given tree level
* _bt_get_endpoint ( ) - - Find the first or last page on a given tree level
*
*
* If the index is empty , we will return InvalidBuffer ; any other failure
* If the index is empty , we will return InvalidBuffer ; any other failure
* condition causes elog ( ) .
* condition causes elog ( ) . We will not return a dead page .
*
*
* The returned buffer is pinned and read - locked .
* The returned buffer is pinned and read - locked .
*/
*/
@ -941,12 +1070,13 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
* step right if needed to get to it ( this could happen if the
* step right if needed to get to it ( this could happen if the
* page split since we obtained a pointer to it ) .
* page split since we obtained a pointer to it ) .
*/
*/
while ( P_ISDELETED ( opaque ) | |
while ( P_IGNORE ( opaque ) | |
( rightmost & & ! P_RIGHTMOST ( opaque ) ) )
( rightmost & & ! P_RIGHTMOST ( opaque ) ) )
{
{
blkno = opaque - > btpo_next ;
blkno = opaque - > btpo_next ;
if ( blkno = = P_NONE )
if ( blkno = = P_NONE )
elog ( ERROR , " _bt_get_endpoint: ran off end of btree " ) ;
elog ( ERROR , " _bt_get_endpoint: fell off the end of %s " ,
RelationGetRelationName ( rel ) ) ;
_bt_relbuf ( rel , buf ) ;
_bt_relbuf ( rel , buf ) ;
buf = _bt_getbuf ( rel , blkno , BT_READ ) ;
buf = _bt_getbuf ( rel , blkno , BT_READ ) ;
page = BufferGetPage ( buf ) ;
page = BufferGetPage ( buf ) ;
@ -959,7 +1089,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
if ( opaque - > btpo . level < level )
if ( opaque - > btpo . level < level )
elog ( ERROR , " _bt_get_endpoint: btree level %u not found " , level ) ;
elog ( ERROR , " _bt_get_endpoint: btree level %u not found " , level ) ;
/* Step to leftmost or rightmost child page */
/* Descend to leftmost or rightmost child page */
if ( rightmost )
if ( rightmost )
offnum = PageGetMaxOffsetNumber ( page ) ;
offnum = PageGetMaxOffsetNumber ( page ) ;
else
else