@ -94,16 +94,26 @@ typedef struct BtreeCheckState
/* Target page's LSN */
/* Target page's LSN */
XLogRecPtr targetlsn ;
XLogRecPtr targetlsn ;
/*
* Low key : high key of left sibling of target page . Used only for child
* verification . So , ' lowkey ' is kept only when ' readonly ' is set .
*/
IndexTuple lowkey ;
/*
* The rightlink and incomplete split flag of block one level down to the
* target page , which was visited last time via downlink from taget page .
* We use it to check for missing downlinks .
*/
BlockNumber prevrightlink ;
bool previncompletesplit ;
/*
/*
* Mutable state , for optional heapallindexed verification :
* Mutable state , for optional heapallindexed verification :
*/
*/
/* Bloom filter fingerprints B-Tree index */
/* Bloom filter fingerprints B-Tree index */
bloom_filter * filter ;
bloom_filter * filter ;
/* Bloom filter fingerprints downlink blocks within tree */
bloom_filter * downlinkfilter ;
/* Right half of incomplete split marker */
bool rightsplit ;
/* Debug counter */
/* Debug counter */
int64 heaptuplespresent ;
int64 heaptuplespresent ;
} BtreeCheckState ;
} BtreeCheckState ;
@ -137,9 +147,14 @@ static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
BtreeLevel level ) ;
BtreeLevel level ) ;
static void bt_target_page_check ( BtreeCheckState * state ) ;
static void bt_target_page_check ( BtreeCheckState * state ) ;
static BTScanInsert bt_right_page_check_scankey ( BtreeCheckState * state ) ;
static BTScanInsert bt_right_page_check_scankey ( BtreeCheckState * state ) ;
static void bt_downlink_check ( BtreeCheckState * state , BTScanInsert targetkey ,
static void bt_child_check ( BtreeCheckState * state , BTScanInsert targetkey ,
BlockNumber childblock ) ;
OffsetNumber downlinkoffnum ) ;
static void bt_downlink_missing_check ( BtreeCheckState * state ) ;
static void bt_child_highkey_check ( BtreeCheckState * state ,
OffsetNumber target_downlinkoffnum ,
Page loaded_child ,
uint32 target_level ) ;
static void bt_downlink_missing_check ( BtreeCheckState * state , bool rightsplit ,
BlockNumber targetblock , Page target ) ;
static void bt_tuple_present_callback ( Relation index , ItemPointer tid ,
static void bt_tuple_present_callback ( Relation index , ItemPointer tid ,
Datum * values , bool * isnull ,
Datum * values , bool * isnull ,
bool tupleIsAlive , void * checkstate ) ;
bool tupleIsAlive , void * checkstate ) ;
@ -470,20 +485,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
errmsg ( " index \" %s \" cannot be verified using transaction snapshot " ,
errmsg ( " index \" %s \" cannot be verified using transaction snapshot " ,
RelationGetRelationName ( rel ) ) ) ) ;
RelationGetRelationName ( rel ) ) ) ) ;
}
}
else
{
/*
* Extra readonly downlink check .
*
* In readonly case , we know that there cannot be a concurrent
* page split or a concurrent page deletion , which gives us the
* opportunity to verify that every non - ignorable page had a
* downlink one level up . We must be tolerant of interrupted page
* splits and page deletions , though . This is taken care of in
* bt_downlink_missing_check ( ) .
*/
state - > downlinkfilter = bloom_create ( total_pages , work_mem , seed ) ;
}
}
}
Assert ( ! state - > rootdescend | | state - > readonly ) ;
Assert ( ! state - > rootdescend | | state - > readonly ) ;
@ -532,12 +533,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
current . istruerootlevel = true ;
current . istruerootlevel = true ;
while ( current . leftmost ! = P_NONE )
while ( current . leftmost ! = P_NONE )
{
{
/*
* Leftmost page on level cannot be right half of incomplete split .
* This can go stale immediately in ! readonly case .
*/
state - > rightsplit = false ;
/*
/*
* Verify this level , and get left most page for next level down , if
* Verify this level , and get left most page for next level down , if
* not at leaf level
* not at leaf level
@ -561,16 +556,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
IndexInfo * indexinfo = BuildIndexInfo ( state - > rel ) ;
IndexInfo * indexinfo = BuildIndexInfo ( state - > rel ) ;
TableScanDesc scan ;
TableScanDesc scan ;
/* Report on extra downlink checks performed in readonly case */
if ( state - > readonly )
{
ereport ( DEBUG1 ,
( errmsg_internal ( " finished verifying presence of downlink blocks within index \" %s \" with bitset %.2f%% set " ,
RelationGetRelationName ( rel ) ,
100.0 * bloom_prop_bits_set ( state - > downlinkfilter ) ) ) ) ;
bloom_free ( state - > downlinkfilter ) ;
}
/*
/*
* Create our own scan for table_index_build_scan ( ) , rather than
* Create our own scan for table_index_build_scan ( ) , rather than
* getting it to do so for us . This is required so that we can
* getting it to do so for us . This is required so that we can
@ -673,6 +658,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
level . istruerootlevel ?
level . istruerootlevel ?
" (true root level) " : level . level = = 0 ? " (leaf level) " : " " ) ;
" (true root level) " : level . level = = 0 ? " (leaf level) " : " " ) ;
state - > prevrightlink = InvalidBlockNumber ;
state - > previncompletesplit = false ;
do
do
{
{
/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
@ -692,9 +680,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
* mode , and since a page has no links within other pages
* mode , and since a page has no links within other pages
* ( siblings and parent ) once it is marked fully deleted , it
* ( siblings and parent ) once it is marked fully deleted , it
* should be impossible to land on a fully deleted page in
* should be impossible to land on a fully deleted page in
* readonly mode . See bt_downlink _check ( ) for further details .
* readonly mode . See bt_chil d_check ( ) for further details .
*
*
* The bt_downlink _check ( ) P_ISDELETED ( ) check is repeated here so
* The bt_chil d_check ( ) P_ISDELETED ( ) check is repeated here so
* that pages that are only reachable through sibling links get
* that pages that are only reachable through sibling links get
* checked .
* checked .
*/
*/
@ -816,21 +804,57 @@ nextpage:
errmsg ( " circular link chain found in block %u of index \" %s \" " ,
errmsg ( " circular link chain found in block %u of index \" %s \" " ,
current , RelationGetRelationName ( state - > rel ) ) ) ) ;
current , RelationGetRelationName ( state - > rel ) ) ) ) ;
leftcurrent = current ;
current = opaque - > btpo_next ;
if ( state - > lowkey )
{
Assert ( state - > readonly ) ;
pfree ( state - > lowkey ) ;
state - > lowkey = NULL ;
}
/*
/*
* Record if page that is about to become target is the right half of
* Copy current target high key as the low key of right sibling .
* an incomplete page split . This can go stale immediately in
* Allocate memory in upper level context , so it would be cleared
* ! readonly case .
* after reset of target context .
*
* We only need the low key in corner cases of checking child high
* keys . We use high key only when incomplete split on the child level
* falls to the boundary of pages on the target level . See
* bt_child_highkey_check ( ) for details . So , typically we won ' t end
* up doing anything with low key , but it ' s simpler for general case
* high key verification to always have it available .
*
* The correctness of managing low key in the case of concurrent
* splits wasn ' t investigated yet . Thankfully we only need low key
* for readonly verification and concurrent splits won ' t happen .
*/
*/
state - > rightsplit = P_INCOMPLETE_SPLIT ( opaque ) ;
if ( state - > readonly & & ! P_RIGHTMOST ( opaque ) )
{
IndexTuple itup ;
ItemId itemid ;
leftcurrent = current ;
itemid = PageGetItemIdCareful ( state , state - > targetblock ,
current = opaque - > btpo_next ;
state - > target , P_HIKEY ) ;
itup = ( IndexTuple ) PageGetItem ( state - > target , itemid ) ;
state - > lowkey = MemoryContextAlloc ( oldcontext , IndexTupleSize ( itup ) ) ;
memcpy ( state - > lowkey , itup , IndexTupleSize ( itup ) ) ;
}
/* Free page and associated memory for this iteration */
/* Free page and associated memory for this iteration */
MemoryContextReset ( state - > targetcontext ) ;
MemoryContextReset ( state - > targetcontext ) ;
}
}
while ( current ! = P_NONE ) ;
while ( current ! = P_NONE ) ;
if ( state - > lowkey )
{
Assert ( state - > readonly ) ;
pfree ( state - > lowkey ) ;
state - > lowkey = NULL ;
}
/* Don't change context for caller */
/* Don't change context for caller */
MemoryContextSwitchTo ( oldcontext ) ;
MemoryContextSwitchTo ( oldcontext ) ;
@ -863,7 +887,9 @@ nextpage:
* tuple .
* tuple .
*
*
* - That downlink to block was encountered in parent where that ' s expected .
* - That downlink to block was encountered in parent where that ' s expected .
* ( Limited to heapallindexed readonly callers . )
* ( Limited to readonly callers . )
*
* - That high keys of child pages matches corresponding pivot keys in parent .
*
*
* This is also where heapallindexed callers use their Bloom filter to
* This is also where heapallindexed callers use their Bloom filter to
* fingerprint IndexTuples for later table_index_build_scan ( ) verification .
* fingerprint IndexTuples for later table_index_build_scan ( ) verification .
@ -981,22 +1007,26 @@ bt_target_page_check(BtreeCheckState *state)
( uint32 ) state - > targetlsn ) ) ) ;
( uint32 ) state - > targetlsn ) ) ) ;
}
}
/* Fingerprint downlink blocks in heapallindexed + readonly case */
if ( state - > heapallindexed & & state - > readonly & & ! P_ISLEAF ( topaque ) )
{
BlockNumber childblock = BTreeTupleGetDownLink ( itup ) ;
bloom_add_element ( state - > downlinkfilter ,
( unsigned char * ) & childblock ,
sizeof ( BlockNumber ) ) ;
}
/*
/*
* Don ' t try to generate scankey using " negative infinity " item on
* Don ' t try to generate scankey using " negative infinity " item on
* internal pages . They are always truncated to zero attributes .
* internal pages . They are always truncated to zero attributes .
*/
*/
if ( offset_is_negative_infinity ( topaque , offset ) )
if ( offset_is_negative_infinity ( topaque , offset ) )
{
/*
* We don ' t call bt_child_check ( ) for " negative infinity " items .
* But if we ' re performing downlink connectivity check , we do it
* for every item including " negative infinity " one .
*/
if ( ! P_ISLEAF ( topaque ) & & state - > readonly )
{
bt_child_highkey_check ( state ,
offset ,
NULL ,
topaque - > btpo . level ) ;
}
continue ;
continue ;
}
/*
/*
* Readonly callers may optionally verify that non - pivot tuples can
* Readonly callers may optionally verify that non - pivot tuples can
@ -1340,20 +1370,23 @@ bt_target_page_check(BtreeCheckState *state)
* because it has no useful value to compare ) .
* because it has no useful value to compare ) .
*/
*/
if ( ! P_ISLEAF ( topaque ) & & state - > readonly )
if ( ! P_ISLEAF ( topaque ) & & state - > readonly )
{
bt_child_check ( state , skey , offset ) ;
BlockNumber childblock = BTreeTupleGetDownLink ( itup ) ;
bt_downlink_check ( state , skey , childblock ) ;
}
}
}
/*
/*
* * Check if page has a downlink in parent *
* Special case bt_child_highkey_check ( ) call
*
*
* This can only be checked in heapallindexed + readonly case .
* We don ' t pass an real downlink , but we ' ve to finish the level
* processing . If condition is satisfied , we ' ve already processed all the
* downlinks from the target level . But there still might be pages to the
* right of the child page pointer to by our rightmost downlink . And they
* might have missing downlinks . This final call checks for them .
*/
*/
if ( state - > heapallindexed & & state - > readonly )
if ( ! P_ISLEAF ( topaque ) & & P_RIGHTMOST ( topaque ) & & state - > readonly )
bt_downlink_missing_check ( state ) ;
{
bt_child_highkey_check ( state , InvalidOffsetNumber ,
NULL , topaque - > btpo . level ) ;
}
}
}
/*
/*
@ -1497,7 +1530,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
* Top level tree walk caller moves on to next page ( makes it the new
* Top level tree walk caller moves on to next page ( makes it the new
* target ) following recovery from this race . ( cf . The rationale for
* target ) following recovery from this race . ( cf . The rationale for
* child / downlink verification needing a ShareLock within
* child / downlink verification needing a ShareLock within
* bt_downlink _check ( ) , where page deletion is also the main source of
* bt_chil d_check ( ) , where page deletion is also the main source of
* trouble . )
* trouble . )
*
*
* Note that it doesn ' t matter if right sibling page here is actually a
* Note that it doesn ' t matter if right sibling page here is actually a
@ -1570,6 +1603,297 @@ bt_right_page_check_scankey(BtreeCheckState *state)
return bt_mkscankey_pivotsearch ( state - > rel , firstitup ) ;
return bt_mkscankey_pivotsearch ( state - > rel , firstitup ) ;
}
}
/*
* Check if two tuples are binary identical except the block number . So ,
* this function is capable to compare pivot keys on different levels .
*/
static bool
bt_pivot_tuple_identical ( IndexTuple itup1 , IndexTuple itup2 )
{
if ( IndexTupleSize ( itup1 ) ! = IndexTupleSize ( itup2 ) )
return false ;
if ( memcmp ( & itup1 - > t_tid . ip_posid , & itup2 - > t_tid . ip_posid ,
IndexTupleSize ( itup1 ) - offsetof ( ItemPointerData , ip_posid ) ) ! = 0 )
return false ;
return true ;
}
/*---
* Check high keys on the child level . Traverse rightlinks from previous
* downlink to the current one . Check that there are no intermediate pages
* with missing downlinks .
*
* If ' loaded_child ' is given , it ' s assumed to be the page pointed to by the
* downlink referenced by ' downlinkoffnum ' of the target page .
*
* Basically this function is called for each target downlink and checks two
* invariants :
*
* 1 ) You can reach the next child from previous one via rightlinks ;
* 2 ) Each child high key have matching pivot key on target level .
*
* Consider the sample tree picture .
*
* 1
* / \
* 2 < - > 3
* / \ / \
* 4 < > 5 < > 6 < > 7 < > 8
*
* This function will be called for blocks 4 , 5 , 6 and 8. Consider what is
* happening for each function call .
*
* - The function call for block 4 initializes data structure and matches high
* key of block 4 to downlink ' s pivot key of block 2.
* - The high key of block 5 is matched to the high key of block 2.
* - The block 6 has an incomplete split flag set , so its high key isn ' t
* matched to anything .
* - The function call for block 8 checks that block 8 can be found while
* following rightlinks from block 6. The high key of block 7 will be
* matched to downlink ' s pivot key in block 3.
*
* There is also final call of this function , which checks that there is no
* missing downlinks for children to the right of the child referenced by
* rightmost downlink in target level .
*/
static void
bt_child_highkey_check ( BtreeCheckState * state ,
OffsetNumber target_downlinkoffnum ,
Page loaded_child ,
uint32 target_level )
{
BlockNumber blkno = state - > prevrightlink ;
Page page ;
BTPageOpaque opaque ;
bool rightsplit = state - > previncompletesplit ;
bool first = true ;
ItemId itemid ;
IndexTuple itup ;
BlockNumber downlink ;
if ( OffsetNumberIsValid ( target_downlinkoffnum ) )
{
itemid = PageGetItemIdCareful ( state , state - > targetblock ,
state - > target , target_downlinkoffnum ) ;
itup = ( IndexTuple ) PageGetItem ( state - > target , itemid ) ;
downlink = BTreeTupleGetDownLink ( itup ) ;
}
else
{
downlink = P_NONE ;
}
/*
* If no previous rightlink is memorized for current level just below
* target page ' s level , we are about to start from the leftmost page . We
* can ' t follow rightlinks from previous page , because there is no
* previous page . But we still can match high key .
*
* So we initialize variables for the loop above like there is previous
* page referencing current child . Also we imply previous page to not
* have incomplete split flag , that would make us require downlink for
* current child . That ' s correct , because leftmost page on the level
* should always have parent downlink .
*/
if ( ! BlockNumberIsValid ( blkno ) )
{
blkno = downlink ;
rightsplit = false ;
}
/* Move to the right on the child level */
while ( true )
{
/*
* Did we traverse the whole tree level and this is check for pages to
* the right of rightmost downlink ?
*/
if ( blkno = = P_NONE & & downlink = = P_NONE )
{
state - > prevrightlink = InvalidBlockNumber ;
state - > previncompletesplit = false ;
return ;
}
/* Did we traverse the whole tree level and don't find next downlink? */
if ( blkno = = P_NONE )
ereport ( ERROR ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
errmsg ( " can't traverse from downlink %u to downlink %u of index \" %s \" " ,
state - > prevrightlink , downlink ,
RelationGetRelationName ( state - > rel ) ) ) ) ;
/* Load page contents */
if ( blkno = = downlink & & loaded_child )
page = loaded_child ;
else
page = palloc_btree_page ( state , blkno ) ;
opaque = ( BTPageOpaque ) PageGetSpecialPointer ( page ) ;
/* The first page we visit at the level should be leftmost */
if ( first & & ! BlockNumberIsValid ( state - > prevrightlink ) & & ! P_LEFTMOST ( opaque ) )
ereport ( ERROR ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
errmsg ( " the first child of leftmost target page is not leftmost of its level in index \" %s \" " ,
RelationGetRelationName ( state - > rel ) ) ,
errdetail_internal ( " Target block=%u child block=%u target page lsn=%X/%X. " ,
state - > targetblock , blkno ,
( uint32 ) ( state - > targetlsn > > 32 ) ,
( uint32 ) state - > targetlsn ) ) ) ;
/* Check level for non-ignorable page */
if ( ! P_IGNORE ( opaque ) & & opaque - > btpo . level ! = target_level - 1 )
ereport ( ERROR ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
errmsg ( " block found while following rightlinks from child of index \" %s \" has invalid level " ,
RelationGetRelationName ( state - > rel ) ) ,
errdetail_internal ( " Block pointed to=%u expected level=%u level in pointed to block=%u. " ,
blkno , target_level - 1 , opaque - > btpo . level ) ) ) ;
/* Try to detect circular links */
if ( ( ! first & & blkno = = state - > prevrightlink ) | | blkno = = opaque - > btpo_prev )
ereport ( ERROR ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
errmsg ( " circular link chain found in block %u of index \" %s \" " ,
blkno , RelationGetRelationName ( state - > rel ) ) ) ) ;
if ( blkno ! = downlink & & ! P_IGNORE ( opaque ) )
{
/* blkno probably has missing parent downlink */
bt_downlink_missing_check ( state , rightsplit , blkno , page ) ;
}
rightsplit = P_INCOMPLETE_SPLIT ( opaque ) ;
/*
* If we visit page with high key , check that it is be equal to the
* target key next to corresponding downlink .
*/
if ( ! rightsplit & & ! P_RIGHTMOST ( opaque ) )
{
BTPageOpaque topaque ;
IndexTuple highkey ;
OffsetNumber pivotkey_offset ;
/* Get high key */
itemid = PageGetItemIdCareful ( state , blkno , page , P_HIKEY ) ;
highkey = ( IndexTuple ) PageGetItem ( page , itemid ) ;
/*
* There might be two situations when we examine high key . If
* current child page is referenced by given target downlink , we
* should look to the next offset number for matching key from
* target page .
*
* Alternatively , we ' re following rightlinks somewhere in the
* middle between page referenced by previous target ' s downlink
* and the page referenced by current target ' s downlink . If
* current child page hasn ' t incomplete split flag set , then its
* high key should match to the target ' s key of current offset
* number . This happens when a previous call here ( to
* bt_child_highkey_check ( ) ) found an incomplete split , and we
* reach a right sibling page without a downlink - - the right
* sibling page ' s high key still needs to be matched to a
* separator key on the parent / target level .
*
* Don ' t apply OffsetNumberNext ( ) to target_downlinkoffnum when we
* already had to step right on the child level . Our traversal of
* the child level must try to move in perfect lockstep behind ( to
* the left of ) the target / parent level traversal .
*/
if ( blkno = = downlink )
pivotkey_offset = OffsetNumberNext ( target_downlinkoffnum ) ;
else
pivotkey_offset = target_downlinkoffnum ;
topaque = ( BTPageOpaque ) PageGetSpecialPointer ( state - > target ) ;
if ( ! offset_is_negative_infinity ( topaque , pivotkey_offset ) )
{
/*
* If we ' re looking for the next pivot tuple in target page ,
* but there is no more pivot tuples , then we should match to
* high key instead .
*/
if ( pivotkey_offset > PageGetMaxOffsetNumber ( state - > target ) )
{
if ( P_RIGHTMOST ( topaque ) )
ereport ( ERROR ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
errmsg ( " child high key is greater than rightmost pivot key on target level in index \" %s \" " ,
RelationGetRelationName ( state - > rel ) ) ,
errdetail_internal ( " Target block=%u child block=%u target page lsn=%X/%X. " ,
state - > targetblock , blkno ,
( uint32 ) ( state - > targetlsn > > 32 ) ,
( uint32 ) state - > targetlsn ) ) ) ;
pivotkey_offset = P_HIKEY ;
}
itemid = PageGetItemIdCareful ( state , state - > targetblock ,
state - > target , pivotkey_offset ) ;
itup = ( IndexTuple ) PageGetItem ( state - > target , itemid ) ;
}
else
{
/*
* We cannot try to match child ' s high key to a negative
* infinity key in target , since there is nothing to compare .
* However , it ' s still possible to match child ' s high key
* outside of target page . The reason why we ' re are is that
* bt_child_highkey_check ( ) was previously called for the
* cousin page of ' loaded_child ' , which is incomplete split .
* So , now we traverse to the right of that cousin page and
* current child level page under consideration still belongs
* to the subtree of target ' s left sibling . Thus , we need to
* match child ' s high key to it ' s left uncle page high key .
* Thankfully we saved it , it ' s called a " low key " of target
* page .
*/
if ( ! state - > lowkey )
ereport ( ERROR ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
errmsg ( " can't find left sibling high key in index \" %s \" " ,
RelationGetRelationName ( state - > rel ) ) ,
errdetail_internal ( " Target block=%u child block=%u target page lsn=%X/%X. " ,
state - > targetblock , blkno ,
( uint32 ) ( state - > targetlsn > > 32 ) ,
( uint32 ) state - > targetlsn ) ) ) ;
itup = state - > lowkey ;
}
if ( ! bt_pivot_tuple_identical ( highkey , itup ) )
{
ereport ( ERROR ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
errmsg ( " mismatch between parent key and child high key in index \" %s \" " ,
RelationGetRelationName ( state - > rel ) ) ,
errdetail_internal ( " Target block=%u child block=%u target page lsn=%X/%X. " ,
state - > targetblock , blkno ,
( uint32 ) ( state - > targetlsn > > 32 ) ,
( uint32 ) state - > targetlsn ) ) ) ;
}
}
/* Exit if we already found next downlink */
if ( blkno = = downlink )
{
state - > prevrightlink = opaque - > btpo_next ;
state - > previncompletesplit = rightsplit ;
return ;
}
/* Traverse to the next page using rightlink */
blkno = opaque - > btpo_next ;
/* Free page contents if it's allocated by us */
if ( page ! = loaded_child )
pfree ( page ) ;
first = false ;
}
}
/*
/*
* Checks one of target ' s downlink against its child page .
* Checks one of target ' s downlink against its child page .
*
*
@ -1578,15 +1902,28 @@ bt_right_page_check_scankey(BtreeCheckState *state)
* The downlink insertion into the target is probably where any problem raised
* The downlink insertion into the target is probably where any problem raised
* here arises , and there is no such thing as a parent link , so doing the
* here arises , and there is no such thing as a parent link , so doing the
* verification this way around is much more practical .
* verification this way around is much more practical .
*
* This function visits child page and it ' s sequentially called for each
* downlink of target page . Assuming this we also check downlink connectivity
* here in order to save child page visits .
*/
*/
static void
static void
bt_downlink_check ( BtreeCheckState * state , BTScanInsert targetkey ,
bt_chil d_check ( BtreeCheckState * state , BTScanInsert targetkey ,
BlockNumber childblock )
OffsetNumber downlinkoffnum )
{
{
ItemId itemid ;
IndexTuple itup ;
BlockNumber childblock ;
OffsetNumber offset ;
OffsetNumber offset ;
OffsetNumber maxoffset ;
OffsetNumber maxoffset ;
Page child ;
Page child ;
BTPageOpaque copaque ;
BTPageOpaque copaque ;
BTPageOpaque topaque ;
itemid = PageGetItemIdCareful ( state , state - > targetblock ,
state - > target , downlinkoffnum ) ;
itup = ( IndexTuple ) PageGetItem ( state - > target , itemid ) ;
childblock = BTreeTupleGetDownLink ( itup ) ;
/*
/*
* Caller must have ShareLock on target relation , because of
* Caller must have ShareLock on target relation , because of
@ -1636,10 +1973,18 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
* Check all items , rather than checking just the first and trusting that
* Check all items , rather than checking just the first and trusting that
* the operator class obeys the transitive law .
* the operator class obeys the transitive law .
*/
*/
topaque = ( BTPageOpaque ) PageGetSpecialPointer ( state - > target ) ;
child = palloc_btree_page ( state , childblock ) ;
child = palloc_btree_page ( state , childblock ) ;
copaque = ( BTPageOpaque ) PageGetSpecialPointer ( child ) ;
copaque = ( BTPageOpaque ) PageGetSpecialPointer ( child ) ;
maxoffset = PageGetMaxOffsetNumber ( child ) ;
maxoffset = PageGetMaxOffsetNumber ( child ) ;
/*
* Since we ' ve already loaded the child block , combine this check with
* check for downlink connectivity .
*/
bt_child_highkey_check ( state , downlinkoffnum ,
child , topaque - > btpo . level ) ;
/*
/*
* Since there cannot be a concurrent VACUUM operation in readonly mode ,
* Since there cannot be a concurrent VACUUM operation in readonly mode ,
* and since a page has no links within other pages ( siblings and parent )
* and since a page has no links within other pages ( siblings and parent )
@ -1734,23 +2079,27 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
* be concerned about concurrent page splits or page deletions .
* be concerned about concurrent page splits or page deletions .
*/
*/
static void
static void
bt_downlink_missing_check ( BtreeCheckState * state )
bt_downlink_missing_check ( BtreeCheckState * state , bool rightsplit ,
BlockNumber blkno , Page page )
{
{
BTPageOpaque t opaque = ( BTPageOpaque ) PageGetSpecialPointer ( state - > tar get ) ;
BTPageOpaque opaque = ( BTPageOpaque ) PageGetSpecialPointer ( pa ge) ;
ItemId itemid ;
ItemId itemid ;
IndexTuple itup ;
IndexTuple itup ;
Page child ;
Page child ;
BTPageOpaque copaque ;
BTPageOpaque copaque ;
uint32 level ;
uint32 level ;
BlockNumber childblk ;
BlockNumber childblk ;
XLogRecPtr pagelsn ;
Assert ( state - > heapallindexed & & state - > readonly ) ;
Assert ( state - > readonly ) ;
Assert ( ! P_IGNORE ( t opaque) ) ;
Assert ( ! P_IGNORE ( opaque ) ) ;
/* No next level up with downlinks to fingerprint from the true root */
/* No next level up with downlinks to fingerprint from the true root */
if ( P_ISROOT ( t opaque) )
if ( P_ISROOT ( opaque ) )
return ;
return ;
pagelsn = PageGetLSN ( page ) ;
/*
/*
* Incomplete ( interrupted ) page splits can account for the lack of a
* Incomplete ( interrupted ) page splits can account for the lack of a
* downlink . Some inserting transaction should eventually complete the
* downlink . Some inserting transaction should eventually complete the
@ -1772,54 +2121,47 @@ bt_downlink_missing_check(BtreeCheckState *state)
* by design , so it shouldn ' t be taken to indicate corruption . See
* by design , so it shouldn ' t be taken to indicate corruption . See
* _bt_pagedel ( ) for full details .
* _bt_pagedel ( ) for full details .
*/
*/
if ( state - > rightsplit )
if ( rightsplit )
{
{
ereport ( DEBUG1 ,
ereport ( DEBUG1 ,
( errcode ( ERRCODE_NO_DATA ) ,
( errcode ( ERRCODE_NO_DATA ) ,
errmsg ( " harmless interrupted page split detected in index %s " ,
errmsg ( " harmless interrupted page split detected in index %s " ,
RelationGetRelationName ( state - > rel ) ) ,
RelationGetRelationName ( state - > rel ) ) ,
errdetail_internal ( " Block=%u level=%u left sibling=%u page lsn=%X/%X. " ,
errdetail_internal ( " Block=%u level=%u left sibling=%u page lsn=%X/%X. " ,
state - > target block , t opaque- > btpo . level ,
blkn o , opaque - > btpo . level ,
t opaque- > btpo_prev ,
opaque - > btpo_prev ,
( uint32 ) ( state - > tar get lsn > > 32 ) ,
( uint32 ) ( pa gelsn > > 32 ) ,
( uint32 ) state - > tar get lsn) ) ) ;
( uint32 ) pa gelsn) ) ) ;
return ;
return ;
}
}
/* Target's downlink is typically present in parent/fingerprinted */
if ( ! bloom_lacks_element ( state - > downlinkfilter ,
( unsigned char * ) & state - > targetblock ,
sizeof ( BlockNumber ) ) )
return ;
/*
/*
* Target is probably the " top parent " of a multi - level page deletion .
* Page under check is probably the " top parent " of a multi - level page
* We ' ll need to descend the subtree to make sure that descendant pages
* deletion . We ' ll need to descend the subtree to make sure that
* are consistent with that , though .
* descendant pages are consistent with that , though .
*
*
* If the target page ( which must be non - ignorable ) is a leaf page , then
* If the page ( which must be non - ignorable ) is a leaf page , then clearly
* clearly it can ' t be the top parent . The lack of a downlink is probably
* it can ' t be the top parent . The lack of a downlink is probably a
* a symptom of a broad problem that could just as easily cause
* symptom of a broad problem that could just as easily cause
* inconsistencies anywhere else .
* inconsistencies anywhere else .
*/
*/
if ( P_ISLEAF ( t opaque) )
if ( P_ISLEAF ( opaque ) )
ereport ( ERROR ,
ereport ( ERROR ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
errmsg ( " leaf index block lacks downlink in index \" %s \" " ,
errmsg ( " leaf index block lacks downlink in index \" %s \" " ,
RelationGetRelationName ( state - > rel ) ) ,
RelationGetRelationName ( state - > rel ) ) ,
errdetail_internal ( " Block=%u page lsn=%X/%X. " ,
errdetail_internal ( " Block=%u page lsn=%X/%X. " ,
state - > targetblock ,
blkno ,
( uint32 ) ( state - > tar get lsn > > 32 ) ,
( uint32 ) ( pa gelsn > > 32 ) ,
( uint32 ) state - > tar get lsn) ) ) ;
( uint32 ) pa gelsn) ) ) ;
/* Descend from the target page, which is an internal page */
/* Descend from the given page, which is an internal page */
elog ( DEBUG1 , " checking for interrupted multi-level deletion due to missing downlink in index \" %s \" " ,
elog ( DEBUG1 , " checking for interrupted multi-level deletion due to missing downlink in index \" %s \" " ,
RelationGetRelationName ( state - > rel ) ) ;
RelationGetRelationName ( state - > rel ) ) ;
level = topaque - > btpo . level ;
level = opaque - > btpo . level ;
itemid = PageGetItemIdCareful ( state , state - > targetblock , state - > target ,
itemid = PageGetItemIdCareful ( state , blkno , page , P_FIRSTDATAKEY ( opaque ) ) ;
P_FIRSTDATAKEY ( topaque ) ) ;
itup = ( IndexTuple ) PageGetItem ( page , itemid ) ;
itup = ( IndexTuple ) PageGetItem ( state - > target , itemid ) ;
childblk = BTreeTupleGetDownLink ( itup ) ;
childblk = BTreeTupleGetDownLink ( itup ) ;
for ( ; ; )
for ( ; ; )
{
{
@ -1837,8 +2179,8 @@ bt_downlink_missing_check(BtreeCheckState *state)
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
errmsg_internal ( " downlink points to block in index \" %s \" whose level is not one level down " ,
errmsg_internal ( " downlink points to block in index \" %s \" whose level is not one level down " ,
RelationGetRelationName ( state - > rel ) ) ,
RelationGetRelationName ( state - > rel ) ) ,
errdetail_internal ( " Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u. " ,
errdetail_internal ( " Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u. " ,
state - > targetblock , childblk ,
blkno , childblk ,
level - 1 , copaque - > btpo . level ) ) ) ;
level - 1 , copaque - > btpo . level ) ) ) ;
level = copaque - > btpo . level ;
level = copaque - > btpo . level ;
@ -1854,12 +2196,12 @@ bt_downlink_missing_check(BtreeCheckState *state)
* Since there cannot be a concurrent VACUUM operation in readonly mode ,
* Since there cannot be a concurrent VACUUM operation in readonly mode ,
* and since a page has no links within other pages ( siblings and parent )
* and since a page has no links within other pages ( siblings and parent )
* once it is marked fully deleted , it should be impossible to land on a
* once it is marked fully deleted , it should be impossible to land on a
* fully deleted page . See bt_downlink _check ( ) for further details .
* fully deleted page . See bt_chil d_check ( ) for further details .
*
*
* The bt_downlink _check ( ) P_ISDELETED ( ) check is repeated here because
* The bt_chil d_check ( ) P_ISDELETED ( ) check is repeated here because
* bt_downlink _check ( ) does not visit pages reachable through negative
* bt_chil d_check ( ) does not visit pages reachable through negative
* infinity items . Besides , bt_downlink _check ( ) is unwilling to descend
* infinity items . Besides , bt_chil d_check ( ) is unwilling to descend
* multiple levels . ( The similar bt_downlink _check ( ) P_ISDELETED ( ) check
* multiple levels . ( The similar bt_chil d_check ( ) P_ISDELETED ( ) check
* within bt_check_level_from_leftmost ( ) won ' t reach the page either ,
* within bt_check_level_from_leftmost ( ) won ' t reach the page either ,
* since the leaf ' s live siblings should have their sibling links updated
* since the leaf ' s live siblings should have their sibling links updated
* to bypass the deletion target page when it is marked fully dead . )
* to bypass the deletion target page when it is marked fully dead . )
@ -1875,26 +2217,26 @@ bt_downlink_missing_check(BtreeCheckState *state)
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
( errcode ( ERRCODE_INDEX_CORRUPTED ) ,
errmsg_internal ( " downlink to deleted leaf page found in index \" %s \" " ,
errmsg_internal ( " downlink to deleted leaf page found in index \" %s \" " ,
RelationGetRelationName ( state - > rel ) ) ,
RelationGetRelationName ( state - > rel ) ) ,
errdetail_internal ( " Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X. " ,
errdetail_internal ( " Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X. " ,
state - > targetblock , childblk ,
blkno , childblk ,
( uint32 ) ( state - > tar get lsn > > 32 ) ,
( uint32 ) ( pa gelsn > > 32 ) ,
( uint32 ) state - > tar get lsn) ) ) ;
( uint32 ) pa gelsn) ) ) ;
/*
/*
* Iff leaf page is half - dead , its high key top parent link should point
* Iff leaf page is half - dead , its high key top parent link should point
* to what VACUUM considered to be the top parent page at the instant it
* to what VACUUM considered to be the top parent page at the instant it
* was interrupted . Provided the high key link actually points to the
* was interrupted . Provided the high key link actually points to the
* target page , the missing downlink we detected is consistent with there
* page under check , the missing downlink we detected is consistent with
* having been an interrupted multi - level page deletion . This means that
* there having been an interrupted multi - level page deletion . This means
* the subtree with the target page at its root ( a page deletion chain ) is
* that th e subtree with the page under check at its root ( a page deletion
* in a consistent state , enabling VACUUM to resume deleting the entir e
* chain ) is in a consistent state , enabling VACUUM to resume deleting the
* chain the next time it encounters the half - dead leaf page .
* entire chain the next time it encounters the half - dead leaf page .
*/
*/
if ( P_ISHALFDEAD ( copaque ) & & ! P_RIGHTMOST ( copaque ) )
if ( P_ISHALFDEAD ( copaque ) & & ! P_RIGHTMOST ( copaque ) )
{
{
itemid = PageGetItemIdCareful ( state , childblk , child , P_HIKEY ) ;
itemid = PageGetItemIdCareful ( state , childblk , child , P_HIKEY ) ;
itup = ( IndexTuple ) PageGetItem ( child , itemid ) ;
itup = ( IndexTuple ) PageGetItem ( child , itemid ) ;
if ( BTreeTupleGetTopParent ( itup ) = = state - > targetblock )
if ( BTreeTupleGetTopParent ( itup ) = = blkno )
return ;
return ;
}
}
@ -1903,9 +2245,9 @@ bt_downlink_missing_check(BtreeCheckState *state)
errmsg ( " internal index block lacks downlink in index \" %s \" " ,
errmsg ( " internal index block lacks downlink in index \" %s \" " ,
RelationGetRelationName ( state - > rel ) ) ,
RelationGetRelationName ( state - > rel ) ) ,
errdetail_internal ( " Block=%u level=%u page lsn=%X/%X. " ,
errdetail_internal ( " Block=%u level=%u page lsn=%X/%X. " ,
state - > target block , t opaque- > btpo . level ,
blkn o , opaque - > btpo . level ,
( uint32 ) ( state - > tar get lsn > > 32 ) ,
( uint32 ) ( pa gelsn > > 32 ) ,
( uint32 ) state - > tar get lsn) ) ) ;
( uint32 ) pa gelsn) ) ) ;
}
}
/*
/*
@ -2146,7 +2488,7 @@ bt_posting_plain_tuple(IndexTuple itup, int n)
* separator key value won ' t always be available from parent , though , because
* separator key value won ' t always be available from parent , though , because
* the first items of internal pages are negative infinity items , truncated
* the first items of internal pages are negative infinity items , truncated
* down to zero attributes during internal page splits . While it ' s true that
* down to zero attributes during internal page splits . While it ' s true that
* bt_downlink _check ( ) and the high key check can detect most imaginable key
* bt_chil d_check ( ) and the high key check can detect most imaginable key
* space problems , there are remaining problems it won ' t detect with non - pivot
* space problems , there are remaining problems it won ' t detect with non - pivot
* tuples in cousin leaf pages . Starting a search from the root for every
* tuples in cousin leaf pages . Starting a search from the root for every
* existing leaf tuple detects small inconsistencies in upper levels of the
* existing leaf tuple detects small inconsistencies in upper levels of the