@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
*    $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.41 2003/09/02 18:13:31 tgl Exp $
*    $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.42 2003/09/04 22:06:27 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages.  The opaque
@@ -26,54 +26,201 @@
*
* -------------------------------------------------------------------------
*/
# include "postgres.h"
# include "access/genam.h"
# include "access/hash.h"
# include "miscadmin.h"
# include "storage/lmgr.h"
# include "utils/lsyscache.h"
static void _hash_splitbucket ( Relation rel , Buffer metabuf ,
Bucket obucket , Bucket nbucket ,
BlockNumber start_oblkno ,
BlockNumber start_nblkno ,
uint32 maxbucket ,
uint32 highmask , uint32 lowmask ) ;
/*
 * We use high-concurrency locking on hash indexes (see README for an overview
 * of the locking rules).  There are two cases in which we don't do locking.
 * One is when the index is newly created in the current transaction.  Since
 * the creating transaction has not committed, no one else can see the index,
 * and there's no reason to take locks.  The second case is for temp
 * relations, which no one else can see either.  (We still take buffer-level
 * locks, but not lmgr locks.)
 */
#define USELOCKING(rel)     (!((rel)->rd_isnew || (rel)->rd_istemp))
/*
 * We use high-concurrency locking on hash indices.  There are two cases in
 * which we don't do locking.  One is when we're building the index.
 * Since the creating transaction has not committed, no one can see
 * the index, and there's no reason to share locks.  The second case
 * is when we're just starting up the database system.  We use some
 * special-purpose initialization code in the relation cache manager
 * (see utils/cache/relcache.c) to allow us to do indexed scans on
 * the system catalogs before we'd normally be able to.  This happens
 * before the lock table is fully initialized, so we can't use it.
 * Strictly speaking, this violates 2pl, but we don't do 2pl on the
 * system catalogs anyway.
 *
 * Note that our page locks are actual lockmanager locks, not buffer
 * locks (as are used by btree, for example).  This is a good idea because
 * the algorithms are not deadlock-free, and we'd better be able to detect
 * and recover from deadlocks.
 *
 * Another important difference from btree is that a hash indexscan
 * retains both a lock and a buffer pin on the current index page
 * between hashgettuple() calls (btree keeps only a buffer pin).
 * Because of this, it's safe to do item deletions with only a regular
 * write lock on a hash page --- there cannot be an indexscan stopped on
 * the page being deleted, other than an indexscan of our own backend,
 * which will be taken care of by _hash_adjscans.
 */
#define USELOCKING      (!BuildingHash && !IsInitProcessingMode())


/*
 * _hash_getlock() -- Acquire an lmgr lock.
 *
 * 'whichlock' should be zero to acquire the split-control lock, or the
 * block number of a bucket's primary bucket page to acquire the per-bucket
 * lock.  (See README for details of the use of these locks.)
 *
 * 'access' must be HASH_SHARE or HASH_EXCLUSIVE.
 */
void
_hash_getlock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        LockPage(rel, whichlock, access);
}
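
/*
 * A usage sketch (mirroring _hash_expandtable below): a split first takes
 * the split-control lock with _hash_getlock(rel, 0, HASH_EXCLUSIVE), then
 * tries the per-bucket lock on the old bucket's primary page with
 * _hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE), and releases both
 * via _hash_droplock() once the split is done or abandoned.
 */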
/*
 * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free.
 *
 * Same as above except we return FALSE without blocking if lock isn't free.
 */
bool
_hash_try_getlock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        return ConditionalLockPage(rel, whichlock, access);
    else
        return true;
}
static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access);
static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access);
static void _hash_splitbucket(Relation rel, Buffer metabuf,
                  Bucket obucket, Bucket nbucket);
/*
 * _hash_droplock() -- Release an lmgr lock.
 */
void
_hash_droplock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        UnlockPage(rel, whichlock, access);
}
/*
 * _hash_getbuf() -- Get a buffer by block number for read or write.
 *
 * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
 *
 * When this routine returns, the appropriate lock is set on the
 * requested buffer and its reference count has been incremented
 * (ie, the buffer is "locked and pinned").
 *
 * XXX P_NEW is not used because, unlike the tree structures, we
 * need the bucket blocks to be at certain block numbers.  we must
 * depend on the caller to call _hash_pageinit on the block if it
 * knows that this is a new block.
 */
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno == P_NEW)
        elog(ERROR, "hash AM does not use P_NEW");

    buf = ReadBuffer(rel, blkno);

    if (access != HASH_NOLOCK)
        LockBuffer(buf, access);

    /* ref count and lock type are correct */
    return buf;
}
/*
 * _hash_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.  Note that either read or
 * write lock can be dropped this way, but if we modified the buffer,
 * this is NOT the right way to release a write lock.
 */
void
_hash_relbuf(Relation rel, Buffer buf)
{
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);
}
/*
 * _hash_dropbuf() -- release an unlocked buffer.
 *
 * This is used to unpin a buffer on which we hold no lock.  It is assumed
 * that the buffer is not dirty.
 */
void
_hash_dropbuf(Relation rel, Buffer buf)
{
    ReleaseBuffer(buf);
}
/*
 * _hash_wrtbuf() -- write a hash page to disk.
 *
 * This routine releases the lock held on the buffer and our refcount
 * for it.  It is an error to call _hash_wrtbuf() without a write lock
 * and a pin on the buffer.
 *
 * NOTE: actually, the buffer manager just marks the shared buffer page
 * dirty here; the real I/O happens later.  This is okay since we are not
 * relying on write ordering anyway.  The WAL mechanism is responsible for
 * guaranteeing correctness after a crash.
 */
void
_hash_wrtbuf(Relation rel, Buffer buf)
{
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    WriteBuffer(buf);
}
/*
 * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release
 *                        our reference or lock.
 *
 * It is an error to call _hash_wrtnorelbuf() without a write lock
 * and a pin on the buffer.
 *
 * See above NOTE.
 */
void
_hash_wrtnorelbuf(Relation rel, Buffer buf)
{
    WriteNoReleaseBuffer(buf);
}
/*
 * _hash_chgbufaccess() -- Change the lock type on a buffer, without
 *                         dropping our pin on it.
 *
 * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK,
 * the last indicating that no buffer-level lock is held or wanted.
 *
 * When from_access == HASH_WRITE, we assume the buffer is dirty and tell
 * bufmgr it must be written out.  If the caller wants to release a write
 * lock on a page that's not been modified, it's okay to pass from_access
 * as HASH_READ (a bit ugly, but handy in some places).
 */
void
_hash_chgbufaccess(Relation rel,
                   Buffer buf,
                   int from_access,
                   int to_access)
{
    if (from_access != HASH_NOLOCK)
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    if (from_access == HASH_WRITE)
        WriteNoReleaseBuffer(buf);
    if (to_access != HASH_NOLOCK)
        LockBuffer(buf, to_access);
}
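
/*
 * For example, a caller that has dirtied a page it holds HASH_WRITE on,
 * and wants to keep only the pin, can do
 *      _hash_chgbufaccess(rel, buf, HASH_WRITE, HASH_NOLOCK);
 * as _hash_expandtable does below when it releases the metapage lock
 * after updating it.
 */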
/*
 * _hash_metapinit() -- Initialize the metadata page of a hash index,
 *              the two buckets that we begin with and the initial
 *              bitmap page.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
void
_hash_metapinit(Relation rel)
@@ -83,16 +230,31 @@ _hash_metapinit(Relation rel)
    Buffer      metabuf;
    Buffer      buf;
    Page        pg;
    int32       data_width;
    int32       item_width;
    int32       ffactor;
    uint16      i;

    /* can't be sharing this with anyone, now... */
    if (USELOCKING)
        LockRelation(rel, AccessExclusiveLock);

    /* safety check */
    if (RelationGetNumberOfBlocks(rel) != 0)
        elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
             RelationGetRelationName(rel));
    /*
     * Determine the target fill factor (tuples per bucket) for this index.
     * The idea is to make the fill factor correspond to pages about 3/4ths
     * full.  We can compute it exactly if the index datatype is fixed-width,
     * but for var-width there's some guessing involved.
     */
    data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
                                 RelationGetDescr(rel)->attrs[0]->atttypmod);
    item_width = MAXALIGN(sizeof(HashItemData)) + MAXALIGN(data_width) +
        sizeof(ItemIdData);     /* include the line pointer */
    ffactor = (BLCKSZ * 3 / 4) / item_width;
    /* keep to a sane range */
    if (ffactor < 10)
        ffactor = 10;
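
    /*
     * Illustrative arithmetic (assumed values, not computed by this code):
     * with BLCKSZ = 8192, a fixed-width 4-byte key, MAXALIGN of 4, and
     * sizeof(HashItemData) = 8, item_width = 8 + 4 + 4 = 16, so
     * ffactor = (8192 * 3 / 4) / 16 = 384 tuples per bucket.
     */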
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
    pg = BufferGetPage(metabuf);
    _hash_pageinit(pg, BufferGetPageSize(metabuf));
@@ -110,7 +272,7 @@ _hash_metapinit(Relation rel)
    metap->hashm_version = HASH_VERSION;
    metap->hashm_ntuples = 0;
    metap->hashm_nmaps = 0;
    metap->hashm_ffactor = DEFAULT_FFACTOR;
    metap->hashm_ffactor = ffactor;
    metap->hashm_bsize = BufferGetPageSize(metabuf);

    /* find largest bitmap array size that will fit in page size */
    for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
@@ -142,7 +304,7 @@ _hash_metapinit(Relation rel)
    metap->hashm_firstfree = 0;

    /*
     * initialize the first two buckets
     * Initialize the first two buckets
     */
    for (i = 0; i <= 1; i++)
    {
@@ -159,135 +321,17 @@ _hash_metapinit(Relation rel)
    }

    /*
     * Initialize bitmap page.  Can't do this until we
     * Initialize first bitmap page.  Can't do this until we
     * create the first two buckets, else smgr will complain.
     */
    _hash_initbitmap(rel, metap, 3);

    /* all done */
    _hash_wrtbuf(rel, metabuf);

    if (USELOCKING)
        UnlockRelation(rel, AccessExclusiveLock);
}
/*
 * _hash_getbuf() -- Get a buffer by block number for read or write.
 *
 * When this routine returns, the appropriate lock is set on the
 * requested buffer and its reference count is correct.
 *
 * XXX P_NEW is not used because, unlike the tree structures, we
 * need the bucket blocks to be at certain block numbers.  we must
 * depend on the caller to call _hash_pageinit on the block if it
 * knows that this is a new block.
 */
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno == P_NEW)
        elog(ERROR, "hash AM does not use P_NEW");

    switch (access)
    {
        case HASH_WRITE:
        case HASH_READ:
            _hash_setpagelock(rel, blkno, access);
            break;
        default:
            elog(ERROR, "unrecognized hash access code: %d", access);
            break;
    }
    buf = ReadBuffer(rel, blkno);

    /* ref count and lock type are correct */
    return buf;
}
/*
 * _hash_relbuf() -- release a locked buffer.
 */
void
_hash_relbuf(Relation rel, Buffer buf, int access)
{
    BlockNumber blkno;

    blkno = BufferGetBlockNumber(buf);

    switch (access)
    {
        case HASH_WRITE:
        case HASH_READ:
            _hash_unsetpagelock(rel, blkno, access);
            break;
        default:
            elog(ERROR, "unrecognized hash access code: %d", access);
            break;
    }

    ReleaseBuffer(buf);
}
/*
 * _hash_wrtbuf() -- write a hash page to disk.
 *
 * This routine releases the lock held on the buffer and our reference
 * to it.  It is an error to call _hash_wrtbuf() without a write lock
 * or a reference to the buffer.
 */
void
_hash_wrtbuf(Relation rel, Buffer buf)
{
    BlockNumber blkno;

    blkno = BufferGetBlockNumber(buf);
    WriteBuffer(buf);
    _hash_unsetpagelock(rel, blkno, HASH_WRITE);
}
/*
 * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release
 *                        our reference or lock.
 *
 * It is an error to call _hash_wrtnorelbuf() without a write lock
 * or a reference to the buffer.
 */
void
_hash_wrtnorelbuf(Buffer buf)
{
    BlockNumber blkno;

    blkno = BufferGetBlockNumber(buf);
    WriteNoReleaseBuffer(buf);
}
/*
 * _hash_chgbufaccess() -- Change from read to write access or vice versa.
 *
 * When changing from write to read, we assume the buffer is dirty and tell
 * bufmgr it must be written out.
 */
void
_hash_chgbufaccess(Relation rel,
                   Buffer buf,
                   int from_access,
                   int to_access)
{
    BlockNumber blkno;

    blkno = BufferGetBlockNumber(buf);

    if (from_access == HASH_WRITE)
        _hash_wrtnorelbuf(buf);
    _hash_unsetpagelock(rel, blkno, from_access);
    _hash_setpagelock(rel, blkno, to_access);
}
/*
 * _hash_pageinit() -- Initialize a new page.
 * _hash_pageinit() -- Initialize a new hash index page.
 */
void
_hash_pageinit(Page page, Size size)
@@ -297,57 +341,14 @@ _hash_pageinit(Page page, Size size)
}
/*
 * _hash_setpagelock() -- Acquire the requested type of lock on a page.
 */
static void
_hash_setpagelock(Relation rel,
                  BlockNumber blkno,
                  int access)
{
    if (USELOCKING)
    {
        switch (access)
        {
            case HASH_WRITE:
                LockPage(rel, blkno, ExclusiveLock);
                break;
            case HASH_READ:
                LockPage(rel, blkno, ShareLock);
                break;
            default:
                elog(ERROR, "unrecognized hash access code: %d", access);
                break;
        }
    }
}
/*
 * _hash_unsetpagelock() -- Release the specified type of lock on a page.
 */
static void
_hash_unsetpagelock(Relation rel,
                    BlockNumber blkno,
                    int access)
{
    if (USELOCKING)
    {
        switch (access)
        {
            case HASH_WRITE:
                UnlockPage(rel, blkno, ExclusiveLock);
                break;
            case HASH_READ:
                UnlockPage(rel, blkno, ShareLock);
                break;
            default:
                elog(ERROR, "unrecognized hash access code: %d", access);
                break;
        }
    }
}
/*
 * Expand the hash table by creating one new bucket.
 * Attempt to expand the hash table by creating one new bucket.
 *
 * This will silently do nothing if it cannot get the needed locks.
 *
 * The caller should hold no locks on the hash index.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.
 */
void
_hash_expandtable(Relation rel, Buffer metabuf)
@@ -356,15 +357,72 @@ _hash_expandtable(Relation rel, Buffer metabuf)
    Bucket      old_bucket;
    Bucket      new_bucket;
    uint32      spare_ndx;
    BlockNumber start_oblkno;
    BlockNumber start_nblkno;
    uint32      maxbucket;
    uint32      highmask;
    uint32      lowmask;
    /*
     * Obtain the page-zero lock to assert the right to begin a split
     * (see README).
     *
     * Note: deadlock should be impossible here.  Our own backend could only
     * be holding bucket sharelocks due to stopped indexscans; those will not
     * block other holders of the page-zero lock, who are only interested in
     * acquiring bucket sharelocks themselves.  Exclusive bucket locks are
     * only taken here and in hashbulkdelete, and neither of these operations
     * needs any additional locks to complete.  (If, due to some flaw in this
     * reasoning, we manage to deadlock anyway, it's okay to error out; the
     * index will be left in a consistent state.)
     */
    _hash_getlock(rel, 0, HASH_EXCLUSIVE);

    /* Write-lock the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    metap = (HashMetaPage) BufferGetPage(metabuf);
    _hash_checkpage(rel, (Page) metap, LH_META_PAGE);
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_WRITE);
    /*
     * Check to see if split is still needed; someone else might have already
     * done one while we waited for the lock.
     *
     * Make sure this stays in sync with _hash_doinsert()
     */
    if (metap->hashm_ntuples <=
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
        goto fail;
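
    /*
     * For example (illustrative numbers): with hashm_ffactor = 300 and
     * hashm_maxbucket = 3 (four buckets), the split proceeds only once
     * hashm_ntuples exceeds 300 * 4 = 1200.
     */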
    new_bucket = ++metap->hashm_maxbucket;
    /*
     * Determine which bucket is to be split, and attempt to lock the old
     * bucket.  If we can't get the lock, give up.
     *
     * The lock protects us against other backends, but not against our own
     * backend.  Must check for active scans separately.
     *
     * Ideally we would lock the new bucket too before proceeding, but if
     * we are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping
     * isn't correct yet.  For simplicity we update the metapage first and
     * then lock.  This should be okay because no one else should be trying
     * to lock the new bucket yet...
     */
    new_bucket = metap->hashm_maxbucket + 1;
    old_bucket = (new_bucket & metap->hashm_lowmask);
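
    /*
     * For example (illustrative numbers): with 12 buckets so far
     * (hashm_maxbucket = 11, hashm_lowmask = 7), new_bucket is 12 and
     * old_bucket is 12 & 7 = 4, so bucket 4 is the one whose tuples get
     * redistributed into the new bucket 12.
     */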
    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);

    if (_hash_has_active_scan(rel, old_bucket))
        goto fail;

    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
        goto fail;
    /*
     * Okay to proceed with split.  Update the metapage bucket mapping info.
     */
    metap->hashm_maxbucket = new_bucket;

    if (new_bucket > metap->hashm_highmask)
    {
        /* Starting a new doubling */
@@ -379,7 +437,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     * this new batch of bucket pages.
     *
     * XXX should initialize new bucket pages to prevent out-of-order
     * page creation.
     * page creation?  Don't wanna do it right here though.
     */
    spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
@@ -389,10 +447,50 @@ _hash_expandtable(Relation rel, Buffer metabuf)
        metap->hashm_ovflpoint = spare_ndx;
    }

    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_READ);

    /* now we can compute the new bucket's primary block number */
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);

    Assert(!_hash_has_active_scan(rel, new_bucket));

    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
        elog(PANIC, "could not get lock on supposedly new bucket");
    /*
     * Copy bucket mapping info now; this saves re-accessing the meta page
     * inside _hash_splitbucket's inner loop.  Note that once we drop the
     * split lock, other splits could begin, so these values might be out of
     * date before _hash_splitbucket finishes.  That's okay, since all it
     * needs is to tell which of these two buckets to map hashkeys into.
     */
    maxbucket = metap->hashm_maxbucket;
    highmask = metap->hashm_highmask;
    lowmask = metap->hashm_lowmask;

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Release split lock; okay for other splits to occur now */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);
    /* Relocate records to the new bucket */
    _hash_splitbucket(rel, metabuf, old_bucket, new_bucket);
    _hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
                      start_oblkno, start_nblkno,
                      maxbucket, highmask, lowmask);

    /* Release bucket locks, allowing others to access them */
    _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
    _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);

    return;
    /* Here if decide not to split or fail to acquire old bucket lock */
fail:

    /* We didn't write the metapage, so just drop lock */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /* Release split lock */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);
}
@@ -403,27 +501,35 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket)
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    Bucket      bucket;
    Buffer      obuf;
    Buffer      nbuf;
    Buffer      ovflbuf;
    BlockNumber oblkno;
    BlockNumber nblkno;
    BlockNumber start_oblkno;
    BlockNumber start_nblkno;
    bool        null;
    Datum       datum;
    HashItem    hitem;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;
    HashMetaPage metap;
    IndexTuple  itup;
    Size        itemsz;
    OffsetNumber ooffnum;
@@ -433,12 +539,11 @@ _hash_splitbucket(Relation rel,
    Page        npage;
    TupleDesc   itupdesc = RelationGetDescr(rel);

    metap = (HashMetaPage) BufferGetPage(metabuf);
    _hash_checkpage(rel, (Page) metap, LH_META_PAGE);

    /* get the buffers & pages */
    start_oblkno = BUCKET_TO_BLKNO(metap, obucket);
    start_nblkno = BUCKET_TO_BLKNO(metap, nbucket);

    /*
     * It should be okay to simultaneously write-lock pages from each
     * bucket, since no one else can be trying to acquire buffer lock
     * on pages of either bucket.
     */
    oblkno = start_oblkno;
    nblkno = start_nblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
@@ -446,7 +551,10 @@ _hash_splitbucket(Relation rel,
    opage = BufferGetPage(obuf);
    npage = BufferGetPage(nbuf);

    /* initialize the new bucket page */
    _hash_checkpage(rel, opage, LH_BUCKET_PAGE);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    /* initialize the new bucket's primary page */
    _hash_pageinit(npage, BufferGetPageSize(nbuf));
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
@@ -454,44 +562,11 @@ _hash_splitbucket(Relation rel,
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_filler = HASHO_FILL;
    _hash_wrtnorelbuf(nbuf);

    /*
     * make sure the old bucket isn't empty.  advance 'opage' and friends
     * through the overflow bucket chain until we find a non-empty page.
     *
     * XXX we should only need this once, if we are careful to preserve the
     * invariant that overflow pages are never empty.
     */
    _hash_checkpage(rel, opage, LH_BUCKET_PAGE);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    if (PageIsEmpty(opage))
    {
        oblkno = oopaque->hasho_nextblkno;
        _hash_relbuf(rel, obuf, HASH_WRITE);
        if (!BlockNumberIsValid(oblkno))
        {
            /*
             * the old bucket is completely empty; of course, the new
             * bucket will be as well, but since it's a base bucket page
             * we don't care.
             */
            _hash_relbuf(rel, nbuf, HASH_WRITE);
            return;
        }
        obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
        opage = BufferGetPage(obuf);
        _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
        if (PageIsEmpty(opage))
            elog(ERROR, "empty hash overflow page %u", oblkno);
        oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    }
    /*
     * we are now guaranteed that 'opage' is not empty.  partition the
     * tuples in the old bucket between the old bucket and the new bucket,
     * advancing along their respective overflow bucket chains and adding
     * overflow pages as needed.
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain
     * and adding overflow pages to the new bucket as needed.
     */
    ooffnum = FirstOffsetNumber;
    omaxoffnum = PageGetMaxOffsetNumber(opage);
@@ -505,48 +580,39 @@ _hash_splitbucket(Relation rel,
        /* check if we're at the end of the page */
        if (ooffnum > omaxoffnum)
        {
            /* at end of page, but check for overflow page */
            /* at end of page, but check for an(other) overflow page */
            oblkno = oopaque->hasho_nextblkno;
            if (BlockNumberIsValid(oblkno))
            {
                /*
                 * we ran out of tuples on this particular page, but we
                 * have more overflow pages; re-init values.
                 */
                _hash_wrtbuf(rel, obuf);
                obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
                opage = BufferGetPage(obuf);
                _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
                oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
                /* we're guaranteed that an ovfl page has at least 1 tuple */
                if (PageIsEmpty(opage))
                    elog(ERROR, "empty hash overflow page %u", oblkno);
                ooffnum = FirstOffsetNumber;
                omaxoffnum = PageGetMaxOffsetNumber(opage);
            }
            else
            {
                /*
                 * We're at the end of the bucket chain, so now we're
                 * really done with everything.  Before quitting, call
                 * _hash_squeezebucket to ensure the tuples remaining in the
                 * old bucket (including the overflow pages) are packed as
                 * tightly as possible.  The new bucket is already tight.
                 */
                _hash_wrtbuf(rel, obuf);
                _hash_wrtbuf(rel, nbuf);
                _hash_squeezebucket(rel, obucket, start_oblkno);
                return;
            }
            if (!BlockNumberIsValid(oblkno))
                break;

            /*
             * we ran out of tuples on this particular page, but we
             * have more overflow pages; advance to next page.
             */
            _hash_wrtbuf(rel, obuf);

            obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
            opage = BufferGetPage(obuf);
            _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
            oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
            ooffnum = FirstOffsetNumber;
            omaxoffnum = PageGetMaxOffsetNumber(opage);
            continue;
        }
        /* hash on the tuple */
        /*
         * Re-hash the tuple to determine which bucket it now belongs in.
         *
         * It is annoying to call the hash function while holding locks,
         * but releasing and relocking the page for each tuple is unappealing
         * too.
         */
        hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
        itup = &(hitem->hash_itup);
        datum = index_getattr(itup, 1, itupdesc, &null);
        Assert(!null);

        bucket = _hash_call(rel, metap, datum);
        bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
                                      maxbucket, highmask, lowmask);
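
        /*
         * Illustrative mapping, assuming the usual linear-hashing fold
         * (mask with highmask, then with lowmask if the result exceeds
         * maxbucket): with maxbucket = 12, highmask = 15 and lowmask = 7,
         * a hash value of 13 gives 13 & 15 = 13 > 12, so it folds down to
         * 13 & 7 = 5; a hash value of 12 maps directly to bucket 12.
         */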
        if (bucket == nbucket)
        {
@@ -562,11 +628,13 @@ _hash_splitbucket(Relation rel,
            if (PageGetFreeSpace(npage) < itemsz)
            {
                ovflbuf = _hash_addovflpage(rel, metabuf, nbuf);
                _hash_wrtbuf(rel, nbuf);
                nbuf = ovflbuf;
                /* write out nbuf and drop lock, but keep pin */
                _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                /* chain to a new overflow page */
                nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                npage = BufferGetPage(nbuf);
                _hash_checkpage(rel, npage, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
                _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE);
                /* we don't need nopaque within the loop */
            }

            noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
@@ -574,7 +642,6 @@ _hash_splitbucket(Relation rel,
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
                     RelationGetRelationName(rel));
            _hash_wrtnorelbuf(nbuf);

            /*
             * now delete the tuple from the old bucket.  after this
@@ -586,40 +653,7 @@ _hash_splitbucket(Relation rel,
             * instead of calling PageGetMaxOffsetNumber.
             */
            PageIndexTupleDelete(opage, ooffnum);
            _hash_wrtnorelbuf(obuf);
            omaxoffnum = OffsetNumberPrev(omaxoffnum);
            /*
             * tidy up.  if the old page was an overflow page and it is
             * now empty, we must free it (we want to preserve the
             * invariant that overflow pages cannot be empty).
             */
            if (PageIsEmpty(opage) &&
                (oopaque->hasho_flag & LH_OVERFLOW_PAGE))
            {
                oblkno = _hash_freeovflpage(rel, obuf);

                /* check that we're not through the bucket chain */
                if (!BlockNumberIsValid(oblkno))
                {
                    _hash_wrtbuf(rel, nbuf);
                    _hash_squeezebucket(rel, obucket, start_oblkno);
                    return;
                }

                /*
                 * re-init.  again, we're guaranteed that an ovfl page has
                 * at least one tuple.
                 */
                obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
                opage = BufferGetPage(obuf);
                _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
                oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
                if (PageIsEmpty(opage))
                    elog(ERROR, "empty hash overflow page %u", oblkno);
                ooffnum = FirstOffsetNumber;
                omaxoffnum = PageGetMaxOffsetNumber(opage);
            }
        }
        else
        {
@@ -632,5 +666,15 @@ _hash_splitbucket(Relation rel,
            ooffnum = OffsetNumberNext(ooffnum);
        }
    }

    /* NOTREACHED */

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, obuf);
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno);
}