@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.60 2006/10/04 00:29:48 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.61 2006/11/19 21:33:23 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages.  The opaque
@@ -32,9 +32,11 @@
#include "access/hash.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/lsyscache.h"
static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
static void _hash_splitbucket(Relation rel, Buffer metabuf,
                  Bucket obucket, Bucket nbucket,
                  BlockNumber start_oblkno,
@@ -102,21 +104,18 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
* requested buffer and its reference count has been incremented
* (ie, the buffer is "locked and pinned").
*
* XXX P_NEW is not used because, unlike the tree structures, we
* need the bucket blocks to be at certain block numbers.
* blkno == P_NEW is allowed, but it is caller's responsibility to
* ensure that only one process can extend the index at a time.
*
* All call sites should call either _hash_pageinit or _hash_checkpage
* All call sites should call either _hash_checkpage or _hash_pageinit
* on the returned page, depending on whether the block is expected
* to be new or not.
* to be valid or not.
*/
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;
    if (blkno == P_NEW)
        elog(ERROR, "hash AM does not use P_NEW");
    buf = ReadBuffer(rel, blkno);
    if (access != HASH_NOLOCK)
@@ -237,7 +236,14 @@ _hash_metapinit(Relation rel)
    if (ffactor < 10)
        ffactor = 10;
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
    /*
     * We initialize the metapage, the first two bucket pages, and the
     * first bitmap page in sequence, using P_NEW to cause smgrextend()
     * calls to occur.  This ensures that the smgr level has the right
     * idea of the physical index length.
     */
    metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
    Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
    pg = BufferGetPage(metabuf);
    _hash_pageinit(pg, BufferGetPageSize(metabuf));
@@ -290,7 +296,8 @@ _hash_metapinit(Relation rel)
     */
    for (i = 0; i <= 1; i++)
    {
        buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
        buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
        Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
        pg = BufferGetPage(buf);
        _hash_pageinit(pg, BufferGetPageSize(buf));
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -303,8 +310,7 @@ _hash_metapinit(Relation rel)
    }
    /*
     * Initialize first bitmap page.  Can't do this until we create the first
     * two buckets, else smgr will complain.
     * Initialize first bitmap page
     */
    _hash_initbitmap(rel, metap, 3);
@@ -339,6 +345,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
    Bucket      old_bucket;
    Bucket      new_bucket;
    uint32      spare_ndx;
    BlockNumber firstblock = InvalidBlockNumber;
    BlockNumber start_oblkno;
    BlockNumber start_nblkno;
    uint32      maxbucket;
@@ -376,6 +383,40 @@ _hash_expandtable(Relation rel, Buffer metabuf)
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
        goto fail;
    /*
     * Can't split anymore if maxbucket has reached its maximum possible value.
     *
     * Ideally we'd allow bucket numbers up to UINT_MAX - 1 (no higher because
     * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     * to half that because of overflow looping in _hash_log2() and
     * insufficient space in hashm_spares[].  It's moot anyway because an
     * index with 2^32 buckets would certainly overflow BlockNumber and
     * hence _hash_alloc_buckets() would fail, but if we supported buckets
     * smaller than a disk block then this would be an independent constraint.
     */
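    /*
     * For illustration: 0x7FFFFFFE is 2^31 - 2, i.e. roughly half of
     * UINT_MAX, matching the restriction described above.
     */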
    if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
        goto fail;
    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to allocate a new batch of bucket pages.
     */
    new_bucket = metap->hashm_maxbucket + 1;
    spare_ndx = _hash_log2(new_bucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
        /*
         * The number of buckets in the new splitpoint is equal to the
         * total number already in existence, i.e. new_bucket.  Currently
         * this maps one-to-one to blocks required, but someday we may need
         * a more complicated calculation here.
         */
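        /*
         * For example (illustrative numbers): with hashm_maxbucket = 3,
         * buckets 0..3 exist, new_bucket is 4, and _hash_log2(5) is 3; if
         * hashm_ovflpoint is still 2 we are starting splitpoint 3, which
         * holds buckets 4..7, so new_bucket (4) bucket pages get allocated.
         */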
        firstblock = _hash_alloc_buckets(rel, new_bucket);
        if (firstblock == InvalidBlockNumber)
            goto fail;          /* can't split due to BlockNumber overflow */
    }
    /*
     * Determine which bucket is to be split, and attempt to lock the old
     * bucket.  If we can't get the lock, give up.
@@ -389,7 +430,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     * lock.  This should be okay because no one else should be trying to lock
     * the new bucket yet...
     */
    new_bucket = metap->hashm_maxbucket + 1;
    old_bucket = (new_bucket & metap->hashm_lowmask);
    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
@@ -425,14 +465,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     * increases), we need to adjust the hashm_spares[] array and
     * hashm_ovflpoint so that future overflow pages will be created beyond
     * this new batch of bucket pages.
     *
     * XXX should initialize new bucket pages to prevent out-of-order page
     * creation?  Don't wanna do it right here though.
     */
    spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
        metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
        metap->hashm_ovflpoint = spare_ndx;
    }
@@ -440,6 +475,12 @@ _hash_expandtable(Relation rel, Buffer metabuf)
    /* now we can compute the new bucket's primary block number */
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
    /* if we added a splitpoint, should match result of _hash_alloc_buckets */
    if (firstblock != InvalidBlockNumber &&
        firstblock != start_nblkno)
        elog(PANIC, "unexpected hash relation size: %u, should be %u",
             firstblock, start_nblkno);
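    /*
     * Informally, why these should match: before the extension the index
     * consisted of the metapage, one page per pre-existing bucket (new_bucket
     * of them), and the overflow/bitmap pages already counted in
     * hashm_spares, so the relation length _hash_alloc_buckets() observed is
     * exactly the block number that BUCKET_TO_BLKNO computes for new_bucket.
     */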
    Assert(!_hash_has_active_scan(rel, new_bucket));
    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
@@ -487,6 +528,79 @@ fail:
}
/*
* _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
*
* This does not need to initialize the new bucket pages; we'll do that as
* each one is used by _hash_expandtable().  But we have to extend the logical
* EOF to the end of the splitpoint; otherwise the first overflow page
* allocated beyond the splitpoint will represent a noncontiguous access,
* which can confuse md.c (and will probably be forbidden by future changes
* to md.c).
*
* We do this by writing a page of zeroes at the end of the splitpoint range.
* We expect that the filesystem will ensure that the intervening pages read
* as zeroes too.  On many filesystems this "hole" will not be allocated
* immediately, which means that the index file may end up more fragmented
* than if we forced it all to be allocated now; but since we don't scan
* hash indexes sequentially anyway, that probably doesn't matter.
*
* XXX It's annoying that this code is executed with the metapage lock held.
* We need to interlock against _hash_getovflpage() adding a new overflow page
* concurrently, but it'd likely be better to use LockRelationForExtension
* for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
* so it may not be worth worrying about.
*
* Returns the first block number in the new splitpoint's range, or
* InvalidBlockNumber if allocation failed due to BlockNumber overflow.
*/
static BlockNumber
_hash_alloc_buckets(Relation rel, uint32 nblocks)
{
    BlockNumber firstblock;
    BlockNumber lastblock;
    BlockNumber endblock;
    char        zerobuf[BLCKSZ];
    /*
     * Since we hold metapage lock, no one else is either splitting or
     * allocating a new page in _hash_getovflpage(); hence it's safe to
     * assume that the relation length isn't changing under us.
     */
    firstblock = RelationGetNumberOfBlocks(rel);
    lastblock = firstblock + nblocks - 1;
    /*
     * Check for overflow in block number calculation; if so, we cannot
     * extend the index anymore.
     */
    if (lastblock < firstblock || lastblock == InvalidBlockNumber)
        return InvalidBlockNumber;
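    /*
     * For example (illustrative numbers): if firstblock were 0xFFFFFF00 and
     * nblocks were 0x200, lastblock would wrap around to a small value and
     * the first test above would fire; the second test catches lastblock
     * landing exactly on InvalidBlockNumber (0xFFFFFFFF).
     */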
    /* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
    MemSet(zerobuf, 0, sizeof(zerobuf));
    /*
     * XXX If the extension results in creation of new segment files, we have
     * to make sure that each non-last file is correctly filled out to
     * RELSEG_SIZE blocks.  This ought to be done inside mdextend, but
     * changing the smgr API seems best left for development cycle not late
     * beta.  Temporary fix for bug #2737.
     */
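    /*
     * For example (illustrative numbers, assuming the default RELSEG_SIZE of
     * 131072 blocks, i.e. 1GB segments with 8K blocks): if firstblock is
     * 131000 and nblocks is 1024, lastblock is 132023; the loop below writes
     * a zero page at block 131071, the last block of segment 0, and the
     * final smgrextend() then writes block 132023 in segment 1.
     */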
#ifndef LET_OS_MANAGE_FILESIZE
    for (endblock = firstblock | (RELSEG_SIZE - 1);
         endblock < lastblock;
         endblock += RELSEG_SIZE)
        smgrextend(rel->rd_smgr, endblock, zerobuf, rel->rd_istemp);
#endif
    smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
    return firstblock;
}
/*
* _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
*