@@ -188,6 +188,37 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
                                             pscan, flags);
 }
 
+TableScanDesc
+table_beginscan_parallel_tidrange(Relation relation,
+                                  ParallelTableScanDesc pscan)
+{
+    Snapshot    snapshot;
+    uint32      flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE;
+    TableScanDesc sscan;
+
+    Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
+
+    /* disable syncscan in parallel tid range scan. */
+    pscan->phs_syncscan = false;
+
+    if (!pscan->phs_snapshot_any)
+    {
+        /* Snapshot was serialized -- restore it */
+        snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
+        RegisterSnapshot(snapshot);
+        flags |= SO_TEMP_SNAPSHOT;
+    }
+    else
+    {
+        /* SnapshotAny passed by caller (not serialized) */
+        snapshot = SnapshotAny;
+    }
+
+    sscan = relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
+                                             pscan, flags);
+
+    return sscan;
+}
+
 /* ----------------------------------------------------------------------------
  * Index scan related functions.
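The new table_beginscan_parallel_tidrange above only attaches a worker to the shared scan state; the TID range itself is applied through the existing tidrange API. A minimal worker-side sketch, assuming the mintid/maxtid bounds arrive from the plan node (the helper function below is hypothetical; the table_* calls are the existing tableam.h wrappers):

    /* Hypothetical sketch: a worker joins a shared parallel TID range scan. */
    static void
    worker_tidrange_scan(Relation rel, ParallelTableScanDesc pscan,
                         ItemPointer mintid, ItemPointer maxtid)
    {
        TableScanDesc scan = table_beginscan_parallel_tidrange(rel, pscan);
        TupleTableSlot *slot = table_slot_create(rel, NULL);

        /* Restrict this worker's scan to the requested TID range. */
        table_rescan_tidrange(scan, mintid, maxtid);

        while (table_scan_getnextslot_tidrange(scan, ForwardScanDirection, slot))
        {
            /* ... process the tuple in slot ... */
        }

        ExecDropSingleTupleTableSlot(slot);
        table_endscan(scan);
    }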
@@ -398,6 +429,7 @@ table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
         bpscan->phs_nblocks > NBuffers / 4;
     SpinLockInit(&bpscan->phs_mutex);
     bpscan->phs_startblock = InvalidBlockNumber;
+    bpscan->phs_numblock = InvalidBlockNumber;
     pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
 
     return sizeof(ParallelBlockTableScanDescData);
@@ -416,14 +448,22 @@ table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
  *
  * Determine where the parallel seq scan should start.  This function may be
  * called many times, once by each parallel worker.  We must be careful only
- * to set the startblock once.
+ * to set the phs_startblock and phs_numblock fields once.
+ *
+ * Callers may optionally specify a non-InvalidBlockNumber value for
+ * 'startblock' to force the scan to start at the given page.  Likewise,
+ * 'numblocks' can be specified as a non-InvalidBlockNumber to limit the
+ * number of blocks to scan to that many blocks.
  */
 void
 table_block_parallelscan_startblock_init(Relation rel,
                                          ParallelBlockTableScanWorker pbscanwork,
-                                         ParallelBlockTableScanDesc pbscan)
+                                         ParallelBlockTableScanDesc pbscan,
+                                         BlockNumber startblock,
+                                         BlockNumber numblocks)
 {
     BlockNumber sync_startpage = InvalidBlockNumber;
+    BlockNumber scan_nblocks;
 
     /* Reset the state we use for controlling allocation size. */
     memset(pbscanwork, 0, sizeof(*pbscanwork));
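With the extended signature, existing parallel seq scan callers keep their old behavior by passing InvalidBlockNumber for both new arguments, while a TID range caller can pin the scan to a block subrange. A quick sketch (the block numbers are made up):

    /* Plain parallel seq scan: behavior unchanged. */
    table_block_parallelscan_startblock_init(rel, pbscanwork, pbscan,
                                             InvalidBlockNumber,
                                             InvalidBlockNumber);

    /* Hypothetical TID range caller: scan 64 blocks starting at block 128. */
    table_block_parallelscan_startblock_init(rel, pbscanwork, pbscan,
                                             128, 64);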
@@ -431,42 +471,36 @@ table_block_parallelscan_startblock_init(Relation rel,
     StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
                      "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
 
-    /*
-     * We determine the chunk size based on the size of the relation.  First we
-     * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks but we then
-     * take the next highest power of 2 number of the chunk size.  This means
-     * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
-     * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
-     */
-    pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
-                                                       PARALLEL_SEQSCAN_NCHUNKS, 1));
-
-    /*
-     * Ensure we don't go over the maximum chunk size with larger tables.  This
-     * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
-     * tables.  Too large a chunk size has been shown to be detrimental to
-     * synchronous scan performance.
-     */
-    pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
-                                      PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
-
 retry:
     /* Grab the spinlock. */
     SpinLockAcquire(&pbscan->phs_mutex);
 
     /*
-     * If the scan's startblock has not yet been initialized, we must do so
-     * now.  If this is not a synchronized scan, we just start at block 0, but
-     * if it is a synchronized scan, we must get the starting position from
-     * the synchronized scan machinery.  We can't hold the spinlock while
-     * doing that, though, so release the spinlock, get the information we
-     * need, and retry.  If nobody else has initialized the scan in the
-     * meantime, we'll fill in the value we fetched on the second time
-     * through.
+     * When the caller specified a limit on the number of blocks to scan, set
+     * that in the ParallelBlockTableScanDesc, if it's not been done by
+     * another worker already.
+     */
+    if (numblocks != InvalidBlockNumber &&
+        pbscan->phs_numblock == InvalidBlockNumber)
+    {
+        pbscan->phs_numblock = numblocks;
+    }
+
+    /*
+     * If the scan's phs_startblock has not yet been initialized, we must do
+     * so now.  If a startblock was specified, start there, otherwise if this
+     * is not a synchronized scan, we just start at block 0, but if it is a
+     * synchronized scan, we must get the starting position from the
+     * synchronized scan machinery.  We can't hold the spinlock while doing
+     * that, though, so release the spinlock, get the information we need, and
+     * retry.  If nobody else has initialized the scan in the meantime, we'll
+     * fill in the value we fetched on the second time through.
      */
     if (pbscan->phs_startblock == InvalidBlockNumber)
     {
-        if (!pbscan->base.phs_syncscan)
+        if (startblock != InvalidBlockNumber)
+            pbscan->phs_startblock = startblock;
+        else if (!pbscan->base.phs_syncscan)
             pbscan->phs_startblock = 0;
         else if (sync_startpage != InvalidBlockNumber)
             pbscan->phs_startblock = sync_startpage;
@@ -478,6 +512,34 @@ retry:
         }
     }
     SpinLockRelease(&pbscan->phs_mutex);
+
+    /*
+     * Figure out how many blocks we're going to scan; either all of them, or
+     * just phs_numblock's worth, if a limit has been imposed.
+     */
+    if (pbscan->phs_numblock == InvalidBlockNumber)
+        scan_nblocks = pbscan->phs_nblocks;
+    else
+        scan_nblocks = pbscan->phs_numblock;
+
+    /*
+     * We determine the chunk size based on scan_nblocks.  First we split
+     * scan_nblocks into PARALLEL_SEQSCAN_NCHUNKS chunks then we calculate the
+     * next highest power of 2 number of the result.  This means we split the
+     * blocks we're scanning into somewhere between PARALLEL_SEQSCAN_NCHUNKS
+     * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
+     */
+    pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(scan_nblocks /
+                                                       PARALLEL_SEQSCAN_NCHUNKS, 1));
+
+    /*
+     * Ensure we don't go over the maximum chunk size with larger tables.  This
+     * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
+     * tables.  Too large a chunk size has been shown to be detrimental to
+     * sequential scan performance.
+     */
+    pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
+                                      PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
 }
 
 /*
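To make the relocated chunk-size arithmetic concrete: assuming the constants defined earlier in tableam.c (PARALLEL_SEQSCAN_NCHUNKS of 2048 and PARALLEL_SEQSCAN_MAX_CHUNK_SIZE of 8192) and a simplified stand-in for pg_nextpower2_32, a 100,000-block scan works out to 64-block chunks:

    /* Standalone sketch of the chunk-size calculation (assumed constants). */
    #include <stdint.h>

    #define PARALLEL_SEQSCAN_NCHUNKS        2048
    #define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE 8192

    /* Simplified stand-in for pg_nextpower2_32: next power of 2 >= num. */
    static uint32_t
    nextpower2(uint32_t num)
    {
        uint32_t    result = 1;

        while (result < num)
            result <<= 1;
        return result;
    }

    static uint32_t
    chunk_size_for(uint32_t scan_nblocks)
    {
        uint32_t    nchunks = scan_nblocks / PARALLEL_SEQSCAN_NCHUNKS;
        uint32_t    size = nextpower2(nchunks > 1 ? nchunks : 1);

        /* Clamp to the maximum chunk size, as the code above does with Min(). */
        return size > PARALLEL_SEQSCAN_MAX_CHUNK_SIZE ?
            PARALLEL_SEQSCAN_MAX_CHUNK_SIZE : size;
    }

    /* chunk_size_for(100000): 100000 / 2048 = 48, rounded up to 64 blocks. */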
@@ -493,6 +555,7 @@ table_block_parallelscan_nextpage(Relation rel,
                                   ParallelBlockTableScanWorker pbscanwork,
                                   ParallelBlockTableScanDesc pbscan)
 {
+    BlockNumber scan_nblocks;
     BlockNumber page;
     uint64      nallocated;
@@ -513,7 +576,7 @@ table_block_parallelscan_nextpage(Relation rel,
      *
      * Here we name these ranges of blocks "chunks".  The initial size of
      * these chunks is determined in table_block_parallelscan_startblock_init
-     * based on the size of the relation.  Towards the end of the scan, we
+     * based on the number of blocks to scan.  Towards the end of the scan, we
      * start making reductions in the size of the chunks in order to attempt
      * to divide the remaining work over all the workers as evenly as
      * possible.
@@ -530,17 +593,23 @@ table_block_parallelscan_nextpage(Relation rel,
      * phs_nallocated counter will exceed rs_nblocks, because workers will
      * still increment the value, when they try to allocate the next block but
      * all blocks have been allocated already.  The counter must be 64 bits
-     * wide because of that, to avoid wrapping around when rs_nblocks is close
-     * to 2^32.
+     * wide because of that, to avoid wrapping around when scan_nblocks is
+     * close to 2^32.
      *
      * The actual block to return is calculated by adding the counter to the
-     * starting block number, modulo nblocks.
+     * starting block number, modulo phs_nblocks.
      */
 
+    /* First, figure out how many blocks we're planning on scanning */
+    if (pbscan->phs_numblock == InvalidBlockNumber)
+        scan_nblocks = pbscan->phs_nblocks;
+    else
+        scan_nblocks = pbscan->phs_numblock;
+
     /*
-     * First check if we have any remaining blocks in a previous chunk for
-     * this worker.  We must consume all of the blocks from that before we
-     * allocate a new chunk to the worker.
+     * Now check if we have any remaining blocks in a previous chunk for this
+     * worker.  We must consume all of the blocks from that before we allocate
+     * a new chunk to the worker.
      */
     if (pbscanwork->phsw_chunk_remaining > 0)
     {
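For orientation, the allocation scheme the comments in this hunk describe boils down to the following simplified sketch (the ramp-down and sync-scan reporting handled elsewhere in this function are omitted):

    if (pbscanwork->phsw_chunk_remaining > 0)
    {
        /* Still consuming blocks from this worker's current chunk. */
        nallocated = ++pbscanwork->phsw_nallocated;
        pbscanwork->phsw_chunk_remaining--;
    }
    else
    {
        /* Claim a whole new chunk with a single atomic fetch-add. */
        nallocated = pbscanwork->phsw_nallocated =
            pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
                                    pbscanwork->phsw_chunk_size);
        pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
    }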
@@ -562,7 +631,7 @@ table_block_parallelscan_nextpage(Relation rel,
      * chunk size set to 1.
      */
     if (pbscanwork->phsw_chunk_size > 1 &&
-        pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
+        pbscanwork->phsw_nallocated > scan_nblocks -
         (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
         pbscanwork->phsw_chunk_size >>= 1;
@@ -577,7 +646,8 @@ table_block_parallelscan_nextpage(Relation rel,
         pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
     }
 
-    if (nallocated >= pbscan->phs_nblocks)
+    /* Check if we've run out of blocks to scan */
+    if (nallocated >= scan_nblocks)
         page = InvalidBlockNumber;  /* all blocks have been allocated */
     else
         page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
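A worked example of that final calculation: scan_nblocks caps how many blocks get handed out, but the modulo deliberately stays on phs_nblocks so a synchronized scan wraps around the end of the relation (the numbers below are hypothetical):

    /* 1000-block table, sync scan started at block 900, no numblock limit. */
    BlockNumber phs_nblocks = 1000;
    BlockNumber phs_startblock = 900;
    uint64      nallocated = 150;       /* the 151st block handed out */

    BlockNumber page = (nallocated + phs_startblock) % phs_nblocks;
    /* (150 + 900) % 1000 = 50: the scan has wrapped past block 999 */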
page = ( nallocated + pbscan - > phs_startblock ) % pbscan - > phs_nblocks ;