@@ -26,7 +26,7 @@
#define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
#define NUM_BUFFERCACHE_NUMA_ELEM 3
#define NUM_BUFFERCACHE_OS_PAGES_ELEM 3
PG_MODULE_MAGIC_EXT(
                    .name = "pg_buffercache",
@@ -67,14 +67,16 @@ typedef struct
} BufferCachePagesContext;
/*
 * Record structure holding the to be exposed cache data.
 * Record structure holding the to be exposed cache data for OS pages.  This
 * structure is used by pg_buffercache_os_pages(), where NUMA information may
 * or may not be included.
 */
typedef struct
{
    uint32      bufferid;
    int64       page_num;
    int32       numa_node;
} BufferCacheNumaRec;
} BufferCacheOsPagesRec;
/*
 * Function context for data persisting over repeated calls.
@@ -82,8 +84,9 @@ typedef struct
typedef struct
{
    TupleDesc   tupdesc;
    BufferCacheNumaRec *record;
} BufferCacheNumaContext;
    bool        include_numa;
    BufferCacheOsPagesRec *record;
} BufferCacheOsPagesContext;
/*
@@ -91,6 +94,7 @@ typedef struct
 * relation node/tablespace/database/blocknum and dirty indicator.
 */
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
@@ -284,26 +288,32 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
}
/*
 * Inquire about NUMA memory mappings for shared buffers.
 * Inquire about OS page mappings for shared buffers, optionally including
 * NUMA information.
 *
 * When "include_numa" is false, this routine ignores everything related
 * to NUMA (numa_node is returned as NULL) and only reports the mapping
 * between shared buffers and OS pages.
 *
 * When "include_numa" is true, NUMA is initialized and numa_node values
 * are generated.  In order to get reliable results we also need to touch
 * memory pages, so that the inquiry about NUMA memory node does not return
 * -2, indicating unmapped/unallocated pages.
 *
 * Returns NUMA node ID for each memory page used by the buffer.  Buffers may
 * be smaller or larger than OS memory pages.  For each buffer we return one
 * entry for each memory page used by the buffer (if the buffer is smaller,
 * it only uses a part of one memory page).
 * Buffers may be smaller or larger than OS memory pages.  For each buffer we
 * return one entry for each memory page used by the buffer (if the buffer is
 * smaller, it only uses a part of one memory page).
 *
 * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
 * one is always a multiple of the other.
 *
 * In order to get reliable results we also need to touch memory pages, so
 * that the inquiry about NUMA memory node doesn't return -2 (which indicates
 * unmapped/unallocated pages).
 */
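/*
 * For example: with the default 8 kB BLCKSZ and 4 kB OS pages each buffer
 * spans two OS pages and yields two rows, whereas with 2 MB huge pages a
 * single OS page covers 256 buffers and the same page_num repeats across
 * those buffers.
 */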
Datum
pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
static Datum
pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
{
    FuncCallContext *funcctx;
    MemoryContext oldcontext;
    BufferCacheNumaContext *fctx;       /* User function context. */
    BufferCacheOsPagesContext *fctx;    /* User function context. */
    TupleDesc   tupledesc;
    TupleDesc   expected_tupledesc;
    HeapTuple   tuple;
@@ -314,15 +324,15 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
    int         i,
                idx;
    Size        os_page_size;
    void      **os_page_ptrs;
    int        *os_page_status;
    uint64      os_page_count;
    int         pages_per_buffer;
    int        *os_page_status = NULL;
    uint64      os_page_count = 0;
    int         max_entries;
    char       *startptr,
               *endptr;
    if (pg_numa_init() == -1)
    /* If NUMA information is requested, initialize NUMA support. */
    if (include_numa && pg_numa_init() == -1)
        elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
    /*
@@ -350,51 +360,56 @@
     */
    Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
    /*
     * How many addresses we are going to query?  Simply get the page for
     * the first buffer, and first page after the last buffer, and count
     * the pages from that.
     */
    startptr = (char *) TYPEALIGN_DOWN(os_page_size,
                                       BufferGetBlock(1));
    endptr = (char *) TYPEALIGN(os_page_size,
                                (char *) BufferGetBlock(NBuffers) + BLCKSZ);
    os_page_count = (endptr - startptr) / os_page_size;
    /* Used to determine the NUMA node for all OS pages at once */
    os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
    os_page_status = palloc(sizeof(uint64) * os_page_count);
    /*
     * Fill pointers for all the memory pages.  This loop stores and
     * touches (if needed) addresses into os_page_ptrs[] as input to one
     * big move_pages(2) inquiry system call, as done in
     * pg_numa_query_pages().
     */
    idx = 0;
    for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
    if (include_numa)
    {
        os_page_ptrs[idx++] = ptr;
        void      **os_page_ptrs = NULL;
        /*
         * How many addresses we are going to query?  Simply get the page
         * for the first buffer, and first page after the last buffer, and
         * count the pages from that.
         */
        startptr = (char *) TYPEALIGN_DOWN(os_page_size,
                                           BufferGetBlock(1));
        endptr = (char *) TYPEALIGN(os_page_size,
                                    (char *) BufferGetBlock(NBuffers) + BLCKSZ);
        os_page_count = (endptr - startptr) / os_page_size;
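        /*
         * For instance, with 16384 shared buffers of 8 kB (shared_buffers =
         * 128MB) and 4 kB OS pages this yields 32768 OS pages to query (one
         * more if the buffer pool does not start on an OS page boundary).
         */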
        /* Used to determine the NUMA node for all OS pages at once */
        os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
        os_page_status = palloc(sizeof(uint64) * os_page_count);
        /*
         * Fill pointers for all the memory pages.  This loop stores and
         * touches (if needed) addresses into os_page_ptrs[] as input to
         * one big move_pages(2) inquiry system call, as done in
         * pg_numa_query_pages().
         */
        idx = 0;
        for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
        {
            os_page_ptrs[idx++] = ptr;
            /* Only need to touch memory once per backend process lifetime */
            if (firstNumaTouch)
                pg_numa_touch_mem_if_required(ptr);
        }
    /* Only need to touch memory once per backend process lifetime */
    if (firstNumaTouch)
        pg_numa_touch_mem_if_required(ptr);
    }
    Assert(idx == os_page_count);
        Assert(idx == os_page_count);
    elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
         "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
        elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
             "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
    /*
     * If we ever get 0xff back from kernel inquiry, then we probably have
     * a bug in our buffers to OS page mapping code here.
     */
    memset(os_page_status, 0xff, sizeof(int) * os_page_count);
        /*
         * If we ever get 0xff back from kernel inquiry, then we probably
         * have a bug in our buffers to OS page mapping code here.
         */
        memset(os_page_status, 0xff, sizeof(int) * os_page_count);
    /* Query NUMA status for all the pointers */
    if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
        elog(ERROR, "failed NUMA pages inquiry: %m");
        /* Query NUMA status for all the pointers */
        if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
            elog(ERROR, "failed NUMA pages inquiry: %m");
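        /*
         * On success each os_page_status[] entry holds the NUMA node of the
         * corresponding page, or a negative errno per move_pages(2), e.g.
         * -2 (-ENOENT) for a page that is not present.
         */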
    }
    /* Initialize the multi-call context, load entries about buffers */
@@ -404,12 +419,12 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
    oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    /* Create a user function context for cross-call persistence */
    fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext));
    fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext));
    if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");
    if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM)
    if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
        elog(ERROR, "incorrect number of output arguments");
    /* Construct a tuple descriptor for the result rows. */
@@ -422,6 +437,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
                       INT4OID, -1, 0);
    fctx->tupdesc = BlessTupleDesc(tupledesc);
    fctx->include_numa = include_numa;
    /*
     * Each buffer needs at least one entry, but it might be offset in
@@ -433,15 +449,15 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
    pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
    max_entries = NBuffers * pages_per_buffer;
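    /*
     * For example, with BLCKSZ = 8192 and 4 kB OS pages this reserves
     * Max(1, 2) + 1 = 3 entries per buffer (two OS pages plus one extra in
     * case a buffer is not aligned to an OS page boundary); with 2 MB huge
     * pages it reserves Max(1, 0) + 1 = 2 entries per buffer.
     */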
    /* Allocate entries for BufferCachePagesRec records. */
    fctx->record = (BufferCacheNumaRec *)
    /* Allocate entries for BufferCacheOsPagesRec records. */
    fctx->record = (BufferCacheOsPagesRec *)
        MemoryContextAllocHuge(CurrentMemoryContext,
                               sizeof(BufferCacheNumaRec) * max_entries);
                               sizeof(BufferCacheOsPagesRec) * max_entries);
    /* Return to original context when allocating transient memory */
    MemoryContextSwitchTo(oldcontext);
    if (firstNumaTouch)
    if (include_numa && firstNumaTouch)
        elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
    /*
@@ -488,7 +504,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
        {
            fctx->record[idx].bufferid = bufferid;
            fctx->record[idx].page_num = page_num;
            fctx->record[idx].numa_node = os_page_status[page_num];
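            /*
             * When include_numa is false, -1 is only a placeholder; it is
             * emitted as a NULL numa_node when the rows are built below.
             */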
            fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;
            /* advance to the next entry/page */
            ++idx;
@@ -496,14 +512,18 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
        }
    }
    Assert((idx >= os_page_count) && (idx <= max_entries));
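    /*
     * Without NUMA, os_page_count was left at 0, so only the upper bound is
     * a meaningful check here.
     */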
    Assert(idx <= max_entries);
    if (include_numa)
        Assert(idx >= os_page_count);
    /* Set max calls and remember the user function context. */
    funcctx->max_calls = idx;
    funcctx->user_fctx = fctx;
    /* Remember this backend touched the pages */
    firstNumaTouch = false;
    /* Remember this backend touched the pages (only relevant for NUMA) */
    if (include_numa)
        firstNumaTouch = false;
    }
    funcctx = SRF_PERCALL_SETUP();
@@ -514,8 +534,8 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
    if (funcctx->call_cntr < funcctx->max_calls)
    {
        uint32      i = funcctx->call_cntr;
        Datum       values[NUM_BUFFERCACHE_NUMA_ELEM];
        bool        nulls[NUM_BUFFERCACHE_NUMA_ELEM];
        Datum       values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
        bool        nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
        values[0] = Int32GetDatum(fctx->record[i].bufferid);
        nulls[0] = false;
@@ -523,8 +543,16 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
        values[1] = Int64GetDatum(fctx->record[i].page_num);
        nulls[1] = false;
        values[2] = Int32GetDatum(fctx->record[i].numa_node);
        nulls[2] = false;
        if (fctx->include_numa)
        {
            values[2] = Int32GetDatum(fctx->record[i].numa_node);
            nulls[2] = false;
        }
        else
        {
            values[2] = (Datum) 0;
            nulls[2] = true;
        }
        /* Build and return the tuple. */
        tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
@@ -536,6 +564,30 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
    SRF_RETURN_DONE(funcctx);
}
/*
 * pg_buffercache_os_pages
 *
 * Retrieve information about the OS pages backing shared buffers, with or
 * without NUMA information.
 */
Datum
pg_buffercache_os_pages(PG_FUNCTION_ARGS)
{
    bool        include_numa;
    /* Get the boolean parameter that controls the NUMA behavior. */
    include_numa = PG_GETARG_BOOL(0);
    return pg_buffercache_os_pages_internal(fcinfo, include_numa);
}
/* Backward-compatible wrapper for v1.6. */
Datum
pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
{
    /* Call internal function with include_numa=true */
    return pg_buffercache_os_pages_internal(fcinfo, true);
}
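/*
 * Illustrative sketch, compiled out: the alignment arithmetic that maps a
 * buffer onto its first OS page, as described in the header comment of
 * pg_buffercache_os_pages_internal().  buffer_first_os_page() is a
 * hypothetical helper used only to illustrate the idea.
 */
#ifdef NOT_USED
static inline int64
buffer_first_os_page(const char *startptr, const char *blockptr,
                     Size os_page_size)
{
    /* Round the buffer's start address down to an OS page boundary... */
    const char *aligned = (const char *) TYPEALIGN_DOWN(os_page_size, blockptr);

    /* ...and count whole OS pages from the aligned start of the pool. */
    return (aligned - startptr) / os_page_size;
}
#endif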
Datum
pg_buffercache_summary(PG_FUNCTION_ARGS)
{