@ -68,6 +68,7 @@
# include "fmgr.h"
# include "funcapi.h"
# include "miscadmin.h"
# include "port/pg_numa.h"
# include "storage/lwlock.h"
# include "storage/pg_shmem.h"
# include "storage/shmem.h"
@ -89,6 +90,8 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock
static HTAB * ShmemIndex = NULL ; /* primary index hashtable for shmem */
/*
 * To get reliable results for NUMA inquiry we need to "touch pages" once.
 *
 * Starts out true. While it is set, pg_get_shmem_allocations_numa() touches
 * every OS page it inspects before querying its NUMA node; the flag is
 * cleared once that function has run to completion.
 */
static bool firstNumaTouch = true ;
/*
* InitShmemAccess ( ) - - - set up basic pointers to shared memory .
@ -568,3 +571,159 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
return ( Datum ) 0 ;
}
/*
* SQL SRF showing NUMA memory nodes for allocated shared memory
*
* Compared to pg_get_shmem_allocations ( ) , this function does not return
* information about shared anonymous allocations and unused shared memory .
*/
Datum
pg_get_shmem_allocations_numa ( PG_FUNCTION_ARGS )
{
# define PG_GET_SHMEM_NUMA_SIZES_COLS 3
ReturnSetInfo * rsinfo = ( ReturnSetInfo * ) fcinfo - > resultinfo ;
HASH_SEQ_STATUS hstat ;
ShmemIndexEnt * ent ;
Datum values [ PG_GET_SHMEM_NUMA_SIZES_COLS ] ;
bool nulls [ PG_GET_SHMEM_NUMA_SIZES_COLS ] ;
Size os_page_size ;
void * * page_ptrs ;
int * pages_status ;
uint64 shm_total_page_count ,
shm_ent_page_count ,
max_nodes ;
Size * nodes ;
if ( pg_numa_init ( ) = = - 1 )
elog ( ERROR , " libnuma initialization failed or NUMA is not supported on this platform " ) ;
InitMaterializedSRF ( fcinfo , 0 ) ;
max_nodes = pg_numa_get_max_node ( ) ;
nodes = palloc ( sizeof ( Size ) * ( max_nodes + 1 ) ) ;
/*
* Different database block sizes ( 4 kB , 8 kB , . . . , 32 kB ) can be used , while
* the OS may have different memory page sizes .
*
* To correctly map between them , we need to : 1. Determine the OS memory
* page size 2. Calculate how many OS pages are used by all buffer blocks
* 3. Calculate how many OS pages are contained within each database
* block .
*
* This information is needed before calling move_pages ( ) for NUMA memory
* node inquiry .
*/
os_page_size = pg_numa_get_pagesize ( ) ;
/*
* Allocate memory for page pointers and status based on total shared
* memory size . This simplified approach allocates enough space for all
* pages in shared memory rather than calculating the exact requirements
* for each segment .
*
* Add 1 , because we don ' t know how exactly the segments align to OS
* pages , so the allocation might use one more memory page . In practice
* this is not very likely , and moreover we have more entries , each of
* them using only fraction of the total pages .
*/
shm_total_page_count = ( ShmemSegHdr - > totalsize / os_page_size ) + 1 ;
page_ptrs = palloc0 ( sizeof ( void * ) * shm_total_page_count ) ;
pages_status = palloc ( sizeof ( int ) * shm_total_page_count ) ;
if ( firstNumaTouch )
elog ( DEBUG1 , " NUMA: page-faulting shared memory segments for proper NUMA readouts " ) ;
LWLockAcquire ( ShmemIndexLock , LW_SHARED ) ;
hash_seq_init ( & hstat , ShmemIndex ) ;
/* output all allocated entries */
memset ( nulls , 0 , sizeof ( nulls ) ) ;
while ( ( ent = ( ShmemIndexEnt * ) hash_seq_search ( & hstat ) ) ! = NULL )
{
int i ;
char * startptr ,
* endptr ;
Size total_len ;
/*
* Calculate the range of OS pages used by this segment . The segment
* may start / end half - way through a page , we want to count these
* pages too . So we align the start / end pointers down / up , and then
* calculate the number of pages from that .
*/
startptr = ( char * ) TYPEALIGN_DOWN ( os_page_size , ent - > location ) ;
endptr = ( char * ) TYPEALIGN ( os_page_size ,
( char * ) ent - > location + ent - > allocated_size ) ;
total_len = ( endptr - startptr ) ;
shm_ent_page_count = total_len / os_page_size ;
/*
* If we ever get 0xff ( - 1 ) back from kernel inquiry , then we probably
* have a bug in mapping buffers to OS pages .
*/
memset ( pages_status , 0xff , sizeof ( int ) * shm_ent_page_count ) ;
/*
* Setup page_ptrs [ ] with pointers to all OS pages for this segment ,
* and get the NUMA status using pg_numa_query_pages .
*
* In order to get reliable results we also need to touch memory
* pages , so that inquiry about NUMA memory node doesn ' t return - 2
* ( ENOENT , which indicates unmapped / unallocated pages ) .
*/
for ( i = 0 ; i < shm_ent_page_count ; i + + )
{
volatile uint64 touch pg_attribute_unused ( ) ;
page_ptrs [ i ] = startptr + ( i * os_page_size ) ;
if ( firstNumaTouch )
pg_numa_touch_mem_if_required ( touch , page_ptrs [ i ] ) ;
CHECK_FOR_INTERRUPTS ( ) ;
}
if ( pg_numa_query_pages ( 0 , shm_ent_page_count , page_ptrs , pages_status ) = = - 1 )
elog ( ERROR , " failed NUMA pages inquiry status: %m " ) ;
/* Count number of NUMA nodes used for this shared memory entry */
memset ( nodes , 0 , sizeof ( Size ) * ( max_nodes + 1 ) ) ;
for ( i = 0 ; i < shm_ent_page_count ; i + + )
{
int s = pages_status [ i ] ;
/* Ensure we are adding only valid index to the array */
if ( s < 0 | | s > max_nodes )
{
elog ( ERROR , " invalid NUMA node id outside of allowed range "
" [0, " UINT64_FORMAT " ]: %d " , max_nodes , s ) ;
}
nodes [ s ] + + ;
}
/*
* Add one entry for each NUMA node , including those without allocated
* memory for this segment .
*/
for ( i = 0 ; i < = max_nodes ; i + + )
{
values [ 0 ] = CStringGetTextDatum ( ent - > key ) ;
values [ 1 ] = i ;
values [ 2 ] = Int64GetDatum ( nodes [ i ] * os_page_size ) ;
tuplestore_putvalues ( rsinfo - > setResult , rsinfo - > setDesc ,
values , nulls ) ;
}
}
LWLockRelease ( ShmemIndexLock ) ;
firstNumaTouch = false ;
return ( Datum ) 0 ;
}