Introduce pg_shmem_allocations_numa view

Introduce new pg_shmem_allocations_numa view with information about how shared memory is distributed across NUMA nodes. For each shared memory segment, the view returns one row for each NUMA node backing it, with the total amount of memory allocated from that node. The view may be relatively expensive, especially when executed for the first time in a backend, as it has to touch all memory pages to get reliable information about the NUMA node. This may also force allocation of the shared memory. Unlike pg_shmem_allocations, the view does not show anonymous shared memory allocations. It also does not show memory allocated using the dynamic shared memory infrastructure. Author: Jakub Wartak <jakub.wartak@enterprisedb.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com> Reviewed-by: Tomas Vondra <tomas@vondra.me> Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com
9 months ago · 8cc139bec3
parent 65c298f61f
commit 8cc139bec3
12 changed files with 322 additions and 6 deletions
--- a/doc/src/sgml/system-views.sgml
+++ b/doc/src/sgml/system-views.sgml
@ -181,6 +181,11 @@
      <entry>shared memory allocations</entry>
     </row>

+     <row>
+      <entry><link linkend="view-pg-shmem-allocations-numa"><structname>pg_shmem_allocations_numa</structname></link></entry>
+      <entry>NUMA node mappings for shared memory allocations</entry>
+     </row>
+
     <row>
      <entry><link linkend="view-pg-stats"><structname>pg_stats</structname></link></entry>
      <entry>planner statistics</entry>
@ -4051,6 +4056,96 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
  </para>
 </sect1>

+ <sect1 id="view-pg-shmem-allocations-numa">
+  <title><structname>pg_shmem_allocations_numa</structname></title>
+
+  <indexterm zone="view-pg-shmem-allocations-numa">
+   <primary>pg_shmem_allocations_numa</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_shmem_allocations_numa</structname> view shows how shared
+   memory allocations in the server's main shared memory segment are distributed
+   across NUMA nodes. This includes both memory allocated by
+   <productname>PostgreSQL</productname> itself and memory allocated
+   by extensions using the mechanisms detailed in
+   <xref linkend="xfunc-shared-addin" />. This view will output multiple rows
+   for each of the shared memory segments provided that they are spread across
+   multiple NUMA nodes. This view should not be queried by monitoring systems
+   as it is very slow and may end up allocating shared memory in case it was not
+   used earlier.
+   A current limitation of this view is that it does not show anonymous
+   shared memory allocations.
+  </para>
+
+  <para>
+   Note that this view does not include memory allocated using the dynamic
+   shared memory infrastructure.
+  </para>
+
+  <warning>
+    <para>
+      When determining the <acronym>NUMA</acronym> node, the view touches
+      all memory pages for the shared memory segment. This will force
+      allocation of the shared memory, if it wasn't allocated already,
+      and the memory may get allocated in a single <acronym>NUMA</acronym>
+      node (depending on system configuration).
+    </para>
+  </warning>
+
+  <table>
+   <title><structname>pg_shmem_allocations_numa</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>name</structfield> <type>text</type>
+      </para>
+      <para>
+       The name of the shared memory allocation.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>numa_node</structfield> <type>int4</type>
+      </para>
+      <para>
+       ID of the <acronym>NUMA</acronym> node
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>size</structfield> <type>int8</type>
+      </para>
+      <para>
+       Size of the allocation on this particular NUMA memory node in bytes
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   By default, the <structname>pg_shmem_allocations_numa</structname> view can be
+   read only by superusers or roles with privileges of the
+   <literal>pg_read_all_stats</literal> role.
+  </para>
+ </sect1>
+
 <sect1 id="view-pg-stats">
  <title><structname>pg_stats</structname></title>

--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats;
 REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
 GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;

+-- Per-NUMA-node breakdown of allocations in the main shared memory segment.
+CREATE VIEW pg_shmem_allocations_numa AS
+    SELECT name, numa_node, size
+    FROM pg_get_shmem_allocations_numa();
+
+-- Neither the view nor its underlying function is available to PUBLIC;
+-- access is granted to pg_read_all_stats only.
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats;
+REVOKE ALL ON pg_shmem_allocations_numa FROM PUBLIC;
+GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats;
+
+
 CREATE VIEW pg_backend_memory_contexts AS
    SELECT * FROM pg_get_backend_memory_contexts();

--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@ -68,6 +68,7 @@
 #include "fmgr.h"
 #include "funcapi.h"
 #include "miscadmin.h"
+#include "port/pg_numa.h"
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
 #include "storage/shmem.h"
@ -89,6 +90,8 @@ slock_t    *ShmemLock;			/* spinlock for shared memory and LWLock

 static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */

+/* To get reliable results for NUMA inquiry we need to "touch pages" once */
+static bool firstNumaTouch = true;

 /*
 *	InitShmemAccess() --- set up basic pointers to shared memory.
@ -568,3 +571,159 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)

 	return (Datum) 0;
 }
+
+/*
+ * SQL SRF showing how allocations in the main shared memory segment are
+ * distributed across NUMA memory nodes.
+ *
+ * For every entry in ShmemIndex, one row is emitted per NUMA node
+ * (0 .. pg_numa_get_max_node()), carrying the entry name, the node ID and
+ * the size in bytes (a whole number of OS pages) of the part of the entry
+ * found on that node. Nodes holding no pages of the entry get size 0.
+ *
+ * Compared to pg_get_shmem_allocations(), this function does not return
+ * information about shared anonymous allocations and unused shared memory.
+ *
+ * Raises an ERROR if NUMA support cannot be initialized, if the page
+ * inquiry fails, or if a page reports a status outside [0, max node].
+ */
+Datum
+pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	HASH_SEQ_STATUS hstat;
+	ShmemIndexEnt *ent;
+	Datum		values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	bool		nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	Size		os_page_size;
+	void	  **page_ptrs;
+	int		   *pages_status;
+	uint64		shm_total_page_count,
+				shm_ent_page_count,
+				max_nodes;
+	Size	   *nodes;
+
+	if (pg_numa_init() == -1)
+		elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	/* nodes[] holds a per-node page counter, indexed by NUMA node ID */
+	max_nodes = pg_numa_get_max_node();
+	nodes = palloc(sizeof(Size) * (max_nodes + 1));
+
+	/*
+	 * The NUMA inquiry below is done per OS memory page, while shared memory
+	 * entries may start / end half-way through a page. Determine the OS
+	 * memory page size first; it is used to map each entry to the range of
+	 * OS pages it overlaps.
+	 */
+	os_page_size = pg_numa_get_pagesize();
+
+	/*
+	 * Allocate memory for page pointers and status based on total shared
+	 * memory size. This simplified approach allocates enough space for all
+	 * pages in shared memory rather than calculating the exact requirements
+	 * for each segment.
+	 *
+	 * Add 1, because we don't know how exactly the segments align to OS
+	 * pages, so the allocation might use one more memory page. In practice
+	 * this is not very likely, and moreover we have more entries, each of
+	 * them using only fraction of the total pages.
+	 */
+	shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
+	page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
+	pages_status = palloc(sizeof(int) * shm_total_page_count);
+
+	if (firstNumaTouch)
+		elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	hash_seq_init(&hstat, ShmemIndex);
+
+	/* output all allocated entries */
+	memset(nulls, 0, sizeof(nulls));
+	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		uint64		pageno;
+		uint64		node;
+		char	   *startptr,
+				   *endptr;
+		Size		total_len;
+
+		/*
+		 * Calculate the range of OS pages used by this segment. The segment
+		 * may start / end half-way through a page, we want to count these
+		 * pages too. So we align the start/end pointers down/up, and then
+		 * calculate the number of pages from that.
+		 */
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
+		endptr = (char *) TYPEALIGN(os_page_size,
+									(char *) ent->location + ent->allocated_size);
+		total_len = (endptr - startptr);
+
+		shm_ent_page_count = total_len / os_page_size;
+
+		/*
+		 * Preset every status to -1 (all bytes 0xff). If we ever see -1
+		 * after the kernel inquiry, the status was not filled in, which
+		 * probably means a bug in mapping the entry to OS pages.
+		 */
+		memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+		/*
+		 * Setup page_ptrs[] with pointers to all OS pages for this segment,
+		 * and get the NUMA status using pg_numa_query_pages.
+		 *
+		 * In order to get reliable results we also need to touch memory
+		 * pages, so that inquiry about NUMA memory node doesn't return -2
+		 * (ENOENT, which indicates unmapped/unallocated pages).
+		 *
+		 * The page counter is 64 bits wide to match shm_ent_page_count,
+		 * avoiding a signed/unsigned comparison and int overflow.
+		 */
+		for (pageno = 0; pageno < shm_ent_page_count; pageno++)
+		{
+			volatile uint64 touch pg_attribute_unused();
+
+			page_ptrs[pageno] = startptr + (pageno * os_page_size);
+
+			if (firstNumaTouch)
+				pg_numa_touch_mem_if_required(touch, page_ptrs[pageno]);
+
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+		/* Count the OS pages of this entry residing on each NUMA node */
+		memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+
+		for (pageno = 0; pageno < shm_ent_page_count; pageno++)
+		{
+			int			s = pages_status[pageno];
+
+			/* Ensure we are adding only valid index to the array */
+			if (s < 0 || (uint64) s > max_nodes)
+			{
+				elog(ERROR, "invalid NUMA node id outside of allowed range "
+					 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+			}
+
+			nodes[s]++;
+		}
+
+		/*
+		 * Add one entry for each NUMA node, including those without allocated
+		 * memory for this segment.
+		 */
+		for (node = 0; node <= max_nodes; node++)
+		{
+			values[0] = CStringGetTextDatum(ent->key);
+			/* numa_node is an int4 column, so build a proper int32 Datum */
+			values[1] = Int32GetDatum((int32) node);
+			values[2] = Int64GetDatum(nodes[node] * os_page_size);
+
+			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+								 values, nulls);
+		}
+	}
+
+	LWLockRelease(ShmemIndexLock);
+
+	/* Pages were touched once in this backend; later calls skip that step */
+	firstNumaTouch = false;
+
+	return (Datum) 0;
+}
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@ -57,6 +57,6 @@
 */

 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	202504072
+#define CATALOG_VERSION_NO	202504073

 #endif
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@ -8546,6 +8546,14 @@
  proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool',
  proargtypes => '', prosrc => 'pg_numa_available' },

+# shared memory usage with NUMA info
+{ oid => '4100', descr => 'NUMA mappings for the main shared memory segment',
+  proname => 'pg_get_shmem_allocations_numa', prorows => '50', proretset => 't',
+  provolatile => 'v', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}',
+  proargnames => '{name,numa_node,size}',
+  prosrc => 'pg_get_shmem_allocations_numa' },
+
 # memory context of local backend
 { oid => '2282',
  descr => 'information about all memory contexts of local backend',
--- a/src/test/regress/expected/numa.out
+++ b/src/test/regress/expected/numa.out
@ -0,0 +1,13 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+\quit
+\endif
+-- switch to superuser
+\c -
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
+ ok 
+----
+ t
+(1 row)
+
--- a/src/test/regress/expected/numa_1.out
+++ b/src/test/regress/expected/numa_1.out
@ -0,0 +1,5 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+ERROR:  libnuma initialization failed or NUMA is not supported on this platform
+\quit
--- a/src/test/regress/expected/privileges.out
+++ b/src/test/regress/expected/privileges.out
@ -3219,8 +3219,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
 -- clean up
 DROP TABLE lock_table;
 DROP USER regress_locktable_user;
-- test to check privileges of system views pg_shmem_allocations and
-- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
 -- switch to superuser
 \c -
 CREATE ROLE regress_readallstats;
@ -3242,6 +3242,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
 f
 (1 row)

+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
+ has_table_privilege 
+---------------------
+ f
+(1 row)
+
 GRANT pg_read_all_stats TO regress_readallstats;
 SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
 has_table_privilege 
@ -3261,6 +3267,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
 t
 (1 row)

+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
+ has_table_privilege 
+---------------------
+ t
+(1 row)
+
 -- run query to ensure that functions within views can be executed
 SET ROLE regress_readallstats;
 SELECT COUNT(*) >= 0 AS ok FROM pg_aios;
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@ -1757,6 +1757,10 @@ pg_shmem_allocations| SELECT name,
    size,
    allocated_size
   FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size);
+pg_shmem_allocations_numa| SELECT name,
+    numa_node,
+    size
+   FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, numa_node, size);
 pg_stat_activity| SELECT s.datid,
    d.datname,
    s.pid,
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@ -119,7 +119,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr
 # The stats test resets stats, so nothing else needing stats access can be in
 # this group.
 # ----------
-test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate
+test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa

 # event_trigger depends on create_am and cannot run concurrently with
 # any test that runs DDL
--- a/src/test/regress/sql/numa.sql
+++ b/src/test/regress/sql/numa.sql
@ -0,0 +1,10 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+\quit
+\endif
+
+-- switch to superuser
+\c -
+
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
--- a/src/test/regress/sql/privileges.sql
+++ b/src/test/regress/sql/privileges.sql
@ -1947,8 +1947,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
 DROP TABLE lock_table;
 DROP USER regress_locktable_user;

-- test to check privileges of system views pg_shmem_allocations and
-- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.

 -- switch to superuser
 \c -
@ -1958,12 +1958,14 @@ CREATE ROLE regress_readallstats;
 SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no

 GRANT pg_read_all_stats TO regress_readallstats;

 SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes

 -- run query to ensure that functions within views can be executed
 SET ROLE regress_readallstats;