Handle ENOENT status when querying NUMA node

We've assumed that touching the memory is sufficient for a page to be
located on one of the NUMA nodes. But a page may be moved to swap after
we touch it, due to memory pressure.

We touch the memory before querying the status, but there is no
guarantee the page won't be moved to swap in the meantime. The touching
happens only on the first call, so later calls are more likely to be
affected, and the batching widens the window further.
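
For context, the query that produces these statuses looks roughly like
the following minimal sketch, assuming Linux and libnuma's move_pages()
wrapper (the buffer and error handling here are illustrative, not the
PostgreSQL code):

/*
 * Minimal sketch of the touch-then-query pattern, assuming Linux and
 * libnuma (<numaif.h>, link with -lnuma).  Illustrative only.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <numaif.h>

int
main(void)
{
	long		page_size = sysconf(_SC_PAGESIZE);
	char	   *buf = aligned_alloc(page_size, page_size);
	void	   *pages[1] = {buf};
	int			status[1];

	buf[0] = 0;					/* touch the page so it gets faulted in */

	/*
	 * With nodes == NULL, move_pages() does not move anything; it only
	 * reports the node each page resides on.  The page may be reclaimed
	 * between the touch and the query, in which case the status is
	 * -ENOENT (-2) rather than a node number.
	 */
	if (move_pages(0, 1, pages, NULL, status, 0) != 0)
		perror("move_pages");
	else if (status[0] >= 0)
		printf("page is on NUMA node %d\n", status[0]);
	else if (status[0] == -ENOENT)
		printf("page not present (e.g. moved to swap)\n");
	else
		printf("unexpected status: %d\n", status[0]);

	free(buf);
	return 0;
}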

It's up to the kernel if/when pages get moved to swap. We have to accept
ENOENT (-2) as a valid result, and handle it without failing. This patch
simply treats it as an unknown node, and returns NULL in the two
affected views (pg_shmem_allocations_numa and pg_buffercache_numa).
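
In pg_get_shmem_allocations_numa() this means sizing the per-node
counter array with one extra slot and bucketing ENOENT pages there. The
idea, with made-up values outside the real code, is roughly:

/*
 * Sketch of the counting scheme: one counter per NUMA node plus one
 * extra slot for pages the kernel reported as gone.  The values are
 * made up; the real code iterates the per-entry status array.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int			max_nodes = 3;	/* pg_numa_get_max_node() in the real code */
	int			status[] = {0, 1, -ENOENT, 1, -ENOENT};
	size_t	   *nodes = calloc(max_nodes + 2, sizeof(size_t));

	for (int i = 0; i < 5; i++)
	{
		int			s = status[i];

		if (s >= 0 && s <= max_nodes)
			nodes[s]++;			/* valid NUMA node */
		else if (s == -ENOENT)
			nodes[max_nodes + 1]++; /* page gone, e.g. swapped out */
		else
		{
			fprintf(stderr, "unexpected status: %d\n", s);
			return 1;
		}
	}

	for (int n = 0; n <= max_nodes; n++)
		printf("node %d: %zu pages\n", n, nodes[n]);
	printf("no node: %zu pages\n", nodes[max_nodes + 1]);

	free(nodes);
	return 0;
}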

Hugepages cannot be swapped out, so this affects only regular pages.

Reported by Christoph Berg, investigation and fix by me. Backpatch to
18, where the two views were introduced.

Reported-by: Christoph Berg <myon@debian.org>
Discussion: https://postgr.es/m/aTq5Gt_n-oS_QSpL@msg.df7cb.de
Backpatch-through: 18
commit 3fccbd94cb
parent 302879bd68
Author: Tomas Vondra

 contrib/pg_buffercache/pg_buffercache_pages.c | 12
 src/backend/storage/ipc/shmem.c               | 32

diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -551,8 +551,16 @@ pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
 			if (fctx->include_numa)
 			{
-				values[2] = Int32GetDatum(fctx->record[i].numa_node);
-				nulls[2] = false;
+				/* status is valid node number */
+				if (fctx->record[i].numa_node >= 0)
+				{
+					values[2] = Int32GetDatum(fctx->record[i].numa_node);
+					nulls[2] = false;
+				} else {
+					/* some kind of error (e.g. pages moved to swap) */
+					values[2] = (Datum) 0;
+					nulls[2] = true;
+				}
 			}
 			else
 			{

diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -599,7 +599,7 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 	InitMaterializedSRF(fcinfo, 0);
 
 	max_nodes = pg_numa_get_max_node();
-	nodes = palloc_array(Size, max_nodes + 1);
+	nodes = palloc_array(Size, max_nodes + 2);
 
 	/*
 	 * Shared memory allocations can vary in size and may not align with OS
@@ -635,7 +635,6 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 	hash_seq_init(&hstat, ShmemIndex);
 
 	/* output all allocated entries */
-	memset(nulls, 0, sizeof(nulls));
 	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
 	{
 		int			i;
@@ -684,22 +683,33 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 			elog(ERROR, "failed NUMA pages inquiry status: %m");
 
 		/* Count number of NUMA nodes used for this shared memory entry */
-		memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+		memset(nodes, 0, sizeof(Size) * (max_nodes + 2));
 
 		for (i = 0; i < shm_ent_page_count; i++)
 		{
 			int			s = pages_status[i];
 
 			/* Ensure we are adding only valid index to the array */
-			if (s < 0 || s > max_nodes)
+			if (s >= 0 && s <= max_nodes)
 			{
-				elog(ERROR, "invalid NUMA node id outside of allowed range "
-					 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+				/* valid NUMA node */
+				nodes[s]++;
+				continue;
 			}
+			else if (s == -2)
+			{
+				/* -2 means ENOENT (e.g. page was moved to swap) */
+				nodes[max_nodes + 1]++;
+				continue;
+			}
 
-			nodes[s]++;
+			elog(ERROR, "invalid NUMA node id outside of allowed range "
+				 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
 		}
 
+		/* no NULLs for regular nodes */
+		memset(nulls, 0, sizeof(nulls));
+
 		/*
 		 * Add one entry for each NUMA node, including those without allocated
 		 * memory for this segment.
@@ -713,6 +723,14 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
 								 values, nulls);
 		}
+
+		/* The last entry is used for pages without a NUMA node. */
+		nulls[1] = true;
+
+		values[0] = CStringGetTextDatum(ent->key);
+		values[2] = Int64GetDatum(nodes[max_nodes + 1] * os_page_size);
+		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+							 values, nulls);
 	}
 
 	LWLockRelease(ShmemIndexLock);
