From c5282933765ec322a0664d0a0268f8276e83b156 Mon Sep 17 00:00:00 2001
From: Łukasz Mierzwa
Date: Mon, 9 Jun 2025 15:01:35 +0100
Subject: [PATCH] Enhancement: Reload all scrape pools concurrently (#16595)

* Reload all scrape pools concurrently

At the moment all scrape pools that need to be reloaded are reloaded
one by one. While reloads are ongoing mtxScrape is locked. For each
pool that's being reloaded we need to wait until all targets are
updated. This whole process can take a while, and the more scrape
pools there are to reload, the longer it takes. At the same time all
pools are independent and there's no real reason to do them one by
one.

Reload each pool in a separate goroutine so we finish the config
reload as soon as possible and unlock mtxScrape.

Signed-off-by: Lukasz Mierzwa

* Address PR review feedback

Signed-off-by: Lukasz Mierzwa

---------

Signed-off-by: Lukasz Mierzwa
---
 scrape/manager.go | 58 +++++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 20 deletions(-)

diff --git a/scrape/manager.go b/scrape/manager.go
index 6cd4477494..e46c7fdf86 100644
--- a/scrape/manager.go
+++ b/scrape/manager.go
@@ -26,6 +26,7 @@ import (
 	config_util "github.com/prometheus/common/config"
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/common/promslog"
+	"go.uber.org/atomic"
 
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/discovery/targetgroup"
@@ -287,29 +288,46 @@ func (m *Manager) ApplyConfig(cfg *config.Config) error {
 	}
 
 	// Cleanup and reload pool if the configuration has changed.
-	var failed bool
-	for name, sp := range m.scrapePools {
-		switch cfg, ok := m.scrapeConfigs[name]; {
-		case !ok:
-			sp.stop()
-			delete(m.scrapePools, name)
-		case !reflect.DeepEqual(sp.config, cfg):
-			err := sp.reload(cfg)
-			if err != nil {
-				m.logger.Error("error reloading scrape pool", "err", err, "scrape_pool", name)
-				failed = true
-			}
-			fallthrough
-		case ok:
-			if l, ok := m.scrapeFailureLoggers[cfg.ScrapeFailureLogFile]; ok {
-				sp.SetScrapeFailureLogger(l)
-			} else {
-				sp.logger.Error("No logger found. This is a bug in Prometheus that should be reported upstream.", "scrape_pool", name)
+	var (
+		failed   atomic.Bool
+		wg       sync.WaitGroup
+		toDelete sync.Map // Stores the list of names of pools to delete.
+	)
+	for poolName, pool := range m.scrapePools {
+		wg.Add(1)
+		cfg, ok := m.scrapeConfigs[poolName]
+		// Reload each scrape pool in a dedicated goroutine so we don't have to wait a long time
+		// if we have a lot of scrape pools to update.
+		go func(name string, sp *scrapePool, cfg *config.ScrapeConfig, ok bool) {
+			defer wg.Done()
+			switch {
+			case !ok:
+				sp.stop()
+				toDelete.Store(name, struct{}{})
+			case !reflect.DeepEqual(sp.config, cfg):
+				err := sp.reload(cfg)
+				if err != nil {
+					m.logger.Error("error reloading scrape pool", "err", err, "scrape_pool", name)
+					failed.Store(true)
+				}
+				fallthrough
+			case ok:
+				if l, ok := m.scrapeFailureLoggers[cfg.ScrapeFailureLogFile]; ok {
+					sp.SetScrapeFailureLogger(l)
+				} else {
+					sp.logger.Error("No logger found. This is a bug in Prometheus that should be reported upstream.", "scrape_pool", name)
+				}
 			}
-		}
+		}(poolName, pool, cfg, ok)
 	}
+	wg.Wait()
+
+	toDelete.Range(func(name, _ any) bool {
+		delete(m.scrapePools, name.(string))
+		return true
+	})
 
-	if failed {
+	if failed.Load() {
 		return errors.New("failed to apply the new configuration")
 	}
 	return nil
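
The change above is a fan-out/fan-in: one goroutine per scrape pool, a
sync.WaitGroup to join them, an atomic boolean to record any failure, and a
sync.Map to collect the names of pools to drop. Deletion is deferred until
after wg.Wait() because deleting from m.scrapePools inside a goroutine would
race with the range loop still iterating the map. The sketch below reduces
that pattern to a standalone program; reloadPool and the pool names are
hypothetical stand-ins, and it uses the standard library's sync/atomic
(Go 1.19+) where the patch uses go.uber.org/atomic.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// reloadPool is a hypothetical stand-in for scrapePool.reload.
func reloadPool(name string) error { return nil }

func main() {
	// pools stands in for Manager.scrapePools; the bool marks whether the
	// pool still exists in the new configuration.
	pools := map[string]bool{"pool-a": true, "pool-b": true, "pool-c": false}

	var (
		failed   atomic.Bool    // set by any goroutine whose reload fails
		wg       sync.WaitGroup // joins all reload goroutines
		toDelete sync.Map       // pool names to drop, applied after wg.Wait()
	)

	for name, inConfig := range pools {
		wg.Add(1)
		// Pass loop variables as arguments so each goroutine gets its own
		// copies (required before Go 1.22, still clearer afterwards).
		go func(name string, inConfig bool) {
			defer wg.Done()
			if !inConfig {
				// Deleting from pools here would race with the range loop,
				// so only record the name; the delete happens after the join.
				toDelete.Store(name, struct{}{})
				return
			}
			if err := reloadPool(name); err != nil {
				failed.Store(true)
			}
		}(name, inConfig)
	}
	wg.Wait()

	// Safe now: no goroutine touches the map any more.
	toDelete.Range(func(name, _ any) bool {
		delete(pools, name.(string))
		return true
	})

	fmt.Printf("pools left: %d, any reload failed: %v\n", len(pools), failed.Load())
}

Passing poolName, pool, cfg, and ok as goroutine arguments in the patch,
rather than closing over the loop variables, follows the same idea: each
goroutine gets its own copies, which matters on Go versions before 1.22
where loop variables were reused across iterations.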