sysfs metrics

Add a new disabled-by-default collector that reads /sys/block/dm-* to discover Device Mapper multipath devices and expose path health metrics.

Multipath devices are identified by checking that dm/uuid starts with "mpath-", which distinguishes them from LVM or other DM device types.

The path state is reported as-is from /sys/block/<dev>/device/state, supporting both SCSI devices (running, offline, blocked, etc.) and NVMe devices (live, connecting, dead, etc.) without hardcoding a fixed set of states.

All device-level metrics include both the DM friendly name (device) and the kernel block device name (sysfs_name, e.g. dm-0) to enable direct correlation with node_disk_* I/O metrics without recording rules.

No special permissions are required — the collector reads only world-readable sysfs attributes.

Exposed metrics:
- node_dmmultipath_device_info
- node_dmmultipath_device_active
- node_dmmultipath_device_size_bytes
- node_dmmultipath_device_paths
- node_dmmultipath_device_paths_active
- node_dmmultipath_device_paths_failed
- node_dmmultipath_path_state

Signed-off-by: Shirly Radco <sradco@redhat.com>
Co-authored-by: AI Assistant <noreply@cursor.com>

pull/3581/head
parent
1a4cac6cc1
commit
618342bf17
@ -0,0 +1,143 @@ |
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//go:build !nodmmultipath
|
||||
|
||||
package collector |
||||
|
||||
import ( |
||||
"errors" |
||||
"fmt" |
||||
"log/slog" |
||||
"os" |
||||
|
||||
"github.com/prometheus/client_golang/prometheus" |
||||
"github.com/prometheus/procfs/blockdevice" |
||||
) |
||||
|
||||
// isPathActive reports whether state names a healthy, usable path.
//
// Exactly two values qualify: "running" (SCSI devices) and "live" (NVMe
// devices). The comparison is exact and case-sensitive; any other value,
// including the empty string, is treated as not active.
func isPathActive(state string) bool {
	switch state {
	case "running", "live":
		return true
	default:
		return false
	}
}
||||
|
||||
type dmMultipathCollector struct { |
||||
fs blockdevice.FS |
||||
logger *slog.Logger |
||||
|
||||
deviceInfo *prometheus.Desc |
||||
deviceActive *prometheus.Desc |
||||
deviceSizeBytes *prometheus.Desc |
||||
devicePaths *prometheus.Desc |
||||
devicePathsActive *prometheus.Desc |
||||
devicePathsFailed *prometheus.Desc |
||||
pathState *prometheus.Desc |
||||
} |
||||
|
||||
func init() { |
||||
registerCollector("dmmultipath", defaultDisabled, NewDMMultipathCollector) |
||||
} |
||||
|
||||
// NewDMMultipathCollector returns a new Collector exposing Device Mapper
|
||||
// multipath device metrics from /sys/block/dm-*.
|
||||
func NewDMMultipathCollector(logger *slog.Logger) (Collector, error) { |
||||
const subsystem = "dmmultipath" |
||||
|
||||
fs, err := blockdevice.NewFS(*procPath, *sysPath) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to open sysfs: %w", err) |
||||
} |
||||
|
||||
deviceLabels := []string{"device", "sysfs_name"} |
||||
|
||||
return &dmMultipathCollector{ |
||||
fs: fs, |
||||
logger: logger, |
||||
deviceInfo: prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, subsystem, "device_info"), |
||||
"Non-numeric information about a DM-multipath device.", |
||||
[]string{"device", "sysfs_name", "uuid"}, nil, |
||||
), |
||||
deviceActive: prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, subsystem, "device_active"), |
||||
"Whether the multipath device-mapper device is active (1) or suspended (0).", |
||||
deviceLabels, nil, |
||||
), |
||||
deviceSizeBytes: prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, subsystem, "device_size_bytes"), |
||||
"Size of the multipath device in bytes, read from /sys/block/<dm>/size.", |
||||
deviceLabels, nil, |
||||
), |
||||
devicePaths: prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, subsystem, "device_paths"), |
||||
"Number of paths for a multipath device.", |
||||
deviceLabels, nil, |
||||
), |
||||
devicePathsActive: prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, subsystem, "device_paths_active"), |
||||
"Number of paths in active state (SCSI running or NVMe live) for a multipath device.", |
||||
deviceLabels, nil, |
||||
), |
||||
devicePathsFailed: prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, subsystem, "device_paths_failed"), |
||||
"Number of paths not in active state for a multipath device.", |
||||
deviceLabels, nil, |
||||
), |
||||
pathState: prometheus.NewDesc( |
||||
prometheus.BuildFQName(namespace, subsystem, "path_state"), |
||||
"Reports the underlying device state for a multipath path, as read from /sys/block/<dev>/device/state.", |
||||
[]string{"device", "path", "state"}, nil, |
||||
), |
||||
}, nil |
||||
} |
||||
|
||||
func (c *dmMultipathCollector) Update(ch chan<- prometheus.Metric) error { |
||||
devices, err := c.fs.DMMultipathDevices() |
||||
if err != nil { |
||||
if errors.Is(err, os.ErrNotExist) || errors.Is(err, os.ErrPermission) { |
||||
c.logger.Debug("Could not read DM-multipath devices", "err", err) |
||||
return ErrNoData |
||||
} |
||||
return fmt.Errorf("failed to scan DM-multipath devices: %w", err) |
||||
} |
||||
|
||||
for _, dev := range devices { |
||||
ch <- prometheus.MustNewConstMetric(c.deviceInfo, prometheus.GaugeValue, 1, |
||||
dev.Name, dev.SysfsName, dev.UUID) |
||||
|
||||
active := 0.0 |
||||
if !dev.Suspended { |
||||
active = 1.0 |
||||
} |
||||
ch <- prometheus.MustNewConstMetric(c.deviceActive, prometheus.GaugeValue, active, dev.Name, dev.SysfsName) |
||||
ch <- prometheus.MustNewConstMetric(c.deviceSizeBytes, prometheus.GaugeValue, float64(dev.SizeBytes), dev.Name, dev.SysfsName) |
||||
|
||||
var activePaths, failedPaths float64 |
||||
for _, p := range dev.Paths { |
||||
if isPathActive(p.State) { |
||||
activePaths++ |
||||
} else { |
||||
failedPaths++ |
||||
} |
||||
|
||||
ch <- prometheus.MustNewConstMetric(c.pathState, prometheus.GaugeValue, 1, |
||||
dev.Name, p.Device, p.State) |
||||
} |
||||
|
||||
ch <- prometheus.MustNewConstMetric(c.devicePaths, prometheus.GaugeValue, float64(len(dev.Paths)), dev.Name, dev.SysfsName) |
||||
ch <- prometheus.MustNewConstMetric(c.devicePathsActive, prometheus.GaugeValue, activePaths, dev.Name, dev.SysfsName) |
||||
ch <- prometheus.MustNewConstMetric(c.devicePathsFailed, prometheus.GaugeValue, failedPaths, dev.Name, dev.SysfsName) |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
@ -0,0 +1,151 @@ |
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//go:build !nodmmultipath
|
||||
|
||||
package collector |
||||
|
||||
import ( |
||||
"io" |
||||
"log/slog" |
||||
"strings" |
||||
"testing" |
||||
|
||||
"github.com/prometheus/client_golang/prometheus" |
||||
dto "github.com/prometheus/client_model/go" |
||||
) |
||||
|
||||
func TestDMMultipathMetrics(t *testing.T) { |
||||
*procPath = "fixtures/proc" |
||||
*sysPath = "fixtures/sys" |
||||
|
||||
logger := slog.New(slog.NewTextHandler(io.Discard, nil)) |
||||
coll, err := NewDMMultipathCollector(logger) |
||||
if err != nil { |
||||
t.Fatal(err) |
||||
} |
||||
|
||||
c := coll.(*dmMultipathCollector) |
||||
|
||||
ch := make(chan prometheus.Metric, 200) |
||||
if err := c.Update(ch); err != nil { |
||||
t.Fatal(err) |
||||
} |
||||
close(ch) |
||||
|
||||
metrics := make(map[string][]*dto.Metric) |
||||
for m := range ch { |
||||
d := &dto.Metric{} |
||||
if err := m.Write(d); err != nil { |
||||
t.Fatal(err) |
||||
} |
||||
desc := m.Desc().String() |
||||
metrics[desc] = append(metrics[desc], d) |
||||
} |
||||
|
||||
assertGaugeValue(t, metrics, "device_active", labelMap{"device": "mpathA", "sysfs_name": "dm-5"}, 1) |
||||
assertGaugeValue(t, metrics, "device_active", labelMap{"device": "mpathB", "sysfs_name": "dm-6"}, 1) |
||||
assertGaugeValue(t, metrics, "device_size_bytes", labelMap{"device": "mpathA", "sysfs_name": "dm-5"}, 53687091200) |
||||
assertGaugeValue(t, metrics, `device_paths"`, labelMap{"device": "mpathA", "sysfs_name": "dm-5"}, 4) |
||||
assertGaugeValue(t, metrics, `device_paths"`, labelMap{"device": "mpathB", "sysfs_name": "dm-6"}, 2) |
||||
|
||||
// mpathA: sdi, sdj, sdk are running; sdl is offline → 3 active, 1 failed.
|
||||
assertGaugeValue(t, metrics, "device_paths_active", labelMap{"device": "mpathA", "sysfs_name": "dm-5"}, 3) |
||||
assertGaugeValue(t, metrics, "device_paths_failed", labelMap{"device": "mpathA", "sysfs_name": "dm-5"}, 1) |
||||
|
||||
// mpathB: sdm, sdn are both running → 2 active, 0 failed.
|
||||
assertGaugeValue(t, metrics, "device_paths_active", labelMap{"device": "mpathB", "sysfs_name": "dm-6"}, 2) |
||||
assertGaugeValue(t, metrics, "device_paths_failed", labelMap{"device": "mpathB", "sysfs_name": "dm-6"}, 0) |
||||
|
||||
assertGaugeValue(t, metrics, "path_state", |
||||
labelMap{"device": "mpathA", "path": "sdi", "state": "running"}, 1) |
||||
assertGaugeValue(t, metrics, "path_state", |
||||
labelMap{"device": "mpathA", "path": "sdl", "state": "offline"}, 1) |
||||
} |
||||
|
||||
func TestDMMultipathNoDevices(t *testing.T) { |
||||
*procPath = "fixtures/proc" |
||||
*sysPath = t.TempDir() |
||||
|
||||
logger := slog.New(slog.NewTextHandler(io.Discard, nil)) |
||||
coll, err := NewDMMultipathCollector(logger) |
||||
if err != nil { |
||||
t.Fatal(err) |
||||
} |
||||
|
||||
c := coll.(*dmMultipathCollector) |
||||
|
||||
ch := make(chan prometheus.Metric, 200) |
||||
err = c.Update(ch) |
||||
close(ch) |
||||
|
||||
if err != ErrNoData { |
||||
t.Fatalf("expected ErrNoData, got %v", err) |
||||
} |
||||
} |
||||
|
||||
func TestIsPathActive(t *testing.T) { |
||||
tests := []struct { |
||||
state string |
||||
active bool |
||||
}{ |
||||
{"running", true}, |
||||
{"live", true}, |
||||
{"offline", false}, |
||||
{"blocked", false}, |
||||
{"transport-offline", false}, |
||||
{"dead", false}, |
||||
{"unknown", false}, |
||||
{"", false}, |
||||
} |
||||
for _, tc := range tests { |
||||
got := isPathActive(tc.state) |
||||
if got != tc.active { |
||||
t.Errorf("isPathActive(%q) = %v, want %v", tc.state, got, tc.active) |
||||
} |
||||
} |
||||
} |
||||
|
||||
// labelMap maps a label name to the value a metric is expected to carry for
// that label.
type labelMap map[string]string
||||
|
||||
func assertGaugeValue(t *testing.T, metrics map[string][]*dto.Metric, metricSubstring string, labels labelMap, expected float64) { |
||||
t.Helper() |
||||
for desc, ms := range metrics { |
||||
if !strings.Contains(desc, metricSubstring) { |
||||
continue |
||||
} |
||||
for _, m := range ms { |
||||
if matchLabels(m.GetLabel(), labels) { |
||||
got := m.GetGauge().GetValue() |
||||
if got != expected { |
||||
t.Errorf("%s%v: got %v, want %v", metricSubstring, labels, got, expected) |
||||
} |
||||
return |
||||
} |
||||
} |
||||
} |
||||
t.Errorf("metric %s%v not found", metricSubstring, labels) |
||||
} |
||||
|
||||
func matchLabels(pairs []*dto.LabelPair, want labelMap) bool { |
||||
if want == nil { |
||||
return len(pairs) == 0 |
||||
} |
||||
found := 0 |
||||
for _, lp := range pairs { |
||||
if v, ok := want[lp.GetName()]; ok && v == lp.GetValue() { |
||||
found++ |
||||
} |
||||
} |
||||
return found == len(want) |
||||
} |
||||
Loading…
Reference in new issue