From 4866adcb71577f81b018f5f807c41bf5878ea1ec Mon Sep 17 00:00:00 2001 From: Robert Clark Date: Tue, 7 Feb 2017 10:46:51 -0600 Subject: [PATCH 1/5] Add new collector for InfiniBand statistics Add new metrics for the InfiniBand network protocol including the amount of packets sent and received, the number of times the link has been downed and how many times the link has recovered from an error state. Signed-Off-By: Robert Clark --- collector/fixtures/e2e-output.txt | 32 ++++ .../mlx4_0/ports/1/counters/link_downed | 1 + .../ports/1/counters/link_error_recovery | 1 + .../ports/1/counters/multicast_rcv_packets | 1 + .../ports/1/counters/multicast_xmit_packets | 1 + .../mlx4_0/ports/1/counters/port_rcv_data | 1 + .../mlx4_0/ports/1/counters/port_xmit_data | 1 + .../ports/1/counters/unicast_rcv_packets | 1 + .../ports/1/counters/unicast_xmit_packets | 1 + .../mlx4_0/ports/2/counters/link_downed | 1 + .../ports/2/counters/link_error_recovery | 1 + .../ports/2/counters/multicast_rcv_packets | 1 + .../ports/2/counters/multicast_xmit_packets | 1 + .../mlx4_0/ports/2/counters/port_rcv_data | 1 + .../mlx4_0/ports/2/counters/port_xmit_data | 1 + .../ports/2/counters/unicast_rcv_packets | 1 + .../ports/2/counters/unicast_xmit_packets | 1 + collector/infiniband_linux.go | 177 ++++++++++++++++++ end-to-end-test.sh | 1 + 19 files changed, 226 insertions(+) create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_downed create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_rcv_packets create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_xmit_packets create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data create mode 100644 
collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_xmit_packets create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_downed create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_rcv_packets create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_xmit_packets create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_rcv_packets create mode 100644 collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_xmit_packets create mode 100644 collector/infiniband_linux.go diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 9a08a561..87c4b012 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -654,6 +654,38 @@ node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp2"} 84 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp3"} 84 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp4"} 84 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp5"} 84 +# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down +# TYPE node_infiniband_link_downed_total counter +node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 +node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state +# TYPE 
node_infiniband_link_error_recovery_total counter +node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 +node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) +# TYPE node_infiniband_multicast_packets_received_total counter +node_infiniband_multicast_packets_received_total{device="mlx4_0",port="1"} 93 +node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_multicast_packets_transmitted_total Number of multicast packets transmitted (including errors) +# TYPE node_infiniband_multicast_packets_transmitted_total counter +node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 +node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_port_data_received_bytes Number of data octets received on all links +# TYPE node_infiniband_port_data_received_bytes counter +node_infiniband_port_data_received_bytes{device="mlx4_0",port="1"} 4.631917e+06 +node_infiniband_port_data_received_bytes{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_port_data_transmitted_bytes Number of data octets transmitted on all links +# TYPE node_infiniband_port_data_transmitted_bytes counter +node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="1"} 3.73344e+06 +node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) +# TYPE node_infiniband_unicast_packets_received_total counter +node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148 +node_infiniband_unicast_packets_received_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_unicast_packets_transmitted_total Number of unicast packets transmitted (including errors) +# TYPE node_infiniband_unicast_packets_transmitted_total 
counter +node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="1"} 61239 +node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 # HELP node_intr Total number of interrupts serviced. # TYPE node_intr counter node_intr 8.885917e+06 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_downed b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_downed new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_downed @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_rcv_packets new file mode 100644 index 00000000..c67f579c --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_rcv_packets @@ -0,0 +1 @@ +93 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_xmit_packets new file mode 100644 index 00000000..b6a7d89c --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_xmit_packets @@ -0,0 +1 @@ +16 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data new file mode 100644 index 00000000..496ea27d --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data @@ -0,0 +1 @@ 
+4631917 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data new file mode 100644 index 00000000..85ea8ebf --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data @@ -0,0 +1 @@ +3733440 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets new file mode 100644 index 00000000..2406651b --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets @@ -0,0 +1 @@ +61148 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_xmit_packets new file mode 100644 index 00000000..6279bd6a --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_xmit_packets @@ -0,0 +1 @@ +61239 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_downed b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_downed new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_downed @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_rcv_packets new file mode 100644 index 00000000..573541ac --- /dev/null +++ 
b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_rcv_packets @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_xmit_packets new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_xmit_packets @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_rcv_packets new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_rcv_packets @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_xmit_packets new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_xmit_packets @@ -0,0 +1 @@ +0 diff --git a/collector/infiniband_linux.go b/collector/infiniband_linux.go new file mode 100644 index 00000000..34ce4dab --- /dev/null +++ 
b/collector/infiniband_linux.go @@ -0,0 +1,177 @@ +// Copyright 2017 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux +// +build !noinfiniband + +package collector + +import ( + "errors" + "path/filepath" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" +) + +const infinibandPath = "class/infiniband" + +var ( + errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected") + errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected") +) + +type infinibandCollector struct { + metricDescs map[string]*prometheus.Desc + counters map[string]infinibandMetric +} + +type infinibandMetric struct { + File string + Help string +} + +func init() { + Factories["infiniband"] = NewInfiniBandCollector +} + +func NewInfiniBandCollector() (Collector, error) { + var i infinibandCollector + + // Maps each exported metric name to the sysfs counter file it is read from and its help text. NOTE(review): the kernel sysfs ABI docs appear to define port_rcv_data/port_xmit_data in units of 4 octets; confirm whether the *_bytes metrics need a x4 scale. 
+ i.counters = map[string]infinibandMetric{ + "link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"}, + "link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"}, + "multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"}, + "multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"}, + "port_data_received_bytes": {"port_rcv_data", "Number of data octets received on all links"}, + "port_data_transmitted_bytes": {"port_xmit_data", "Number of data octets transmitted on all links"}, + "unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"}, + "unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"}, + } + + subsystem := "infiniband" + i.metricDescs = make(map[string]*prometheus.Desc) + + for metricName, infinibandMetric := range i.counters { + i.metricDescs[metricName] = prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, metricName), + infinibandMetric.Help, + []string{"device", "port"}, + nil, + ) + } + + return &i, nil +} + +// infinibandDevices retrieves a list of InfiniBand devices. +func infinibandDevices(infinibandPath string) ([]string, error) { + devices, err := filepath.Glob(filepath.Join(infinibandPath, "/*")) + if err != nil { + return nil, err + } + + if len(devices) < 1 { + log.Debugf("Unable to detect InfiniBand devices") + err = errInfinibandNoDevicesFound + return nil, err + } + + // Extract just the filenames which equate to the device names. + for i, device := range devices { + devices[i] = filepath.Base(device) + } + + return devices, nil +} + +// infinibandPorts retrieves a list of ports for the given InfiniBand device. 
+func infinibandPorts(infinibandPath, device string) ([]string, error) { + ports, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*")) + if err != nil { + return nil, err + } + + if len(ports) < 1 { + log.Debugf("Unable to detect ports for %s", device) + err = errInfinibandNoPortsFound + return nil, err + } + + // Extract just the filenames which equate to the port numbers. + for i, port := range ports { + ports[i] = filepath.Base(port) + } + + return ports, nil +} + +func readMetric(directory, metricFile string) (uint64, error) { + metric, err := readUintFromFile(filepath.Join(directory, metricFile)) + if err != nil { + log.Debugf("Error reading %q file", metricFile) + return 0, err + } + + return metric, nil +} + +func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) (err error) { + devices, err := infinibandDevices(sysFilePath(infinibandPath)) + + // If no devices are found, InfiniBand is likely not installed and the collector is skipped + // without reporting an error; any other failure while listing devices is returned. + switch err { + case nil: + case errInfinibandNoDevicesFound: + return nil + default: + return err + } + + for _, device := range devices { + ports, err := infinibandPorts(sysFilePath(infinibandPath), device) + + // If no ports are found for the specified device, skip to the next device; any other error aborts the update. + switch err { + case nil: + case errInfinibandNoPortsFound: + continue + default: + return err + } + + for _, port := range ports { + portFiles := sysFilePath(filepath.Join(infinibandPath, device, "ports", port)) + + // Emit one counter sample per configured metric; an unreadable counter file aborts the update. 
+ for metricName, infinibandMetric := range c.counters { + metric, err := readMetric(filepath.Join(portFiles, "counters"), infinibandMetric.File) + if err != nil { + return err + } + + ch <- prometheus.MustNewConstMetric( + c.metricDescs[metricName], + prometheus.CounterValue, + float64(metric), + device, + port, + ) + } + } + } + + return nil +} diff --git a/end-to-end-test.sh b/end-to-end-test.sh index 2e92dad5..718e2590 100755 --- a/end-to-end-test.sh +++ b/end-to-end-test.sh @@ -10,6 +10,7 @@ collectors=$(cat << COLLECTORS entropy filefd hwmon + infiniband ksmd loadavg mdadm From 36f81282b763c551990d5af5676aeb804003a984 Mon Sep 17 00:00:00 2001 From: Robert Clark Date: Tue, 7 Feb 2017 10:48:15 -0600 Subject: [PATCH 2/5] Add unit tests for InfiniBand collector Signed-Off-By: Robert Clark --- collector/infiniband_linux_test.go | 40 ++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 collector/infiniband_linux_test.go diff --git a/collector/infiniband_linux_test.go b/collector/infiniband_linux_test.go new file mode 100644 index 00000000..991102d6 --- /dev/null +++ b/collector/infiniband_linux_test.go @@ -0,0 +1,40 @@ +// Copyright 2017 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package collector + +import ( + "testing" +) + +func TestInfiniBandDevices(t *testing.T) { + devices, err := infinibandDevices("fixtures/sys/class/infiniband") + if err != nil { + t.Fatal(err) + } + + if l := len(devices); l != 1 { + t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l) + } +} + +func TestInfiniBandPorts(t *testing.T) { + ports, err := infinibandPorts("fixtures/sys/class/infiniband", "mlx4_0") + if err != nil { + t.Fatal(err) + } + + if l := len(ports); l != 2 { + t.Fatalf("Retrieved an unexpected number of InfiniBand ports: %d", l) + } +} From b0c9133cba860aa6f09304cf630a1271bcf51090 Mon Sep 17 00:00:00 2001 From: Robert Clark Date: Tue, 7 Feb 2017 10:49:05 -0600 Subject: [PATCH 3/5] Enable InfiniBand by default Signed-Off-By: Robert Clark --- node_exporter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node_exporter.go b/node_exporter.go index b6a7f0e1..3e87bc43 100644 --- a/node_exporter.go +++ b/node_exporter.go @@ -32,7 +32,7 @@ import ( ) const ( - defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs" + defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,infiniband,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs" ) var ( From b2d2b69af6da76cb177fed11ffc39f3834cc50fc Mon Sep 17 00:00:00 2001 From: Robert Clark Date: Tue, 7 Feb 2017 10:49:53 -0600 Subject: [PATCH 4/5] Update README to include InfiniBand collector Signed-Off-By: Robert Clark --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 33ab0a6f..3c179886 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ entropy | Exposes available entropy. | Linux filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux filesystem | Exposes filesystem statistics, such as disk space used. 
| Darwin, Dragonfly, FreeBSD, Linux, OpenBSD hwmon | Expose hardware monitoring and sensor data from `/sys/class/hwmon/`. | Linux +infiniband | Exposes network statistics specific to InfiniBand configurations. | Linux loadavg | Exposes load average. | Darwin, Dragonfly, FreeBSD, Linux, NetBSD, OpenBSD, Solaris mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if no `/proc/mdstat` present). | Linux meminfo | Exposes memory statistics. | Darwin, Dragonfly, FreeBSD, Linux From f809bfdde62b40dd98846cdc5eae33c200d25ec1 Mon Sep 17 00:00:00 2001 From: Robert Clark Date: Tue, 7 Feb 2017 10:50:50 -0600 Subject: [PATCH 5/5] Update authors after InfiniBand commit Signed-Off-By: Robert Clark --- AUTHORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.md b/AUTHORS.md index 030d6171..8a42b93f 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -32,6 +32,7 @@ The following individuals have contributed code to this repository * Ken Herner * Matt Layher * Matthias Rampke +* Robert Clark * Siavash Safi * Stephen Shirley * Steve Durrheimer