Add perf exporter (#1274)
Signed-off-by: Daniel Hodges <hodges.daniel.scott@gmail.com>pull/1337/head
parent
0c6b90be4e
commit
7882009870
@ -0,0 +1,567 @@ |
||||
// Copyright 2019 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector |
||||
|
||||
import ( |
||||
"fmt" |
||||
"runtime" |
||||
|
||||
perf "github.com/hodgesds/perf-utils" |
||||
"github.com/prometheus/client_golang/prometheus" |
||||
) |
||||
|
||||
// perfSubsystem is the metric subsystem name used for all perf metrics.
const perfSubsystem = "perf"
||||
|
||||
// init registers the perf collector. It is registered as disabled by
// default because perf support depends on kernel configuration (see
// /proc/sys/kernel/perf_event_paranoid) and may not work everywhere.
func init() {
	registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector)
}
||||
|
||||
// perfCollector is a Collector that uses the Linux perf subsystem to
// collect metrics. It uses perf_event_open(2) and ioctls for profiling.
// Because the perf subsystem is highly dependent on kernel configuration
// and settings, not all profiler values may be exposed on the target
// system at any given time.
type perfCollector struct {
	// perfHwProfilers maps a CPU number to its hardware profiler
	// (CPU cycles, instructions, branches, cache refs/misses).
	perfHwProfilers map[int]perf.HardwareProfiler
	// perfSwProfilers maps a CPU number to its software profiler
	// (page faults, context switches, CPU migrations).
	perfSwProfilers map[int]perf.SoftwareProfiler
	// perfCacheProfilers maps a CPU number to its cache profiler
	// (L1/LL/TLB/BPU hits and misses).
	perfCacheProfilers map[int]perf.CacheProfiler
	// desc maps a metric name to its Prometheus descriptor.
	desc map[string]*prometheus.Desc
}
||||
|
||||
// NewPerfCollector returns a new perf based collector, it creates a profiler
|
||||
// per CPU.
|
||||
func NewPerfCollector() (Collector, error) { |
||||
collector := &perfCollector{ |
||||
perfHwProfilers: map[int]perf.HardwareProfiler{}, |
||||
perfSwProfilers: map[int]perf.SoftwareProfiler{}, |
||||
perfCacheProfilers: map[int]perf.CacheProfiler{}, |
||||
} |
||||
ncpus := runtime.NumCPU() |
||||
for i := 0; i < ncpus; i++ { |
||||
// Use -1 to profile all processes on the CPU, see:
|
||||
// man perf_event_open
|
||||
collector.perfHwProfilers[i] = perf.NewHardwareProfiler(-1, i) |
||||
if err := collector.perfHwProfilers[i].Start(); err != nil { |
||||
return collector, err |
||||
} |
||||
collector.perfSwProfilers[i] = perf.NewSoftwareProfiler(-1, i) |
||||
if err := collector.perfSwProfilers[i].Start(); err != nil { |
||||
return collector, err |
||||
} |
||||
collector.perfCacheProfilers[i] = perf.NewCacheProfiler(-1, i) |
||||
if err := collector.perfCacheProfilers[i].Start(); err != nil { |
||||
return collector, err |
||||
} |
||||
} |
||||
collector.desc = map[string]*prometheus.Desc{ |
||||
"cpucycles_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cpucycles_total", |
||||
), |
||||
"Number of CPU cycles (frequency scaled)", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"instructions_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"instructions_total", |
||||
), |
||||
"Number of CPU instructions", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"branch_instructions_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"branch_instructions_total", |
||||
), |
||||
"Number of CPU branch instructions", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"branch_misses_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"branch_misses_total", |
||||
), |
||||
"Number of CPU branch misses", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_refs_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_refs_total", |
||||
), |
||||
"Number of cache references (non frequency scaled)", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_misses_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_misses_total", |
||||
), |
||||
"Number of cache misses", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"ref_cpucycles_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"ref_cpucycles_total", |
||||
), |
||||
"Number of CPU cycles", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"page_faults_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"page_faults_total", |
||||
), |
||||
"Number of page faults", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"context_switches_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"context_switches_total", |
||||
), |
||||
"Number of context switches", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cpu_migrations_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cpu_migrations_total", |
||||
), |
||||
"Number of CPU process migrations", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"minor_faults_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"minor_faults_total", |
||||
), |
||||
"Number of minor page faults", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"major_faults_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"major_faults_total", |
||||
), |
||||
"Number of major page faults", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_l1d_read_hits_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_l1d_read_hits_total", |
||||
), |
||||
"Number L1 data cache read hits", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_l1d_read_misses_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_l1d_read_misses_total", |
||||
), |
||||
"Number L1 data cache read misses", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_l1d_write_hits_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_l1d_write_hits_total", |
||||
), |
||||
"Number L1 data cache write hits", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_l1_instr_read_misses_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_l1_instr_read_misses_total", |
||||
), |
||||
"Number instruction L1 instruction read misses", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_tlb_instr_read_hits_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_tlb_instr_read_hits_total", |
||||
), |
||||
"Number instruction TLB read hits", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_tlb_instr_read_misses_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_tlb_instr_read_misses_total", |
||||
), |
||||
"Number instruction TLB read misses", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_ll_read_hits_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_ll_read_hits_total", |
||||
), |
||||
"Number last level read hits", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_ll_read_misses_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_ll_read_misses_total", |
||||
), |
||||
"Number last level read misses", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_ll_write_hits_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_ll_write_hits_total", |
||||
), |
||||
"Number last level write hits", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_ll_write_misses_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_ll_write_misses_total", |
||||
), |
||||
"Number last level write misses", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_bpu_read_hits_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_bpu_read_hits_total", |
||||
), |
||||
"Number BPU read hits", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
"cache_bpu_read_misses_total": prometheus.NewDesc( |
||||
prometheus.BuildFQName( |
||||
namespace, |
||||
perfSubsystem, |
||||
"cache_bpu_read_misses_total", |
||||
), |
||||
"Number BPU read misses", |
||||
[]string{"cpu"}, |
||||
nil, |
||||
), |
||||
} |
||||
|
||||
return collector, nil |
||||
} |
||||
|
||||
// Update implements the Collector interface and will collect metrics per CPU.
|
||||
func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { |
||||
if err := c.updateHardwareStats(ch); err != nil { |
||||
return err |
||||
} |
||||
|
||||
if err := c.updateSoftwareStats(ch); err != nil { |
||||
return err |
||||
} |
||||
|
||||
if err := c.updateCacheStats(ch); err != nil { |
||||
return err |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { |
||||
for cpu, profiler := range c.perfHwProfilers { |
||||
cpuStr := fmt.Sprintf("%d", cpu) |
||||
hwProfile, err := profiler.Profile() |
||||
if err != nil { |
||||
return err |
||||
} |
||||
if hwProfile == nil { |
||||
continue |
||||
} |
||||
|
||||
if hwProfile.CPUCycles != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cpucycles_total"], |
||||
prometheus.CounterValue, float64(*hwProfile.CPUCycles), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if hwProfile.Instructions != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["instructions_total"], |
||||
prometheus.CounterValue, float64(*hwProfile.Instructions), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if hwProfile.BranchInstr != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["branch_instructions_total"], |
||||
prometheus.CounterValue, float64(*hwProfile.BranchInstr), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if hwProfile.BranchMisses != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["branch_misses_total"], |
||||
prometheus.CounterValue, float64(*hwProfile.BranchMisses), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if hwProfile.CacheRefs != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_refs_total"], |
||||
prometheus.CounterValue, float64(*hwProfile.CacheRefs), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if hwProfile.CacheMisses != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_misses_total"], |
||||
prometheus.CounterValue, float64(*hwProfile.CacheMisses), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if hwProfile.RefCPUCycles != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["ref_cpucycles_total"], |
||||
prometheus.CounterValue, float64(*hwProfile.RefCPUCycles), |
||||
cpuStr, |
||||
) |
||||
} |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { |
||||
for cpu, profiler := range c.perfSwProfilers { |
||||
cpuStr := fmt.Sprintf("%d", cpu) |
||||
swProfile, err := profiler.Profile() |
||||
if err != nil { |
||||
return err |
||||
} |
||||
if swProfile == nil { |
||||
continue |
||||
} |
||||
|
||||
if swProfile.PageFaults != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["page_faults_total"], |
||||
prometheus.CounterValue, float64(*swProfile.PageFaults), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if swProfile.ContextSwitches != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["context_switches_total"], |
||||
prometheus.CounterValue, float64(*swProfile.ContextSwitches), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if swProfile.CPUMigrations != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cpu_migrations_total"], |
||||
prometheus.CounterValue, float64(*swProfile.CPUMigrations), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if swProfile.MinorPageFaults != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["minor_faults_total"], |
||||
prometheus.CounterValue, float64(*swProfile.MinorPageFaults), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if swProfile.MajorPageFaults != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["major_faults_total"], |
||||
prometheus.CounterValue, float64(*swProfile.MajorPageFaults), |
||||
cpuStr, |
||||
) |
||||
} |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { |
||||
for cpu, profiler := range c.perfCacheProfilers { |
||||
cpuStr := fmt.Sprintf("%d", cpu) |
||||
cacheProfile, err := profiler.Profile() |
||||
if err != nil { |
||||
return err |
||||
} |
||||
if cacheProfile == nil { |
||||
continue |
||||
} |
||||
|
||||
if cacheProfile.L1DataReadHit != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_l1d_read_hits_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.L1DataReadHit), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.L1DataReadMiss != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_l1d_read_misses_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.L1DataReadMiss), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.L1DataWriteHit != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_l1d_write_hits_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.L1DataWriteHit), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.L1InstrReadMiss != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_l1_instr_read_misses_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.L1InstrReadMiss), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.InstrTLBReadHit != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_tlb_instr_read_hits_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.InstrTLBReadHit), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.InstrTLBReadMiss != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_tlb_instr_read_misses_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.InstrTLBReadMiss), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.LastLevelReadHit != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_ll_read_hits_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.LastLevelReadHit), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.LastLevelReadMiss != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_ll_read_misses_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.LastLevelReadMiss), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.LastLevelWriteHit != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_ll_write_hits_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.LastLevelWriteHit), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.LastLevelWriteMiss != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_ll_write_misses_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.LastLevelWriteMiss), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.BPUReadHit != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_bpu_read_hits_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.BPUReadHit), |
||||
cpuStr, |
||||
) |
||||
} |
||||
|
||||
if cacheProfile.BPUReadMiss != nil { |
||||
ch <- prometheus.MustNewConstMetric( |
||||
c.desc["cache_bpu_read_misses_total"], |
||||
prometheus.CounterValue, float64(*cacheProfile.BPUReadMiss), |
||||
cpuStr, |
||||
) |
||||
} |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
@ -0,0 +1,55 @@ |
||||
// Copyright 2019 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// +build !noprocesses
|
||||
|
||||
package collector |
||||
|
||||
import ( |
||||
"io/ioutil" |
||||
"strconv" |
||||
"strings" |
||||
"testing" |
||||
|
||||
"github.com/prometheus/client_golang/prometheus" |
||||
) |
||||
|
||||
func TestPerfCollector(t *testing.T) { |
||||
paranoidBytes, err := ioutil.ReadFile("/proc/sys/kernel/perf_event_paranoid") |
||||
if err != nil { |
||||
t.Skip("Procfs not mounted, skipping perf tests") |
||||
} |
||||
paranoidStr := strings.Replace(string(paranoidBytes), "\n", "", -1) |
||||
paranoid, err := strconv.Atoi(paranoidStr) |
||||
if err != nil { |
||||
t.Fatalf("Expected perf_event_paranoid to be an int, got: %s", paranoidStr) |
||||
} |
||||
if paranoid >= 1 { |
||||
t.Skip("Skipping perf tests, set perf_event_paranoid to 0") |
||||
} |
||||
collector, err := NewPerfCollector() |
||||
if err != nil { |
||||
t.Fatal(err) |
||||
} |
||||
|
||||
// Setup background goroutine to capture metrics.
|
||||
metrics := make(chan prometheus.Metric) |
||||
defer close(metrics) |
||||
go func() { |
||||
for range metrics { |
||||
} |
||||
}() |
||||
if err := collector.Update(metrics); err != nil { |
||||
t.Fatal(err) |
||||
} |
||||
} |
||||
@ -0,0 +1,2 @@ |
||||
*.swp |
||||
vendor |
||||
@ -0,0 +1,15 @@ |
||||
# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. |
||||
|
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "golang.org/x/sys" |
||||
packages = ["unix"] |
||||
revision = "90b0e4468f9980bf79a2290394adaf7f045c5d24" |
||||
|
||||
[solve-meta] |
||||
analyzer-name = "dep" |
||||
analyzer-version = 1 |
||||
inputs-digest = "c188619af29e454f9af8a4b24b5d13720a55a70615395ba2ded3a628fa51776a" |
||||
solver-name = "gps-cdcl" |
||||
solver-version = 1 |
||||
@ -0,0 +1,34 @@ |
||||
# Gopkg.toml example |
||||
# |
||||
# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md |
||||
# for detailed Gopkg.toml documentation. |
||||
# |
||||
# required = ["github.com/user/thing/cmd/thing"] |
||||
# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"] |
||||
# |
||||
# [[constraint]] |
||||
# name = "github.com/user/project" |
||||
# version = "1.0.0" |
||||
# |
||||
# [[constraint]] |
||||
# name = "github.com/user/project2" |
||||
# branch = "dev" |
||||
# source = "github.com/myfork/project2" |
||||
# |
||||
# [[override]] |
||||
# name = "github.com/x/y" |
||||
# version = "2.4.0" |
||||
# |
||||
# [prune] |
||||
# non-go = false |
||||
# go-tests = true |
||||
# unused-packages = true |
||||
|
||||
|
||||
[[constraint]] |
||||
branch = "master" |
||||
name = "golang.org/x/sys" |
||||
|
||||
[prune] |
||||
go-tests = true |
||||
unused-packages = true |
||||
@ -0,0 +1,22 @@ |
||||
The MIT License (MIT) |
||||
|
||||
Copyright (c) 2019 Daniel Hodges |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
of this software and associated documentation files (the "Software"), to deal |
||||
in the Software without restriction, including without limitation the rights |
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
copies of the Software, and to permit persons to whom the Software is |
||||
furnished to do so, subject to the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be included in all |
||||
copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
SOFTWARE. |
||||
|
||||
@ -0,0 +1,120 @@ |
||||
# Perf |
||||
[](https://godoc.org/github.com/hodgesds/perf-utils) |
||||
|
||||
This package is a go library for interacting with the `perf` subsystem in |
||||
Linux. It allows you to do things like see how many CPU instructions a function |
||||
takes, profile a process for various hardware events, and other interesting |
||||
things. The library is by no means finalized and should be considered pre-alpha |
||||
at best. |
||||
|
||||
# Use Cases |
||||
A majority of the utility methods in this package should only be used for |
||||
testing and/or debugging performance issues. Due to the nature of the go |
||||
runtime, profiling on the goroutine level is extremely tricky, with the |
||||
exception of a long running worker goroutine locked to an OS thread. Eventually |
||||
this library could be used to implement many of the features of `perf` but |
||||
accessible via Go directly. |
||||
|
||||
## Caveats |
||||
* Some utility functions will call |
||||
[`runtime.LockOSThread`](https://golang.org/pkg/runtime/#LockOSThread) for |
||||
you, they will also unlock the thread after profiling. ***Note*** using these |
||||
utility functions will incur significant overhead. |
||||
* Overflow handling is not implemented. |
||||
|
||||
# Setup |
||||
Most likely you will need to tweak some system settings unless you are running as root. From `man perf_event_open`: |
||||
|
||||
``` |
||||
perf_event related configuration files |
||||
Files in /proc/sys/kernel/ |
||||
|
||||
/proc/sys/kernel/perf_event_paranoid |
||||
The perf_event_paranoid file can be set to restrict access to the performance counters. |
||||
|
||||
2 allow only user-space measurements (default since Linux 4.6). |
||||
1 allow both kernel and user measurements (default before Linux 4.6). |
||||
0 allow access to CPU-specific data but not raw tracepoint samples. |
||||
-1 no restrictions. |
||||
|
||||
The existence of the perf_event_paranoid file is the official method for determining if a kernel supports perf_event_open(). |
||||
|
||||
/proc/sys/kernel/perf_event_max_sample_rate |
||||
This sets the maximum sample rate. Setting this too high can allow users to sample at a rate that impacts overall machine performance and potentially lock up the machine. The default value is 100000 (samples per |
||||
second). |
||||
|
||||
/proc/sys/kernel/perf_event_max_stack |
||||
This file sets the maximum depth of stack frame entries reported when generating a call trace. |
||||
|
||||
/proc/sys/kernel/perf_event_mlock_kb |
||||
Maximum number of pages an unprivileged user can mlock(2). The default is 516 (kB). |
||||
|
||||
``` |
||||
|
||||
# Example |
||||
Say you wanted to see how many CPU instructions a particular function took: |
||||
|
||||
``` |
||||
package main |
||||
|
||||
import ( |
||||
"fmt" |
||||
"log" |
||||
"github.com/hodgesds/perf-utils" |
||||
) |
||||
|
||||
func foo() error { |
||||
var total int |
||||
for i:=0;i<1000;i++ { |
||||
total++ |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
func main() { |
||||
profileValue, err := perf.CPUInstructions(foo) |
||||
if err != nil { |
||||
log.Fatal(err) |
||||
} |
||||
fmt.Printf("CPU instructions: %+v\n", profileValue) |
||||
} |
||||
``` |
||||
|
||||
# Benchmarks |
||||
To profile a single function call there is an overhead of ~0.4ms. |
||||
|
||||
``` |
||||
$ go test -bench=BenchmarkCPUCycles . |
||||
goos: linux |
||||
goarch: amd64 |
||||
pkg: github.com/hodgesds/perf-utils |
||||
BenchmarkCPUCycles-8 3000 397924 ns/op 32 B/op 1 allocs/op |
||||
PASS |
||||
ok github.com/hodgesds/perf-utils 1.255s |
||||
``` |
||||
|
||||
The `Profiler` interface has low overhead and is suitable for many use cases: |
||||
|
||||
``` |
||||
$ go test -bench=BenchmarkProfiler . |
||||
goos: linux |
||||
goarch: amd64 |
||||
pkg: github.com/hodgesds/perf-utils |
||||
BenchmarkProfiler-8 3000000 488 ns/op 32 B/op 1 allocs/op |
||||
PASS |
||||
ok github.com/hodgesds/perf-utils 1.981s |
||||
``` |
||||
|
||||
# BPF Support |
||||
BPF is supported by using the `BPFProfiler` which is available via the |
||||
`ProfileTracepoint` function. To use BPF you need to create the BPF program and |
||||
then call `AttachBPF` with the file descriptor of the BPF program. This is not |
||||
well tested so use at your own peril. |
||||
|
||||
# Misc |
||||
Originally I set out to use `go generate` to build Go structs that were |
||||
compatible with perf, I found a really good |
||||
[article](https://utcc.utoronto.ca/~cks/space/blog/programming/GoCGoCompatibleStructs) |
||||
on how to do so. Eventually, after digging through some of the `/x/sys/unix` |
||||
code I found pretty much what I needed. However, I think if you are |
||||
interested in interacting with the kernel it is a worthwhile read. |
||||
@ -0,0 +1,22 @@ |
||||
// +build linux
|
||||
|
||||
package perf |
||||
|
||||
import ( |
||||
"golang.org/x/sys/unix" |
||||
) |
||||
|
||||
// BPFProfiler is a Profiler that allows attaching a Berkeley
// Packet Filter (BPF) program to an existing kprobe tracepoint event.
// You need CAP_SYS_ADMIN privileges to use this interface. See:
// https://lwn.net/Articles/683504/
type BPFProfiler interface {
	Profiler
	// AttachBPF attaches the BPF program identified by the given file
	// descriptor to the underlying perf event.
	AttachBPF(int) error
}
||||
|
||||
// AttachBPF is used to attach a BPF program to a profiler by using the file
// descriptor of the BPF program.
func (p *profiler) AttachBPF(fd int) error {
	// PERF_EVENT_IOC_SET_BPF associates the BPF program with the perf
	// event owned by this profiler's file descriptor.
	return unix.IoctlSetInt(p.fd, unix.PERF_EVENT_IOC_SET_BPF, fd)
}
||||
@ -0,0 +1,336 @@ |
||||
// +build linux
|
||||
|
||||
package perf |
||||
|
||||
import ( |
||||
"go.uber.org/multierr" |
||||
"golang.org/x/sys/unix" |
||||
) |
||||
|
||||
// Each constant below is a perf_event_open(2) PERF_TYPE_HW_CACHE config
// value, encoded as: cache ID | (operation ID << 8) | (result ID << 16).
// See "PERF_TYPE_HW_CACHE" in man perf_event_open. Note that the kernel's
// RESULT_ACCESS counts accesses; this package exposes them as "hits".
const (
	// L1DataReadHit is the config for L1 data cache read accesses.
	L1DataReadHit = (unix.PERF_COUNT_HW_CACHE_L1D) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
	// L1DataReadMiss is the config for L1 data cache read misses.
	L1DataReadMiss = (unix.PERF_COUNT_HW_CACHE_L1D) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
	// L1DataWriteHit is the config for L1 data cache write accesses.
	L1DataWriteHit = (unix.PERF_COUNT_HW_CACHE_L1D) | (unix.PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
	// L1InstrReadMiss is the config for L1 instruction cache read misses.
	L1InstrReadMiss = (unix.PERF_COUNT_HW_CACHE_L1I) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_MISS << 16)

	// LLReadHit is the config for last-level cache read accesses.
	LLReadHit = (unix.PERF_COUNT_HW_CACHE_LL) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
	// LLReadMiss is the config for last-level cache read misses.
	LLReadMiss = (unix.PERF_COUNT_HW_CACHE_LL) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
	// LLWriteHit is the config for last-level cache write accesses.
	LLWriteHit = (unix.PERF_COUNT_HW_CACHE_LL) | (unix.PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
	// LLWriteMiss is the config for last-level cache write misses.
	LLWriteMiss = (unix.PERF_COUNT_HW_CACHE_LL) | (unix.PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_MISS << 16)

	// DataTLBReadHit is the config for data TLB read accesses.
	DataTLBReadHit = (unix.PERF_COUNT_HW_CACHE_DTLB) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
	// DataTLBReadMiss is the config for data TLB read misses.
	DataTLBReadMiss = (unix.PERF_COUNT_HW_CACHE_DTLB) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
	// DataTLBWriteHit is the config for data TLB write accesses.
	DataTLBWriteHit = (unix.PERF_COUNT_HW_CACHE_DTLB) | (unix.PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
	// DataTLBWriteMiss is the config for data TLB write misses.
	DataTLBWriteMiss = (unix.PERF_COUNT_HW_CACHE_DTLB) | (unix.PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_MISS << 16)

	// InstrTLBReadHit is the config for instruction TLB read accesses.
	InstrTLBReadHit = (unix.PERF_COUNT_HW_CACHE_ITLB) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
	// InstrTLBReadMiss is the config for instruction TLB read misses.
	InstrTLBReadMiss = (unix.PERF_COUNT_HW_CACHE_ITLB) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_MISS << 16)

	// BPUReadHit is the config for branch prediction unit read accesses.
	BPUReadHit = (unix.PERF_COUNT_HW_CACHE_BPU) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
	// BPUReadMiss is the config for branch prediction unit read misses.
	BPUReadMiss = (unix.PERF_COUNT_HW_CACHE_BPU) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_MISS << 16)

	// NodeCacheReadHit is the config for NUMA node cache read accesses.
	NodeCacheReadHit = (unix.PERF_COUNT_HW_CACHE_NODE) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
	// NodeCacheReadMiss is the config for NUMA node cache read misses.
	NodeCacheReadMiss = (unix.PERF_COUNT_HW_CACHE_NODE) | (unix.PERF_COUNT_HW_CACHE_OP_READ << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
	// NodeCacheWriteHit is the config for NUMA node cache write accesses.
	NodeCacheWriteHit = (unix.PERF_COUNT_HW_CACHE_NODE) | (unix.PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
	// NodeCacheWriteMiss is the config for NUMA node cache write misses.
	NodeCacheWriteMiss = (unix.PERF_COUNT_HW_CACHE_NODE) | (unix.PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (unix.PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
)
||||
|
||||
// cacheProfiler profiles hardware cache events by multiplexing a set of
// per-event Profilers.
type cacheProfiler struct {
	// profilers maps a perf cache event config value (cache ID,
	// operation, and result encoded into one int) to the Profiler
	// for that event.
	profilers map[int]Profiler
}
||||
|
||||
// NewCacheProfiler returns a new cache profiler.
|
||||
func NewCacheProfiler(pid, cpu int, opts ...int) CacheProfiler { |
||||
profilers := map[int]Profiler{} |
||||
|
||||
// L1 data
|
||||
op := unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result := unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS |
||||
l1dataReadHit, err := NewL1DataProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[L1DataReadHit] = l1dataReadHit |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_MISS |
||||
l1dataReadMiss, err := NewL1DataProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[L1DataReadMiss] = l1dataReadMiss |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_WRITE |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS |
||||
l1dataWriteHit, err := NewL1DataProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[L1DataWriteHit] = l1dataWriteHit |
||||
} |
||||
|
||||
// L1 instruction
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_MISS |
||||
l1InstrReadMiss, err := NewL1InstrProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[L1InstrReadMiss] = l1InstrReadMiss |
||||
} |
||||
|
||||
// Last Level
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS |
||||
llReadHit, err := NewLLCacheProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[LLReadHit] = llReadHit |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_MISS |
||||
llReadMiss, err := NewLLCacheProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[LLReadMiss] = llReadMiss |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_WRITE |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS |
||||
llWriteHit, err := NewLLCacheProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[LLWriteHit] = llWriteHit |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_WRITE |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_MISS |
||||
llWriteMiss, err := NewLLCacheProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[LLWriteMiss] = llWriteMiss |
||||
} |
||||
|
||||
// dTLB
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS |
||||
dTLBReadHit, err := NewDataTLBProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[DataTLBReadHit] = dTLBReadHit |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_MISS |
||||
dTLBReadMiss, err := NewDataTLBProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[DataTLBReadMiss] = dTLBReadMiss |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_WRITE |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS |
||||
dTLBWriteHit, err := NewDataTLBProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[DataTLBWriteHit] = dTLBWriteHit |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_WRITE |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_MISS |
||||
dTLBWriteMiss, err := NewDataTLBProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[DataTLBWriteMiss] = dTLBWriteMiss |
||||
} |
||||
|
||||
// iTLB
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS |
||||
iTLBReadHit, err := NewInstrTLBProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[InstrTLBReadHit] = iTLBReadHit |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_MISS |
||||
iTLBReadMiss, err := NewInstrTLBProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[InstrTLBReadMiss] = iTLBReadMiss |
||||
} |
||||
|
||||
// BPU
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS |
||||
bpuReadHit, err := NewBPUProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[BPUReadHit] = bpuReadHit |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_MISS |
||||
bpuReadMiss, err := NewBPUProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[BPUReadMiss] = bpuReadMiss |
||||
} |
||||
|
||||
// Node
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS |
||||
nodeReadHit, err := NewNodeCacheProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[NodeCacheReadHit] = nodeReadHit |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_READ |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_MISS |
||||
nodeReadMiss, err := NewNodeCacheProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[NodeCacheReadMiss] = nodeReadMiss |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_WRITE |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_ACCESS |
||||
nodeWriteHit, err := NewNodeCacheProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[NodeCacheWriteHit] = nodeWriteHit |
||||
} |
||||
|
||||
op = unix.PERF_COUNT_HW_CACHE_OP_WRITE |
||||
result = unix.PERF_COUNT_HW_CACHE_RESULT_MISS |
||||
nodeWriteMiss, err := NewNodeCacheProfiler(pid, cpu, op, result, opts...) |
||||
if err == nil { |
||||
profilers[NodeCacheWriteMiss] = nodeWriteMiss |
||||
} |
||||
|
||||
return &cacheProfiler{ |
||||
profilers: profilers, |
||||
} |
||||
} |
||||
|
||||
// Start is used to start the CacheProfiler, it will return an error if no
|
||||
// profilers are configured.
|
||||
func (p *cacheProfiler) Start() error { |
||||
if len(p.profilers) == 0 { |
||||
return ErrNoProfiler |
||||
} |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Start()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Reset is used to reset the CacheProfiler.
|
||||
func (p *cacheProfiler) Reset() error { |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Reset()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Stop is used to reset the CacheProfiler.
|
||||
func (p *cacheProfiler) Stop() error { |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Stop()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Close is used to reset the CacheProfiler.
|
||||
func (p *cacheProfiler) Close() error { |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Close()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Profile is used to read the CacheProfiler CacheProfile it returns an
|
||||
// error only if all profiles fail.
|
||||
func (p *cacheProfiler) Profile() (*CacheProfile, error) { |
||||
var err error |
||||
cacheProfile := &CacheProfile{} |
||||
for profilerType, profiler := range p.profilers { |
||||
profileVal, err2 := profiler.Profile() |
||||
err = multierr.Append(err, err2) |
||||
if err2 == nil { |
||||
if cacheProfile.TimeEnabled == nil { |
||||
cacheProfile.TimeEnabled = &profileVal.TimeEnabled |
||||
} |
||||
if cacheProfile.TimeRunning == nil { |
||||
cacheProfile.TimeRunning = &profileVal.TimeRunning |
||||
} |
||||
switch { |
||||
// L1 data
|
||||
case (profilerType ^ L1DataReadHit) == 0: |
||||
cacheProfile.L1DataReadHit = &profileVal.Value |
||||
case (profilerType ^ L1DataReadMiss) == 0: |
||||
cacheProfile.L1DataReadMiss = &profileVal.Value |
||||
case (profilerType ^ L1DataWriteHit) == 0: |
||||
cacheProfile.L1DataWriteHit = &profileVal.Value |
||||
|
||||
// L1 instruction
|
||||
case (profilerType ^ L1InstrReadMiss) == 0: |
||||
cacheProfile.L1InstrReadMiss = &profileVal.Value |
||||
|
||||
// Last Level
|
||||
case (profilerType ^ LLReadHit) == 0: |
||||
cacheProfile.LastLevelReadHit = &profileVal.Value |
||||
case (profilerType ^ LLReadMiss) == 0: |
||||
cacheProfile.LastLevelReadMiss = &profileVal.Value |
||||
case (profilerType ^ LLWriteHit) == 0: |
||||
cacheProfile.LastLevelWriteHit = &profileVal.Value |
||||
case (profilerType ^ LLWriteMiss) == 0: |
||||
cacheProfile.LastLevelWriteMiss = &profileVal.Value |
||||
|
||||
// dTLB
|
||||
case (profilerType ^ DataTLBReadHit) == 0: |
||||
cacheProfile.DataTLBReadHit = &profileVal.Value |
||||
case (profilerType ^ DataTLBReadMiss) == 0: |
||||
cacheProfile.DataTLBReadMiss = &profileVal.Value |
||||
case (profilerType ^ DataTLBWriteHit) == 0: |
||||
cacheProfile.DataTLBWriteHit = &profileVal.Value |
||||
case (profilerType ^ DataTLBWriteMiss) == 0: |
||||
cacheProfile.DataTLBWriteMiss = &profileVal.Value |
||||
|
||||
// iTLB
|
||||
case (profilerType ^ InstrTLBReadHit) == 0: |
||||
cacheProfile.InstrTLBReadHit = &profileVal.Value |
||||
case (profilerType ^ InstrTLBReadMiss) == 0: |
||||
cacheProfile.InstrTLBReadMiss = &profileVal.Value |
||||
|
||||
// BPU
|
||||
case (profilerType ^ BPUReadHit) == 0: |
||||
cacheProfile.BPUReadHit = &profileVal.Value |
||||
case (profilerType ^ BPUReadMiss) == 0: |
||||
cacheProfile.BPUReadMiss = &profileVal.Value |
||||
|
||||
// node
|
||||
case (profilerType ^ NodeCacheReadHit) == 0: |
||||
cacheProfile.NodeReadHit = &profileVal.Value |
||||
case (profilerType ^ NodeCacheReadMiss) == 0: |
||||
cacheProfile.NodeReadMiss = &profileVal.Value |
||||
case (profilerType ^ NodeCacheWriteHit) == 0: |
||||
cacheProfile.NodeWriteHit = &profileVal.Value |
||||
case (profilerType ^ NodeCacheWriteMiss) == 0: |
||||
cacheProfile.NodeWriteMiss = &profileVal.Value |
||||
} |
||||
} |
||||
} |
||||
if len(multierr.Errors(err)) == len(p.profilers) { |
||||
return nil, err |
||||
} |
||||
|
||||
return cacheProfile, nil |
||||
} |
||||
@ -0,0 +1,98 @@ |
||||
// +build linux
|
||||
|
||||
package perf |
||||
|
||||
import ( |
||||
"fmt" |
||||
"strconv" |
||||
"strings" |
||||
"unsafe" |
||||
|
||||
"golang.org/x/sys/unix" |
||||
) |
||||
|
||||
const (
	// PERF_TYPE_TRACEPOINT is the perf_event_attr type for a kernel
	// tracepoint event; it is not defined in x/sys/unix.
	PERF_TYPE_TRACEPOINT = 2
)
||||
|
||||
// AvailableEvents returns the list of available events.
|
||||
func AvailableEvents() (map[string][]string, error) { |
||||
events := map[string][]string{} |
||||
rawEvents, err := fileToStrings(TracingDir + "/available_events") |
||||
// Events are colon delimited by type so parse the type and add sub
|
||||
// events appropriately.
|
||||
if err != nil { |
||||
return events, err |
||||
} |
||||
for _, rawEvent := range rawEvents { |
||||
splits := strings.Split(rawEvent, ":") |
||||
if len(splits) <= 1 { |
||||
continue |
||||
} |
||||
eventTypeEvents, found := events[splits[0]] |
||||
if found { |
||||
events[splits[0]] = append(eventTypeEvents, splits[1]) |
||||
continue |
||||
} |
||||
events[splits[0]] = []string{splits[1]} |
||||
} |
||||
return events, err |
||||
} |
||||
|
||||
// AvailableTracers returns the list of available tracers, one per line of
// <TracingDir>/available_tracers.
func AvailableTracers() ([]string, error) {
	return fileToStrings(TracingDir + "/available_tracers")
}
||||
|
||||
// CurrentTracer returns the current tracer.
|
||||
func CurrentTracer() (string, error) { |
||||
res, err := fileToStrings(TracingDir + "/current_tracer") |
||||
return res[0], err |
||||
} |
||||
|
||||
// getTracepointConfig is used to get the configuration for a trace event.
|
||||
func getTracepointConfig(kind, event string) (uint64, error) { |
||||
res, err := fileToStrings(TracingDir + fmt.Sprintf("/events/%s/%s/id", kind, event)) |
||||
if err != nil { |
||||
return 0, err |
||||
} |
||||
return strconv.ParseUint(res[0], 10, 64) |
||||
} |
||||
|
||||
// ProfileTracepoint is used to profile a kernel tracepoint event. Events can
|
||||
// be listed with `perf list` for Tracepoint Events or in the
|
||||
// /sys/kernel/debug/tracing/events directory with the kind being the directory
|
||||
// and the event being the subdirectory.
|
||||
func ProfileTracepoint(kind, event string, pid, cpu int, opts ...int) (BPFProfiler, error) { |
||||
config, err := getTracepointConfig(kind, event) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: PERF_TYPE_TRACEPOINT, |
||||
Config: config, |
||||
Size: uint32(unsafe.Sizeof(unix.PerfEventAttr{})), |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
Sample_type: PERF_SAMPLE_IDENTIFIER, |
||||
} |
||||
var eventOps int |
||||
if len(opts) > 0 { |
||||
eventOps = opts[0] |
||||
} |
||||
fd, err := unix.PerfEventOpen( |
||||
eventAttr, |
||||
pid, |
||||
cpu, |
||||
-1, |
||||
eventOps, |
||||
) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
return &profiler{ |
||||
fd: fd, |
||||
}, nil |
||||
} |
||||
@ -0,0 +1,102 @@ |
||||
// +build linux
|
||||
|
||||
package perf |
||||
|
||||
import ( |
||||
"bufio" |
||||
"fmt" |
||||
"os" |
||||
"strings" |
||||
) |
||||
|
||||
const (
	// DebugFS is the filesystem type for debugfs.
	DebugFS = "debugfs"

	// TraceFS is the filesystem type for tracefs.
	TraceFS = "tracefs"

	// ProcMounts is the procfs file that lists mounted file systems.
	ProcMounts = "/proc/mounts"

	// PerfMaxStack is the procfs file holding the max perf event stack depth.
	PerfMaxStack = "/proc/sys/kernel/perf_event_max_stack"

	// PerfMaxContexts is the procfs file holding the max perf contexts per stack.
	PerfMaxContexts = "/proc/sys/kernel/perf_event_max_contexts_per_stack"

	// SyscallsDir is a constant of the default tracing event syscalls directory.
	SyscallsDir = "/sys/kernel/debug/tracing/events/syscalls/"

	// TracingDir is a constant of the default tracing directory.
	TracingDir = "/sys/kernel/debug/tracing"
)
||||
|
||||
var (
	// ErrNoMount is returned when no mount point of the requested
	// filesystem type exists.
	ErrNoMount = fmt.Errorf("no such mount")
)
||||
|
||||
// TraceFSMount returns the first found mount point of a tracefs file system.
|
||||
func TraceFSMount() (string, error) { |
||||
mounts, err := GetFSMount(TraceFS) |
||||
if err != nil { |
||||
return "", err |
||||
} |
||||
if len(mounts) == 0 { |
||||
return "", ErrNoMount |
||||
} |
||||
return mounts[0], nil |
||||
} |
||||
|
||||
// DebugFSMount returns the first found mount point of a debugfs file system.
|
||||
func DebugFSMount() (string, error) { |
||||
mounts, err := GetFSMount(DebugFS) |
||||
if err != nil { |
||||
return "", err |
||||
} |
||||
if len(mounts) == 0 { |
||||
return "", ErrNoMount |
||||
} |
||||
return mounts[0], nil |
||||
} |
||||
|
||||
// GetFSMount is a helper function to get a mount file system type.
|
||||
func GetFSMount(mountType string) ([]string, error) { |
||||
mounts := []string{} |
||||
file, err := os.Open(ProcMounts) |
||||
if err != nil { |
||||
return mounts, err |
||||
} |
||||
scanner := bufio.NewScanner(file) |
||||
for scanner.Scan() { |
||||
mountInfo := strings.Split(scanner.Text(), " ") |
||||
if len(mountInfo) > 3 && mountInfo[2] == mountType { |
||||
mounts = append(mounts, mountInfo[1]) |
||||
} |
||||
} |
||||
if err := scanner.Err(); err != nil { |
||||
return mounts, err |
||||
} |
||||
|
||||
return mounts, file.Close() |
||||
} |
||||
|
||||
// fileToStrings is a helper method that reads a line line by line and returns
|
||||
// a slice of strings.
|
||||
func fileToStrings(path string) ([]string, error) { |
||||
res := []string{} |
||||
f, err := os.Open(path) |
||||
if err != nil { |
||||
return res, err |
||||
} |
||||
scanner := bufio.NewScanner(f) |
||||
for scanner.Scan() { |
||||
res = append(res, scanner.Text()) |
||||
} |
||||
if err := scanner.Err(); err != nil { |
||||
return res, err |
||||
} |
||||
|
||||
return res, nil |
||||
} |
||||
@ -0,0 +1,170 @@ |
||||
// +build linux
|
||||
|
||||
package perf |
||||
|
||||
import ( |
||||
"encoding/binary" |
||||
"fmt" |
||||
"syscall" |
||||
|
||||
"go.uber.org/multierr" |
||||
"golang.org/x/sys/unix" |
||||
) |
||||
|
||||
// ErrNoLeader is returned when a leader of a GroupProfiler is not defined.
var ErrNoLeader = fmt.Errorf("No leader defined")
||||
|
||||
// GroupProfileValue is returned from a GroupProfiler.
type GroupProfileValue struct {
	// Events is the number of events in the group read.
	Events uint64
	// TimeEnabled is the total time the group was enabled.
	TimeEnabled uint64
	// TimeRunning is the total time the group was actually counting.
	TimeRunning uint64
	// Values holds one counter value per event, in fd order.
	Values []uint64
}
||||
|
||||
// GroupProfiler is used to setup a group profiler.
type GroupProfiler interface {
	// Start enables counting for the group.
	Start() error
	// Reset zeroes the group counters.
	Reset() error
	// Stop disables counting for the group.
	Stop() error
	// Close releases the group file descriptors.
	Close() error
	// Profile reads the current group counter values.
	Profile() (*GroupProfileValue, error)
}
||||
|
||||
// groupProfiler implements the GroupProfiler interface.
type groupProfiler struct {
	// fds holds the perf event file descriptors; the group leader is
	// always element 0.
	fds []int
}
||||
|
||||
// NewGroupProfiler returns a GroupProfiler.
|
||||
func NewGroupProfiler(pid, cpu, opts int, eventAttrs ...unix.PerfEventAttr) (GroupProfiler, error) { |
||||
fds := make([]int, len(eventAttrs)) |
||||
|
||||
for i, eventAttr := range eventAttrs { |
||||
// common configs
|
||||
eventAttr.Size = EventAttrSize |
||||
eventAttr.Sample_type = PERF_SAMPLE_IDENTIFIER |
||||
|
||||
// Leader fd must be opened first
|
||||
if i == 0 { |
||||
// leader specific configs
|
||||
eventAttr.Bits = unix.PerfBitDisabled | unix.PerfBitExcludeHv |
||||
eventAttr.Read_format = unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED | unix.PERF_FORMAT_GROUP |
||||
|
||||
fd, err := unix.PerfEventOpen( |
||||
&eventAttr, |
||||
pid, |
||||
cpu, |
||||
-1, |
||||
opts, |
||||
) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
fds[i] = fd |
||||
continue |
||||
} |
||||
|
||||
// non leader configs
|
||||
eventAttr.Read_format = unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED | unix.PERF_FORMAT_GROUP |
||||
eventAttr.Bits = unix.PerfBitExcludeHv |
||||
|
||||
fd, err := unix.PerfEventOpen( |
||||
&eventAttr, |
||||
pid, |
||||
cpu, |
||||
fds[0], |
||||
opts, |
||||
) |
||||
if err != nil { |
||||
// cleanup any old Fds
|
||||
for ii, fd2 := range fds { |
||||
if ii == i { |
||||
break |
||||
} |
||||
err = multierr.Append(err, unix.Close(fd2)) |
||||
} |
||||
return nil, err |
||||
} |
||||
fds[i] = fd |
||||
} |
||||
|
||||
return &groupProfiler{ |
||||
fds: fds, |
||||
}, nil |
||||
} |
||||
|
||||
// Start is used to start the GroupProfiler.
|
||||
func (p *groupProfiler) Start() error { |
||||
if len(p.fds) == 0 { |
||||
return ErrNoLeader |
||||
} |
||||
return unix.IoctlSetInt(p.fds[0], unix.PERF_EVENT_IOC_ENABLE, 0) |
||||
} |
||||
|
||||
// Reset is used to reset the GroupProfiler.
|
||||
func (p *groupProfiler) Reset() error { |
||||
if len(p.fds) == 0 { |
||||
return ErrNoLeader |
||||
} |
||||
return unix.IoctlSetInt(p.fds[0], unix.PERF_EVENT_IOC_RESET, 0) |
||||
} |
||||
|
||||
// Stop is used to stop the GroupProfiler.
|
||||
func (p *groupProfiler) Stop() error { |
||||
if len(p.fds) == 0 { |
||||
return ErrNoLeader |
||||
} |
||||
return unix.IoctlSetInt(p.fds[0], unix.PERF_EVENT_IOC_DISABLE, 0) |
||||
} |
||||
|
||||
// Close is used to close the GroupProfiler.
|
||||
func (p *groupProfiler) Close() error { |
||||
var err error |
||||
for _, fd := range p.fds { |
||||
err = multierr.Append(err, unix.Close(fd)) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Profile is used to return the GroupProfileValue of the GroupProfiler.
|
||||
func (p *groupProfiler) Profile() (*GroupProfileValue, error) { |
||||
nEvents := len(p.fds) |
||||
if nEvents == 0 { |
||||
return nil, ErrNoLeader |
||||
} |
||||
|
||||
// read format of the raw event looks like this:
|
||||
/* |
||||
struct read_format { |
||||
u64 nr; // The number of events /
|
||||
u64 time_enabled; // if PERF_FORMAT_TOTAL_TIME_ENABLED
|
||||
u64 time_running; // if PERF_FORMAT_TOTAL_TIME_RUNNING
|
||||
struct { |
||||
u64 value; // The value of the event
|
||||
u64 id; // if PERF_FORMAT_ID
|
||||
} values[nr]; |
||||
}; |
||||
*/ |
||||
|
||||
buf := make([]byte, 24+8*nEvents) |
||||
_, err := syscall.Read(p.fds[0], buf) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
val := &GroupProfileValue{ |
||||
Events: binary.LittleEndian.Uint64(buf[0:8]), |
||||
TimeEnabled: binary.LittleEndian.Uint64(buf[8:16]), |
||||
TimeRunning: binary.LittleEndian.Uint64(buf[16:24]), |
||||
Values: make([]uint64, len(p.fds)), |
||||
} |
||||
|
||||
offset := 24 |
||||
for i := range p.fds { |
||||
val.Values[i] = binary.LittleEndian.Uint64(buf[offset : offset+8]) |
||||
offset += 8 |
||||
} |
||||
|
||||
return val, nil |
||||
} |
||||
@ -0,0 +1,157 @@ |
||||
// +build linux
|
||||
|
||||
package perf |
||||
|
||||
import ( |
||||
"go.uber.org/multierr" |
||||
"golang.org/x/sys/unix" |
||||
) |
||||
|
||||
// hardwareProfiler implements the HardwareProfiler interface by
// multiplexing a set of per-counter Profilers.
type hardwareProfiler struct {
	// profilers maps a perf hardware counter type
	// (unix.PERF_COUNT_HW_*) to the Profiler for that counter.
	profilers map[int]Profiler
}
||||
|
||||
// NewHardwareProfiler returns a new hardware profiler.
|
||||
func NewHardwareProfiler(pid, cpu int, opts ...int) HardwareProfiler { |
||||
profilers := map[int]Profiler{} |
||||
|
||||
cpuCycleProfiler, err := NewCPUCycleProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_HW_CPU_CYCLES] = cpuCycleProfiler |
||||
} |
||||
|
||||
instrProfiler, err := NewInstrProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_HW_INSTRUCTIONS] = instrProfiler |
||||
} |
||||
|
||||
cacheRefProfiler, err := NewCacheRefProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_HW_CACHE_REFERENCES] = cacheRefProfiler |
||||
} |
||||
|
||||
cacheMissesProfiler, err := NewCacheMissesProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_HW_CACHE_MISSES] = cacheMissesProfiler |
||||
} |
||||
|
||||
branchInstrProfiler, err := NewBranchInstrProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = branchInstrProfiler |
||||
} |
||||
|
||||
branchMissesProfiler, err := NewBranchMissesProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_HW_BRANCH_MISSES] = branchMissesProfiler |
||||
} |
||||
|
||||
busCyclesProfiler, err := NewBusCyclesProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_HW_BUS_CYCLES] = busCyclesProfiler |
||||
} |
||||
|
||||
stalledCyclesFrontProfiler, err := NewStalledCyclesFrontProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = stalledCyclesFrontProfiler |
||||
} |
||||
|
||||
stalledCyclesBackProfiler, err := NewStalledCyclesBackProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = stalledCyclesBackProfiler |
||||
} |
||||
|
||||
refCPUCyclesProfiler, err := NewRefCPUCyclesProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_HW_REF_CPU_CYCLES] = refCPUCyclesProfiler |
||||
} |
||||
|
||||
return &hardwareProfiler{ |
||||
profilers: profilers, |
||||
} |
||||
} |
||||
|
||||
// Start is used to start the HardwareProfiler.
|
||||
func (p *hardwareProfiler) Start() error { |
||||
if len(p.profilers) == 0 { |
||||
return ErrNoProfiler |
||||
} |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Start()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Reset is used to reset the HardwareProfiler.
|
||||
func (p *hardwareProfiler) Reset() error { |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Reset()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Stop is used to reset the HardwareProfiler.
|
||||
func (p *hardwareProfiler) Stop() error { |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Stop()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Close is used to reset the HardwareProfiler.
|
||||
func (p *hardwareProfiler) Close() error { |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Close()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Profile is used to read the HardwareProfiler HardwareProfile it returns an
|
||||
// error only if all profiles fail.
|
||||
func (p *hardwareProfiler) Profile() (*HardwareProfile, error) { |
||||
var err error |
||||
hwProfile := &HardwareProfile{} |
||||
for profilerType, profiler := range p.profilers { |
||||
profileVal, err2 := profiler.Profile() |
||||
err = multierr.Append(err, err2) |
||||
if err2 == nil { |
||||
if hwProfile.TimeEnabled == nil { |
||||
hwProfile.TimeEnabled = &profileVal.TimeEnabled |
||||
} |
||||
if hwProfile.TimeRunning == nil { |
||||
hwProfile.TimeRunning = &profileVal.TimeRunning |
||||
} |
||||
switch profilerType { |
||||
case unix.PERF_COUNT_HW_CPU_CYCLES: |
||||
hwProfile.CPUCycles = &profileVal.Value |
||||
case unix.PERF_COUNT_HW_INSTRUCTIONS: |
||||
hwProfile.Instructions = &profileVal.Value |
||||
case unix.PERF_COUNT_HW_CACHE_REFERENCES: |
||||
hwProfile.CacheRefs = &profileVal.Value |
||||
case unix.PERF_COUNT_HW_CACHE_MISSES: |
||||
hwProfile.CacheMisses = &profileVal.Value |
||||
case unix.PERF_COUNT_HW_BRANCH_INSTRUCTIONS: |
||||
hwProfile.BranchInstr = &profileVal.Value |
||||
case unix.PERF_COUNT_HW_BRANCH_MISSES: |
||||
hwProfile.BranchMisses = &profileVal.Value |
||||
case unix.PERF_COUNT_HW_BUS_CYCLES: |
||||
hwProfile.BusCycles = &profileVal.Value |
||||
case unix.PERF_COUNT_HW_STALLED_CYCLES_FRONTEND: |
||||
hwProfile.StalledCyclesFrontend = &profileVal.Value |
||||
case unix.PERF_COUNT_HW_STALLED_CYCLES_BACKEND: |
||||
hwProfile.StalledCyclesBackend = &profileVal.Value |
||||
case unix.PERF_COUNT_HW_REF_CPU_CYCLES: |
||||
hwProfile.RefCPUCycles = &profileVal.Value |
||||
} |
||||
} |
||||
} |
||||
if len(multierr.Errors(err)) == len(p.profilers) { |
||||
return nil, err |
||||
} |
||||
|
||||
return hwProfile, nil |
||||
} |
||||
@ -0,0 +1,507 @@ |
||||
// +build linux
|
||||
|
||||
package perf |
||||
|
||||
import ( |
||||
"encoding/binary" |
||||
"fmt" |
||||
"syscall" |
||||
"unsafe" |
||||
|
||||
"golang.org/x/sys/unix" |
||||
) |
||||
|
||||
const (
	// PERF_SAMPLE_IDENTIFIER is not defined in x/sys/unix.
	PERF_SAMPLE_IDENTIFIER = 1 << 16

	// PERF_IOC_FLAG_GROUP is not defined in x/sys/unix.
	PERF_IOC_FLAG_GROUP = 1 << 0
)
||||
|
||||
var (
	// ErrNoProfiler is returned when no profiler is available for profiling.
	ErrNoProfiler = fmt.Errorf("No profiler available")
)
||||
|
||||
// Profiler is a profiler for a single perf counter.
type Profiler interface {
	// Start enables counting.
	Start() error
	// Reset zeroes the counter.
	Reset() error
	// Stop disables counting.
	Stop() error
	// Close releases the underlying file descriptor.
	Close() error
	// Profile reads the current counter value.
	Profile() (*ProfileValue, error)
}
||||
|
||||
// HardwareProfiler is a hardware profiler.
type HardwareProfiler interface {
	// Start enables counting on all hardware counters.
	Start() error
	// Reset zeroes all hardware counters.
	Reset() error
	// Stop disables counting on all hardware counters.
	Stop() error
	// Close releases all underlying file descriptors.
	Close() error
	// Profile reads the current hardware counter values.
	Profile() (*HardwareProfile, error)
}
||||
|
||||
// HardwareProfile is returned by a HardwareProfiler. Depending on kernel
// configuration some fields may return nil (the corresponding counter
// could not be opened or read).
type HardwareProfile struct {
	CPUCycles             *uint64 `json:"cpu_cycles,omitempty"`
	Instructions          *uint64 `json:"instructions,omitempty"`
	CacheRefs             *uint64 `json:"cache_refs,omitempty"`
	CacheMisses           *uint64 `json:"cache_misses,omitempty"`
	BranchInstr           *uint64 `json:"branch_instr,omitempty"`
	BranchMisses          *uint64 `json:"branch_misses,omitempty"`
	BusCycles             *uint64 `json:"bus_cycles,omitempty"`
	StalledCyclesFrontend *uint64 `json:"stalled_cycles_frontend,omitempty"`
	StalledCyclesBackend  *uint64 `json:"stalled_cycles_backend,omitempty"`
	RefCPUCycles          *uint64 `json:"ref_cpu_cycles,omitempty"`
	TimeEnabled           *uint64 `json:"time_enabled,omitempty"`
	TimeRunning           *uint64 `json:"time_running,omitempty"`
}
||||
|
||||
// SoftwareProfiler is a software profiler.
type SoftwareProfiler interface {
	// Start enables counting on all software counters.
	Start() error
	// Reset zeroes all software counters.
	Reset() error
	// Stop disables counting on all software counters.
	Stop() error
	// Close releases all underlying file descriptors.
	Close() error
	// Profile reads the current software counter values.
	Profile() (*SoftwareProfile, error)
}
||||
|
||||
// SoftwareProfile is returned by a SoftwareProfiler. Fields are nil when
// the corresponding counter could not be opened or read.
type SoftwareProfile struct {
	CPUClock        *uint64 `json:"cpu_clock,omitempty"`
	TaskClock       *uint64 `json:"task_clock,omitempty"`
	PageFaults      *uint64 `json:"page_faults,omitempty"`
	ContextSwitches *uint64 `json:"context_switches,omitempty"`
	CPUMigrations   *uint64 `json:"cpu_migrations,omitempty"`
	MinorPageFaults *uint64 `json:"minor_page_faults,omitempty"`
	MajorPageFaults *uint64 `json:"major_page_faults,omitempty"`
	AlignmentFaults *uint64 `json:"alignment_faults,omitempty"`
	EmulationFaults *uint64 `json:"emulation_faults,omitempty"`
	TimeEnabled     *uint64 `json:"time_enabled,omitempty"`
	TimeRunning     *uint64 `json:"time_running,omitempty"`
}
||||
|
||||
// CacheProfiler is a cache profiler.
type CacheProfiler interface {
	// Start enables counting on all cache counters.
	Start() error
	// Reset zeroes all cache counters.
	Reset() error
	// Stop disables counting on all cache counters.
	Stop() error
	// Close releases all underlying file descriptors.
	Close() error
	// Profile reads the current cache counter values.
	Profile() (*CacheProfile, error)
}
||||
|
||||
// CacheProfile is returned by a CacheProfiler. Fields are nil when the
// corresponding counter could not be opened or read.
type CacheProfile struct {
	L1DataReadHit      *uint64 `json:"l1_data_read_hit,omitempty"`
	L1DataReadMiss     *uint64 `json:"l1_data_read_miss,omitempty"`
	L1DataWriteHit     *uint64 `json:"l1_data_write_hit,omitempty"`
	L1InstrReadMiss    *uint64 `json:"l1_instr_read_miss,omitempty"`
	LastLevelReadHit   *uint64 `json:"last_level_read_hit,omitempty"`
	LastLevelReadMiss  *uint64 `json:"last_level_read_miss,omitempty"`
	LastLevelWriteHit  *uint64 `json:"last_level_write_hit,omitempty"`
	LastLevelWriteMiss *uint64 `json:"last_level_write_miss,omitempty"`
	DataTLBReadHit     *uint64 `json:"data_tlb_read_hit,omitempty"`
	DataTLBReadMiss    *uint64 `json:"data_tlb_read_miss,omitempty"`
	DataTLBWriteHit    *uint64 `json:"data_tlb_write_hit,omitempty"`
	DataTLBWriteMiss   *uint64 `json:"data_tlb_write_miss,omitempty"`
	InstrTLBReadHit    *uint64 `json:"instr_tlb_read_hit,omitempty"`
	InstrTLBReadMiss   *uint64 `json:"instr_tlb_read_miss,omitempty"`
	BPUReadHit         *uint64 `json:"bpu_read_hit,omitempty"`
	BPUReadMiss        *uint64 `json:"bpu_read_miss,omitempty"`
	NodeReadHit        *uint64 `json:"node_read_hit,omitempty"`
	NodeReadMiss       *uint64 `json:"node_read_miss,omitempty"`
	NodeWriteHit       *uint64 `json:"node_write_hit,omitempty"`
	NodeWriteMiss      *uint64 `json:"node_write_miss,omitempty"`
	TimeEnabled        *uint64 `json:"time_enabled,omitempty"`
	TimeRunning        *uint64 `json:"time_running,omitempty"`
}
||||
|
||||
// ProfileValue is a value returned by a profiler.
type ProfileValue struct {
	// Value is the raw counter value.
	Value uint64
	// TimeEnabled is the total time the counter was enabled.
	TimeEnabled uint64
	// TimeRunning is the total time the counter was actually counting.
	TimeRunning uint64
}
||||
|
||||
// profiler is used to profile a process via a single perf event fd.
type profiler struct {
	// fd is the perf event file descriptor.
	fd int
}
||||
|
||||
// NewProfiler creates a new hardware profiler. It does not support grouping.
|
||||
func NewProfiler(profilerType uint32, config uint64, pid, cpu int, opts ...int) (Profiler, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: profilerType, |
||||
Config: config, |
||||
Size: uint32(unsafe.Sizeof(unix.PerfEventAttr{})), |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeHv | unix.PerfBitInherit, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
Sample_type: PERF_SAMPLE_IDENTIFIER, |
||||
} |
||||
var eventOps int |
||||
if len(opts) > 0 { |
||||
eventOps = opts[0] |
||||
} |
||||
fd, err := unix.PerfEventOpen( |
||||
eventAttr, |
||||
pid, |
||||
cpu, |
||||
-1, |
||||
eventOps, |
||||
) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
return &profiler{ |
||||
fd: fd, |
||||
}, nil |
||||
} |
||||
|
||||
// NewCPUCycleProfiler returns a Profiler that profiles CPU cycles.
|
||||
func NewCPUCycleProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HARDWARE, |
||||
unix.PERF_COUNT_HW_CPU_CYCLES, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewInstrProfiler returns a Profiler that profiles CPU instructions.
|
||||
func NewInstrProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HARDWARE, |
||||
unix.PERF_COUNT_HW_INSTRUCTIONS, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewCacheRefProfiler returns a Profiler that profiles cache references.
|
||||
func NewCacheRefProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HARDWARE, |
||||
unix.PERF_COUNT_HW_CACHE_REFERENCES, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewCacheMissesProfiler returns a Profiler that profiles cache misses.
|
||||
func NewCacheMissesProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HARDWARE, |
||||
unix.PERF_COUNT_HW_CACHE_MISSES, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewBranchInstrProfiler returns a Profiler that profiles branch instructions.
|
||||
func NewBranchInstrProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HARDWARE, |
||||
unix.PERF_COUNT_HW_BRANCH_INSTRUCTIONS, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewBranchMissesProfiler returns a Profiler that profiles branch misses.
|
||||
func NewBranchMissesProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HARDWARE, |
||||
unix.PERF_COUNT_HW_BRANCH_MISSES, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewBusCyclesProfiler returns a Profiler that profiles bus cycles.
|
||||
func NewBusCyclesProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HARDWARE, |
||||
unix.PERF_COUNT_HW_BUS_CYCLES, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewStalledCyclesFrontProfiler returns a Profiler that profiles stalled
|
||||
// frontend cycles.
|
||||
func NewStalledCyclesFrontProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HARDWARE, |
||||
unix.PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewStalledCyclesBackProfiler returns a Profiler that profiles stalled
|
||||
// backend cycles.
|
||||
func NewStalledCyclesBackProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HARDWARE, |
||||
unix.PERF_COUNT_HW_STALLED_CYCLES_BACKEND, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewRefCPUCyclesProfiler returns a Profiler that profiles CPU cycles, it
|
||||
// is not affected by frequency scaling.
|
||||
func NewRefCPUCyclesProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HARDWARE, |
||||
unix.PERF_COUNT_HW_REF_CPU_CYCLES, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewCPUClockProfiler returns a Profiler that profiles CPU clock speed.
|
||||
func NewCPUClockProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_SOFTWARE, |
||||
unix.PERF_COUNT_SW_CPU_CLOCK, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewTaskClockProfiler returns a Profiler that profiles clock count of the
|
||||
// running task.
|
||||
func NewTaskClockProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_SOFTWARE, |
||||
unix.PERF_COUNT_SW_TASK_CLOCK, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewPageFaultProfiler returns a Profiler that profiles the number of page
|
||||
// faults.
|
||||
func NewPageFaultProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_SOFTWARE, |
||||
unix.PERF_COUNT_SW_PAGE_FAULTS, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewCtxSwitchesProfiler returns a Profiler that profiles the number of context
|
||||
// switches.
|
||||
func NewCtxSwitchesProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_SOFTWARE, |
||||
unix.PERF_COUNT_SW_CONTEXT_SWITCHES, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewCPUMigrationsProfiler returns a Profiler that profiles the number of times
|
||||
// the process has migrated to a new CPU.
|
||||
func NewCPUMigrationsProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_SOFTWARE, |
||||
unix.PERF_COUNT_SW_CPU_MIGRATIONS, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewMinorFaultsProfiler returns a Profiler that profiles the number of minor
|
||||
// page faults.
|
||||
func NewMinorFaultsProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_SOFTWARE, |
||||
unix.PERF_COUNT_SW_PAGE_FAULTS_MIN, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewMajorFaultsProfiler returns a Profiler that profiles the number of major
|
||||
// page faults.
|
||||
func NewMajorFaultsProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_SOFTWARE, |
||||
unix.PERF_COUNT_SW_PAGE_FAULTS_MAJ, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewAlignFaultsProfiler returns a Profiler that profiles the number of
|
||||
// alignment faults.
|
||||
func NewAlignFaultsProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_SOFTWARE, |
||||
unix.PERF_COUNT_SW_ALIGNMENT_FAULTS, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewEmulationFaultsProfiler returns a Profiler that profiles the number of
|
||||
// alignment faults.
|
||||
func NewEmulationFaultsProfiler(pid, cpu int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_SOFTWARE, |
||||
unix.PERF_COUNT_SW_EMULATION_FAULTS, |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewL1DataProfiler returns a Profiler that profiles L1 cache data.
|
||||
func NewL1DataProfiler(pid, cpu, op, result int, opts ...int) (Profiler, error) { |
||||
|
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HW_CACHE, |
||||
uint64((unix.PERF_COUNT_HW_CACHE_L1D)|(op<<8)|(result<<16)), |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewL1InstrProfiler returns a Profiler that profiles L1 instruction data.
|
||||
func NewL1InstrProfiler(pid, cpu, op, result int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HW_CACHE, |
||||
uint64((unix.PERF_COUNT_HW_CACHE_L1I)|(op<<8)|(result<<16)), |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewLLCacheProfiler returns a Profiler that profiles last level cache.
|
||||
func NewLLCacheProfiler(pid, cpu, op, result int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HW_CACHE, |
||||
uint64((unix.PERF_COUNT_HW_CACHE_LL)|(op<<8)|(result<<16)), |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewDataTLBProfiler returns a Profiler that profiles the data TLB.
|
||||
func NewDataTLBProfiler(pid, cpu, op, result int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HW_CACHE, |
||||
uint64((unix.PERF_COUNT_HW_CACHE_DTLB)|(op<<8)|(result<<16)), |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewInstrTLBProfiler returns a Profiler that profiles the instruction TLB.
|
||||
func NewInstrTLBProfiler(pid, cpu, op, result int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HW_CACHE, |
||||
uint64((unix.PERF_COUNT_HW_CACHE_ITLB)|(op<<8)|(result<<16)), |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewBPUProfiler returns a Profiler that profiles the BPU (branch prediction unit).
|
||||
func NewBPUProfiler(pid, cpu, op, result int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HW_CACHE, |
||||
uint64((unix.PERF_COUNT_HW_CACHE_BPU)|(op<<8)|(result<<16)), |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// NewNodeCacheProfiler returns a Profiler that profiles the node cache accesses.
|
||||
func NewNodeCacheProfiler(pid, cpu, op, result int, opts ...int) (Profiler, error) { |
||||
return NewProfiler( |
||||
unix.PERF_TYPE_HW_CACHE, |
||||
uint64((unix.PERF_COUNT_HW_CACHE_NODE)|(op<<8)|(result<<16)), |
||||
pid, |
||||
cpu, |
||||
opts..., |
||||
) |
||||
} |
||||
|
||||
// Reset is used to reset the counters of the profiler.
|
||||
func (p *profiler) Reset() error { |
||||
return unix.IoctlSetInt(p.fd, unix.PERF_EVENT_IOC_RESET, 0) |
||||
} |
||||
|
||||
// Start is used to Start the profiler.
|
||||
func (p *profiler) Start() error { |
||||
return unix.IoctlSetInt(p.fd, unix.PERF_EVENT_IOC_ENABLE, 0) |
||||
} |
||||
|
||||
// Stop is used to stop the profiler.
|
||||
func (p *profiler) Stop() error { |
||||
return unix.IoctlSetInt(p.fd, unix.PERF_EVENT_IOC_DISABLE, 0) |
||||
} |
||||
|
||||
// Profile returns the current Profile.
|
||||
func (p *profiler) Profile() (*ProfileValue, error) { |
||||
// The underlying struct that gets read from the profiler looks like:
|
||||
/* |
||||
struct read_format { |
||||
u64 value; // The value of the event
|
||||
u64 time_enabled; // if PERF_FORMAT_TOTAL_TIME_ENABLED
|
||||
u64 time_running; // if PERF_FORMAT_TOTAL_TIME_RUNNING
|
||||
u64 id; // if PERF_FORMAT_ID
|
||||
}; |
||||
*/ |
||||
|
||||
// read 24 bytes since PERF_FORMAT_TOTAL_TIME_ENABLED and
|
||||
// PERF_FORMAT_TOTAL_TIME_RUNNING are always set.
|
||||
// XXX: allow profile ids?
|
||||
buf := make([]byte, 24, 24) |
||||
_, err := syscall.Read(p.fd, buf) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
return &ProfileValue{ |
||||
Value: binary.LittleEndian.Uint64(buf[0:8]), |
||||
TimeEnabled: binary.LittleEndian.Uint64(buf[8:16]), |
||||
TimeRunning: binary.LittleEndian.Uint64(buf[16:24]), |
||||
}, nil |
||||
} |
||||
|
||||
// Close is used to close the perf context.
|
||||
func (p *profiler) Close() error { |
||||
return unix.Close(p.fd) |
||||
} |
||||
@ -0,0 +1,151 @@ |
||||
// +build linux
|
||||
|
||||
package perf |
||||
|
||||
import ( |
||||
"go.uber.org/multierr" |
||||
"golang.org/x/sys/unix" |
||||
) |
||||
|
||||
type softwareProfiler struct { |
||||
// map of perf counter type to file descriptor
|
||||
profilers map[int]Profiler |
||||
} |
||||
|
||||
// NewSoftwareProfiler returns a new software profiler.
|
||||
func NewSoftwareProfiler(pid, cpu int, opts ...int) SoftwareProfiler { |
||||
profilers := map[int]Profiler{} |
||||
|
||||
cpuClockProfiler, err := NewCPUClockProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_SW_CPU_CLOCK] = cpuClockProfiler |
||||
} |
||||
|
||||
taskClockProfiler, err := NewTaskClockProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_SW_TASK_CLOCK] = taskClockProfiler |
||||
} |
||||
|
||||
pageFaultProfiler, err := NewPageFaultProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_SW_PAGE_FAULTS] = pageFaultProfiler |
||||
} |
||||
|
||||
ctxSwitchesProfiler, err := NewCtxSwitchesProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_SW_CONTEXT_SWITCHES] = ctxSwitchesProfiler |
||||
} |
||||
|
||||
cpuMigrationsProfiler, err := NewCPUMigrationsProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_SW_CPU_MIGRATIONS] = cpuMigrationsProfiler |
||||
} |
||||
|
||||
minorFaultProfiler, err := NewMinorFaultsProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_SW_PAGE_FAULTS_MIN] = minorFaultProfiler |
||||
} |
||||
|
||||
majorFaultProfiler, err := NewMajorFaultsProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_SW_PAGE_FAULTS_MAJ] = majorFaultProfiler |
||||
} |
||||
|
||||
alignFaultsFrontProfiler, err := NewAlignFaultsProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_SW_ALIGNMENT_FAULTS] = alignFaultsFrontProfiler |
||||
} |
||||
|
||||
emuFaultProfiler, err := NewEmulationFaultsProfiler(pid, cpu, opts...) |
||||
if err == nil { |
||||
profilers[unix.PERF_COUNT_SW_EMULATION_FAULTS] = emuFaultProfiler |
||||
} |
||||
|
||||
return &softwareProfiler{ |
||||
profilers: profilers, |
||||
} |
||||
} |
||||
|
||||
// Start is used to start the SoftwareProfiler.
|
||||
func (p *softwareProfiler) Start() error { |
||||
if len(p.profilers) == 0 { |
||||
return ErrNoProfiler |
||||
} |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Start()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Reset is used to reset the SoftwareProfiler.
|
||||
func (p *softwareProfiler) Reset() error { |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Reset()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Stop is used to reset the SoftwareProfiler.
|
||||
func (p *softwareProfiler) Stop() error { |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Stop()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Close is used to reset the SoftwareProfiler.
|
||||
func (p *softwareProfiler) Close() error { |
||||
var err error |
||||
for _, profiler := range p.profilers { |
||||
err = multierr.Append(err, profiler.Close()) |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// Profile is used to read the SoftwareProfiler SoftwareProfile it returns an
|
||||
// error only if all profiles fail.
|
||||
func (p *softwareProfiler) Profile() (*SoftwareProfile, error) { |
||||
var err error |
||||
swProfile := &SoftwareProfile{} |
||||
for profilerType, profiler := range p.profilers { |
||||
profileVal, err2 := profiler.Profile() |
||||
err = multierr.Append(err, err2) |
||||
if err2 == nil { |
||||
if swProfile.TimeEnabled == nil { |
||||
swProfile.TimeEnabled = &profileVal.TimeEnabled |
||||
} |
||||
if swProfile.TimeRunning == nil { |
||||
swProfile.TimeRunning = &profileVal.TimeRunning |
||||
} |
||||
switch profilerType { |
||||
case unix.PERF_COUNT_SW_CPU_CLOCK: |
||||
swProfile.CPUClock = &profileVal.Value |
||||
case unix.PERF_COUNT_SW_TASK_CLOCK: |
||||
swProfile.TaskClock = &profileVal.Value |
||||
case unix.PERF_COUNT_SW_PAGE_FAULTS: |
||||
swProfile.PageFaults = &profileVal.Value |
||||
case unix.PERF_COUNT_SW_CONTEXT_SWITCHES: |
||||
swProfile.ContextSwitches = &profileVal.Value |
||||
case unix.PERF_COUNT_SW_CPU_MIGRATIONS: |
||||
swProfile.CPUMigrations = &profileVal.Value |
||||
case unix.PERF_COUNT_SW_PAGE_FAULTS_MIN: |
||||
swProfile.MinorPageFaults = &profileVal.Value |
||||
case unix.PERF_COUNT_SW_PAGE_FAULTS_MAJ: |
||||
swProfile.MajorPageFaults = &profileVal.Value |
||||
case unix.PERF_COUNT_SW_ALIGNMENT_FAULTS: |
||||
swProfile.AlignmentFaults = &profileVal.Value |
||||
case unix.PERF_COUNT_SW_EMULATION_FAULTS: |
||||
swProfile.EmulationFaults = &profileVal.Value |
||||
default: |
||||
} |
||||
} |
||||
} |
||||
if len(multierr.Errors(err)) == len(p.profilers) { |
||||
return nil, err |
||||
} |
||||
|
||||
return swProfile, nil |
||||
} |
||||
@ -0,0 +1,681 @@ |
||||
// +build linux
|
||||
|
||||
package perf |
||||
|
||||
import ( |
||||
"encoding/binary" |
||||
"runtime" |
||||
"syscall" |
||||
"unsafe" |
||||
|
||||
"golang.org/x/sys/unix" |
||||
) |
||||
|
||||
var ( |
||||
// EventAttrSize is the size of a PerfEventAttr
|
||||
EventAttrSize = uint32(unsafe.Sizeof(unix.PerfEventAttr{})) |
||||
) |
||||
|
||||
// profileFn is a helper function to profile a function.
|
||||
func profileFn(eventAttr *unix.PerfEventAttr, f func() error) (*ProfileValue, error) { |
||||
runtime.LockOSThread() |
||||
defer runtime.UnlockOSThread() |
||||
fd, err := unix.PerfEventOpen( |
||||
eventAttr, |
||||
unix.Gettid(), |
||||
-1, |
||||
-1, |
||||
0, |
||||
) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
if err := unix.IoctlSetInt(fd, unix.PERF_EVENT_IOC_RESET, 0); err != nil { |
||||
return nil, err |
||||
} |
||||
if err := unix.IoctlSetInt(fd, unix.PERF_EVENT_IOC_ENABLE, 0); err != nil { |
||||
return nil, err |
||||
} |
||||
if err := f(); err != nil { |
||||
return nil, err |
||||
} |
||||
if err := unix.IoctlSetInt(fd, unix.PERF_EVENT_IOC_DISABLE, 0); err != nil { |
||||
return nil, err |
||||
} |
||||
buf := make([]byte, 24) |
||||
if _, err := syscall.Read(fd, buf); err != nil { |
||||
return nil, err |
||||
} |
||||
return &ProfileValue{ |
||||
Value: binary.LittleEndian.Uint64(buf[0:8]), |
||||
TimeEnabled: binary.LittleEndian.Uint64(buf[8:16]), |
||||
TimeRunning: binary.LittleEndian.Uint64(buf[16:24]), |
||||
}, unix.Close(fd) |
||||
} |
||||
|
||||
// CPUInstructions is used to profile a function and return the number of CPU instructions.
|
||||
// Note that it will call runtime.LockOSThread to ensure accurate profilng.
|
||||
func CPUInstructions(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_INSTRUCTIONS, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// CPUInstructionsEventAttr returns a unix.PerfEventAttr configured for CPUInstructions.
|
||||
func CPUInstructionsEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_INSTRUCTIONS, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
|
||||
} |
||||
|
||||
// CPUCycles is used to profile a function and return the number of CPU cycles.
|
||||
// Note that it will call runtime.LockOSThread to ensure accurate profilng.
|
||||
func CPUCycles(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_CPU_CYCLES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// CPUCyclesEventAttr returns a unix.PerfEventAttr configured for CPUCycles.
|
||||
func CPUCyclesEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_CPU_CYCLES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
|
||||
} |
||||
|
||||
// CacheRef is used to profile a function and return the number of cache
|
||||
// references. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func CacheRef(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_CACHE_REFERENCES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// CacheRefEventAttr returns a unix.PerfEventAttr configured for CacheRef.
|
||||
func CacheRefEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_CACHE_REFERENCES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
|
||||
} |
||||
|
||||
// CacheMiss is used to profile a function and return the number of cache
|
||||
// misses. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func CacheMiss(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_CACHE_MISSES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// CacheMissEventAttr returns a unix.PerfEventAttr configured for CacheMisses.
|
||||
func CacheMissEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_CACHE_MISSES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
|
||||
} |
||||
|
||||
// BusCycles is used to profile a function and return the number of bus
|
||||
// cycles. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func BusCycles(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_BUS_CYCLES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// BusCyclesEventAttr returns a unix.PerfEventAttr configured for BusCycles.
|
||||
func BusCyclesEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_BUS_CYCLES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
|
||||
} |
||||
|
||||
// StalledFrontendCycles is used to profile a function and return the number of
|
||||
// stalled frontend cycles. Note that it will call runtime.LockOSThread to
|
||||
// ensure accurate profilng.
|
||||
func StalledFrontendCycles(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// StalledFrontendCyclesEventAttr returns a unix.PerfEventAttr configured for StalledFrontendCycles.
|
||||
func StalledFrontendCyclesEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
|
||||
} |
||||
|
||||
// StalledBackendCycles is used to profile a function and return the number of
|
||||
// stalled backend cycles. Note that it will call runtime.LockOSThread to
|
||||
// ensure accurate profilng.
|
||||
func StalledBackendCycles(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_STALLED_CYCLES_BACKEND, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// StalledBackendCyclesEventAttr returns a unix.PerfEventAttr configured for StalledBackendCycles.
|
||||
func StalledBackendCyclesEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_STALLED_CYCLES_BACKEND, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
|
||||
} |
||||
|
||||
// CPURefCycles is used to profile a function and return the number of CPU
|
||||
// references cycles which are not affected by frequency scaling. Note that it
|
||||
// will call runtime.LockOSThread to ensure accurate profilng.
|
||||
func CPURefCycles(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_REF_CPU_CYCLES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// CPURefCyclesEventAttr returns a unix.PerfEventAttr configured for CPURefCycles.
|
||||
func CPURefCyclesEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HARDWARE, |
||||
Config: unix.PERF_COUNT_HW_REF_CPU_CYCLES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
|
||||
} |
||||
|
||||
// CPUClock is used to profile a function and return the CPU clock timer. Note
|
||||
// that it will call runtime.LockOSThread to ensure accurate profilng.
|
||||
func CPUClock(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_CPU_CLOCK, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// CPUClockEventAttr returns a unix.PerfEventAttr configured for CPUClock.
|
||||
func CPUClockEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_CPU_CLOCK, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// CPUTaskClock is used to profile a function and return the CPU clock timer
|
||||
// for the running task. Note that it will call runtime.LockOSThread to ensure
|
||||
// accurate profilng.
|
||||
func CPUTaskClock(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_TASK_CLOCK, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// CPUTaskClockEventAttr returns a unix.PerfEventAttr configured for CPUTaskClock.
|
||||
func CPUTaskClockEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_TASK_CLOCK, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
|
||||
} |
||||
|
||||
// PageFaults is used to profile a function and return the number of page
|
||||
// faults. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func PageFaults(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_PAGE_FAULTS, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// PageFaultsEventAttr returns a unix.PerfEventAttr configured for PageFaults.
|
||||
func PageFaultsEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_PAGE_FAULTS, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// ContextSwitches is used to profile a function and return the number of
|
||||
// context switches. Note that it will call runtime.LockOSThread to ensure
|
||||
// accurate profilng.
|
||||
func ContextSwitches(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_CONTEXT_SWITCHES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// ContextSwitchesEventAttr returns a unix.PerfEventAttr configured for ContextSwitches.
|
||||
func ContextSwitchesEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_CONTEXT_SWITCHES, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// CPUMigrations is used to profile a function and return the number of times
|
||||
// the thread has been migrated to a new CPU. Note that it will call
|
||||
// runtime.LockOSThread to ensure accurate profilng.
|
||||
func CPUMigrations(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_CPU_MIGRATIONS, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// CPUMigrationsEventAttr returns a unix.PerfEventAttr configured for CPUMigrations.
|
||||
func CPUMigrationsEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_CPU_MIGRATIONS, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// MinorPageFaults is used to profile a function and return the number of minor
|
||||
// page faults. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func MinorPageFaults(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_PAGE_FAULTS_MIN, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// MinorPageFaultsEventAttr returns a unix.PerfEventAttr configured for MinorPageFaults.
|
||||
func MinorPageFaultsEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_PAGE_FAULTS_MIN, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// MajorPageFaults is used to profile a function and return the number of major
|
||||
// page faults. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func MajorPageFaults(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_PAGE_FAULTS_MAJ, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// MajorPageFaultsEventAttr returns a unix.PerfEventAttr configured for MajorPageFaults.
|
||||
func MajorPageFaultsEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_PAGE_FAULTS_MAJ, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// AlignmentFaults is used to profile a function and return the number of alignment
|
||||
// faults. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func AlignmentFaults(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_ALIGNMENT_FAULTS, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// AlignmentFaultsEventAttr returns a unix.PerfEventAttr configured for AlignmentFaults.
|
||||
func AlignmentFaultsEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_ALIGNMENT_FAULTS, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// EmulationFaults is used to profile a function and return the number of emulation
|
||||
// faults. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func EmulationFaults(f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_EMULATION_FAULTS, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// EmulationFaultsEventAttr returns a unix.PerfEventAttr configured for EmulationFaults.
|
||||
func EmulationFaultsEventAttr() unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_SOFTWARE, |
||||
Config: unix.PERF_COUNT_SW_EMULATION_FAULTS, |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// L1Data is used to profile a function and the L1 data cache faults. Use
|
||||
// PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_OP_WRITE, or
|
||||
// PERF_COUNT_HW_CACHE_OP_PREFETCH for the opt and
|
||||
// PERF_COUNT_HW_CACHE_RESULT_ACCESS or PERF_COUNT_HW_CACHE_RESULT_MISS for the
|
||||
// result. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func L1Data(op, result int, f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_L1D) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// L1DataEventAttr returns a unix.PerfEventAttr configured for L1Data.
|
||||
func L1DataEventAttr(op, result int) unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_L1D) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// L1Instructions is used to profile a function for the instruction level L1
|
||||
// cache. Use PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_OP_WRITE, or
|
||||
// PERF_COUNT_HW_CACHE_OP_PREFETCH for the opt and
|
||||
// PERF_COUNT_HW_CACHE_RESULT_ACCESS or PERF_COUNT_HW_CACHE_RESULT_MISS for the
|
||||
// result. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func L1Instructions(op, result int, f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_L1I) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// L1InstructionsEventAttr returns a unix.PerfEventAttr configured for L1Instructions.
|
||||
func L1InstructionsEventAttr(op, result int) unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_L1I) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// LLCache is used to profile a function and return the number of emulation
|
||||
// PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_OP_WRITE, or
|
||||
// PERF_COUNT_HW_CACHE_OP_PREFETCH for the opt and
|
||||
// PERF_COUNT_HW_CACHE_RESULT_ACCESS or PERF_COUNT_HW_CACHE_RESULT_MISS for the
|
||||
// result. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func LLCache(op, result int, f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_LL) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// LLCacheEventAttr returns a unix.PerfEventAttr configured for LLCache.
|
||||
func LLCacheEventAttr(op, result int) unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_LL) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// DataTLB is used to profile the data TLB. Use PERF_COUNT_HW_CACHE_OP_READ,
|
||||
// PERF_COUNT_HW_CACHE_OP_WRITE, or PERF_COUNT_HW_CACHE_OP_PREFETCH for the opt
|
||||
// and PERF_COUNT_HW_CACHE_RESULT_ACCESS or PERF_COUNT_HW_CACHE_RESULT_MISS for
|
||||
// the result. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func DataTLB(op, result int, f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_DTLB) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// DataTLBEventAttr returns a unix.PerfEventAttr configured for DataTLB.
|
||||
func DataTLBEventAttr(op, result int) unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_DTLB) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// InstructionTLB is used to profile the instruction TLB. Use
|
||||
// PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_OP_WRITE, or
|
||||
// PERF_COUNT_HW_CACHE_OP_PREFETCH for the opt and
|
||||
// PERF_COUNT_HW_CACHE_RESULT_ACCESS or PERF_COUNT_HW_CACHE_RESULT_MISS for the
|
||||
// result. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func InstructionTLB(op, result int, f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_ITLB) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// InstructionTLBEventAttr returns a unix.PerfEventAttr configured for InstructionTLB.
|
||||
func InstructionTLBEventAttr(op, result int) unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_ITLB) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
|
||||
} |
||||
|
||||
// BPU is used to profile a function for the Branch Predictor Unit.
|
||||
// Use PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_OP_WRITE, or
|
||||
// PERF_COUNT_HW_CACHE_OP_PREFETCH for the opt and
|
||||
// PERF_COUNT_HW_CACHE_RESULT_ACCESS or PERF_COUNT_HW_CACHE_RESULT_MISS for the
|
||||
// result. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func BPU(op, result int, f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_BPU) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// BPUEventAttr returns a unix.PerfEventAttr configured for BPU events.
|
||||
func BPUEventAttr(op, result int) unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_BPU) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
|
||||
// NodeCache is used to profile a function for NUMA operations. Use Use
|
||||
// PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_OP_WRITE, or
|
||||
// PERF_COUNT_HW_CACHE_OP_PREFETCH for the opt and
|
||||
// PERF_COUNT_HW_CACHE_RESULT_ACCESS or PERF_COUNT_HW_CACHE_RESULT_MISS for the
|
||||
// result. Note that it will call runtime.LockOSThread to ensure accurate
|
||||
// profilng.
|
||||
func NodeCache(op, result int, f func() error) (*ProfileValue, error) { |
||||
eventAttr := &unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_NODE) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitDisabled | unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
return profileFn(eventAttr, f) |
||||
} |
||||
|
||||
// NodeCacheEventAttr returns a unix.PerfEventAttr configured for NUMA cache operations.
|
||||
func NodeCacheEventAttr(op, result int) unix.PerfEventAttr { |
||||
return unix.PerfEventAttr{ |
||||
Type: unix.PERF_TYPE_HW_CACHE, |
||||
Config: uint64((unix.PERF_COUNT_HW_CACHE_NODE) | (op << 8) | (result << 16)), |
||||
Size: EventAttrSize, |
||||
Bits: unix.PerfBitExcludeKernel | unix.PerfBitExcludeHv, |
||||
Read_format: unix.PERF_FORMAT_TOTAL_TIME_RUNNING | unix.PERF_FORMAT_TOTAL_TIME_ENABLED, |
||||
} |
||||
} |
||||
@ -0,0 +1,15 @@ |
||||
coverage: |
||||
range: 80..100 |
||||
round: down |
||||
precision: 2 |
||||
|
||||
status: |
||||
project: # measuring the overall project coverage |
||||
default: # context, you can create multiple ones with custom titles |
||||
enabled: yes # must be yes|true to enable this status |
||||
target: 100 # specify the target coverage for each commit status |
||||
# option: "auto" (must increase from parent commit or pull request base) |
||||
# option: "X%" a static target percentage to hit |
||||
if_not_found: success # if parent is not found report status as success, error, or failure |
||||
if_ci_failed: error # if ci fails report status as success, error, or failure |
||||
|
||||
@ -0,0 +1,11 @@ |
||||
.DS_Store |
||||
/vendor |
||||
/cover |
||||
cover.out |
||||
lint.log |
||||
|
||||
# Binaries |
||||
*.test |
||||
|
||||
# Profiling output |
||||
*.prof |
||||
@ -0,0 +1,23 @@ |
||||
sudo: false |
||||
language: go |
||||
go_import_path: go.uber.org/atomic |
||||
|
||||
go: |
||||
- 1.7 |
||||
- 1.8 |
||||
- 1.9 |
||||
|
||||
cache: |
||||
directories: |
||||
- vendor |
||||
|
||||
install: |
||||
- make install_ci |
||||
|
||||
script: |
||||
- make test_ci |
||||
- scripts/test-ubergo.sh |
||||
- make lint |
||||
|
||||
after_success: |
||||
- bash <(curl -s https://codecov.io/bash) |
||||
@ -0,0 +1,19 @@ |
||||
Copyright (c) 2016 Uber Technologies, Inc. |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
of this software and associated documentation files (the "Software"), to deal |
||||
in the Software without restriction, including without limitation the rights |
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
copies of the Software, and to permit persons to whom the Software is |
||||
furnished to do so, subject to the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be included in |
||||
all copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
||||
THE SOFTWARE. |
||||
@ -0,0 +1,64 @@ |
||||
PACKAGES := $(shell glide nv)
|
||||
# Many Go tools take file globs or directories as arguments instead of packages.
|
||||
PACKAGE_FILES ?= *.go
|
||||
|
||||
|
||||
# The linting tools evolve with each Go version, so run them only on the latest
|
||||
# stable release.
|
||||
GO_VERSION := $(shell go version | cut -d " " -f 3)
|
||||
GO_MINOR_VERSION := $(word 2,$(subst ., ,$(GO_VERSION)))
|
||||
LINTABLE_MINOR_VERSIONS := 7 8
|
||||
ifneq ($(filter $(LINTABLE_MINOR_VERSIONS),$(GO_MINOR_VERSION)),) |
||||
SHOULD_LINT := true
|
||||
endif |
||||
|
||||
|
||||
export GO15VENDOREXPERIMENT=1
|
||||
|
||||
|
||||
.PHONY: build |
||||
build: |
||||
go build -i $(PACKAGES)
|
||||
|
||||
|
||||
.PHONY: install |
||||
install: |
||||
glide --version || go get github.com/Masterminds/glide
|
||||
glide install
|
||||
|
||||
|
||||
.PHONY: test |
||||
test: |
||||
go test -cover -race $(PACKAGES)
|
||||
|
||||
|
||||
.PHONY: install_ci |
||||
install_ci: install |
||||
go get github.com/wadey/gocovmerge
|
||||
go get github.com/mattn/goveralls
|
||||
go get golang.org/x/tools/cmd/cover
|
||||
ifdef SHOULD_LINT |
||||
go get github.com/golang/lint/golint
|
||||
endif |
||||
|
||||
.PHONY: lint |
||||
lint: |
||||
ifdef SHOULD_LINT |
||||
@rm -rf lint.log
|
||||
@echo "Checking formatting..."
|
||||
@gofmt -d -s $(PACKAGE_FILES) 2>&1 | tee lint.log
|
||||
@echo "Checking vet..."
|
||||
@$(foreach dir,$(PACKAGE_FILES),go tool vet $(dir) 2>&1 | tee -a lint.log;)
|
||||
@echo "Checking lint..."
|
||||
@$(foreach dir,$(PKGS),golint $(dir) 2>&1 | tee -a lint.log;)
|
||||
@echo "Checking for unresolved FIXMEs..."
|
||||
@git grep -i fixme | grep -v -e vendor -e Makefile | tee -a lint.log
|
||||
@[ ! -s lint.log ]
|
||||
else |
||||
@echo "Skipping linters on" $(GO_VERSION)
|
||||
endif |
||||
|
||||
|
||||
.PHONY: test_ci |
||||
test_ci: install_ci build |
||||
./scripts/cover.sh $(shell go list $(PACKAGES))
|
||||
@ -0,0 +1,36 @@ |
||||
# atomic [![GoDoc][doc-img]][doc] [![Build Status][ci-img]][ci] [![Coverage Status][cov-img]][cov] [![Go Report Card][reportcard-img]][reportcard] |
||||
|
||||
Simple wrappers for primitive types to enforce atomic access. |
||||
|
||||
## Installation |
||||
`go get -u go.uber.org/atomic` |
||||
|
||||
## Usage |
||||
The standard library's `sync/atomic` is powerful, but it's easy to forget which |
||||
variables must be accessed atomically. `go.uber.org/atomic` preserves all the |
||||
functionality of the standard library, but wraps the primitive types to |
||||
provide a safer, more convenient API. |
||||
|
||||
```go |
||||
var atom atomic.Uint32 |
||||
atom.Store(42) |
||||
atom.Sub(2) |
||||
atom.CAS(40, 11) |
||||
``` |
||||
|
||||
See the [documentation][doc] for a complete API specification. |
||||
|
||||
## Development Status |
||||
Stable. |
||||
|
||||
<hr> |
||||
Released under the [MIT License](LICENSE.txt). |
||||
|
||||
[doc-img]: https://godoc.org/github.com/uber-go/atomic?status.svg |
||||
[doc]: https://godoc.org/go.uber.org/atomic |
||||
[ci-img]: https://travis-ci.org/uber-go/atomic.svg?branch=master |
||||
[ci]: https://travis-ci.org/uber-go/atomic |
||||
[cov-img]: https://codecov.io/gh/uber-go/atomic/branch/master/graph/badge.svg |
||||
[cov]: https://codecov.io/gh/uber-go/atomic |
||||
[reportcard-img]: https://goreportcard.com/badge/go.uber.org/atomic |
||||
[reportcard]: https://goreportcard.com/report/go.uber.org/atomic |
||||
@ -0,0 +1,351 @@ |
||||
// Copyright (c) 2016 Uber Technologies, Inc.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
// Package atomic provides simple wrappers around numerics to enforce atomic
|
||||
// access.
|
||||
package atomic |
||||
|
||||
import ( |
||||
"math" |
||||
"sync/atomic" |
||||
"time" |
||||
) |
||||
|
||||
// Int32 is an atomic wrapper around an int32.
type Int32 struct{ v int32 }

// NewInt32 creates an Int32 holding the given initial value.
func NewInt32(initial int32) *Int32 {
	return &Int32{v: initial}
}

// Load atomically loads the wrapped value.
func (i *Int32) Load() int32 {
	return atomic.LoadInt32(&i.v)
}

// Add atomically adds delta to the wrapped int32 and returns the new value.
func (i *Int32) Add(delta int32) int32 {
	return atomic.AddInt32(&i.v, delta)
}

// Sub atomically subtracts delta from the wrapped int32 and returns the new value.
func (i *Int32) Sub(delta int32) int32 {
	return atomic.AddInt32(&i.v, -delta)
}

// Inc atomically increments the wrapped int32 and returns the new value.
func (i *Int32) Inc() int32 {
	return i.Add(1)
}

// Dec atomically decrements the wrapped int32 and returns the new value.
func (i *Int32) Dec() int32 {
	return i.Sub(1)
}

// CAS is an atomic compare-and-swap: the value becomes new only if it
// currently equals old, and the return reports whether the swap happened.
func (i *Int32) CAS(old, new int32) bool {
	return atomic.CompareAndSwapInt32(&i.v, old, new)
}

// Store atomically stores the passed value.
func (i *Int32) Store(n int32) {
	atomic.StoreInt32(&i.v, n)
}

// Swap atomically swaps in the given value and returns the old value.
func (i *Int32) Swap(n int32) int32 {
	return atomic.SwapInt32(&i.v, n)
}
||||
|
||||
// Int64 is an atomic wrapper around an int64.
type Int64 struct{ v int64 }

// NewInt64 creates an Int64 holding the given initial value.
func NewInt64(initial int64) *Int64 {
	return &Int64{v: initial}
}

// Load atomically loads the wrapped value.
func (i *Int64) Load() int64 {
	return atomic.LoadInt64(&i.v)
}

// Add atomically adds delta to the wrapped int64 and returns the new value.
func (i *Int64) Add(delta int64) int64 {
	return atomic.AddInt64(&i.v, delta)
}

// Sub atomically subtracts delta from the wrapped int64 and returns the new value.
func (i *Int64) Sub(delta int64) int64 {
	return atomic.AddInt64(&i.v, -delta)
}

// Inc atomically increments the wrapped int64 and returns the new value.
func (i *Int64) Inc() int64 {
	return i.Add(1)
}

// Dec atomically decrements the wrapped int64 and returns the new value.
func (i *Int64) Dec() int64 {
	return i.Sub(1)
}

// CAS is an atomic compare-and-swap: the value becomes new only if it
// currently equals old, and the return reports whether the swap happened.
func (i *Int64) CAS(old, new int64) bool {
	return atomic.CompareAndSwapInt64(&i.v, old, new)
}

// Store atomically stores the passed value.
func (i *Int64) Store(n int64) {
	atomic.StoreInt64(&i.v, n)
}

// Swap atomically swaps in the given value and returns the old value.
func (i *Int64) Swap(n int64) int64 {
	return atomic.SwapInt64(&i.v, n)
}
||||
|
||||
// Uint32 is an atomic wrapper around an uint32.
type Uint32 struct{ v uint32 }

// NewUint32 creates a Uint32 holding the given initial value.
func NewUint32(initial uint32) *Uint32 {
	return &Uint32{v: initial}
}

// Load atomically loads the wrapped value.
func (i *Uint32) Load() uint32 {
	return atomic.LoadUint32(&i.v)
}

// Add atomically adds delta to the wrapped uint32 and returns the new value.
func (i *Uint32) Add(delta uint32) uint32 {
	return atomic.AddUint32(&i.v, delta)
}

// Sub atomically subtracts delta from the wrapped uint32 and returns the new value.
func (i *Uint32) Sub(delta uint32) uint32 {
	// ^(delta - 1) is the two's-complement negation of delta.
	return atomic.AddUint32(&i.v, ^(delta - 1))
}

// Inc atomically increments the wrapped uint32 and returns the new value.
func (i *Uint32) Inc() uint32 {
	return i.Add(1)
}

// Dec atomically decrements the wrapped uint32 and returns the new value.
func (i *Uint32) Dec() uint32 {
	return i.Sub(1)
}

// CAS is an atomic compare-and-swap: the value becomes new only if it
// currently equals old, and the return reports whether the swap happened.
func (i *Uint32) CAS(old, new uint32) bool {
	return atomic.CompareAndSwapUint32(&i.v, old, new)
}

// Store atomically stores the passed value.
func (i *Uint32) Store(n uint32) {
	atomic.StoreUint32(&i.v, n)
}

// Swap atomically swaps in the given value and returns the old value.
func (i *Uint32) Swap(n uint32) uint32 {
	return atomic.SwapUint32(&i.v, n)
}
||||
|
||||
// Uint64 is an atomic wrapper around a uint64.
type Uint64 struct{ v uint64 }

// NewUint64 creates a Uint64 holding the given initial value.
func NewUint64(initial uint64) *Uint64 {
	return &Uint64{v: initial}
}

// Load atomically loads the wrapped value.
func (i *Uint64) Load() uint64 {
	return atomic.LoadUint64(&i.v)
}

// Add atomically adds delta to the wrapped uint64 and returns the new value.
func (i *Uint64) Add(delta uint64) uint64 {
	return atomic.AddUint64(&i.v, delta)
}

// Sub atomically subtracts delta from the wrapped uint64 and returns the new value.
func (i *Uint64) Sub(delta uint64) uint64 {
	// ^(delta - 1) is the two's-complement negation of delta.
	return atomic.AddUint64(&i.v, ^(delta - 1))
}

// Inc atomically increments the wrapped uint64 and returns the new value.
func (i *Uint64) Inc() uint64 {
	return i.Add(1)
}

// Dec atomically decrements the wrapped uint64 and returns the new value.
func (i *Uint64) Dec() uint64 {
	return i.Sub(1)
}

// CAS is an atomic compare-and-swap: the value becomes new only if it
// currently equals old, and the return reports whether the swap happened.
func (i *Uint64) CAS(old, new uint64) bool {
	return atomic.CompareAndSwapUint64(&i.v, old, new)
}

// Store atomically stores the passed value.
func (i *Uint64) Store(n uint64) {
	atomic.StoreUint64(&i.v, n)
}

// Swap atomically swaps in the given value and returns the old value.
func (i *Uint64) Swap(n uint64) uint64 {
	return atomic.SwapUint64(&i.v, n)
}
||||
|
||||
// Bool is an atomic Boolean, represented as a uint32 whose low bit carries
// the truth value.
type Bool struct{ v uint32 }

// NewBool creates a Bool with the given initial value.
func NewBool(initial bool) *Bool {
	return &Bool{v: boolToInt(initial)}
}

// Load atomically loads the Boolean.
func (b *Bool) Load() bool {
	return truthy(atomic.LoadUint32(&b.v))
}

// CAS is an atomic compare-and-swap.
func (b *Bool) CAS(old, new bool) bool {
	return atomic.CompareAndSwapUint32(&b.v, boolToInt(old), boolToInt(new))
}

// Store atomically stores the passed value.
func (b *Bool) Store(new bool) {
	atomic.StoreUint32(&b.v, boolToInt(new))
}

// Swap sets the given value and returns the previous value.
func (b *Bool) Swap(new bool) bool {
	return truthy(atomic.SwapUint32(&b.v, boolToInt(new)))
}

// Toggle atomically negates the Boolean and returns the previous value.
// Incrementing the counter flips the low bit, which is all truthy inspects.
func (b *Bool) Toggle() bool {
	return truthy(atomic.AddUint32(&b.v, 1) - 1)
}

// truthy reports whether the low bit of n is set.
func truthy(n uint32) bool {
	return n&1 == 1
}

// boolToInt converts a bool to its uint32 representation (1 or 0).
func boolToInt(b bool) uint32 {
	if b {
		return 1
	}
	return 0
}
||||
|
||||
// Float64 is an atomic wrapper around float64. The value is kept as its
// IEEE-754 bit pattern inside a uint64.
type Float64 struct {
	v uint64
}

// NewFloat64 creates a Float64 with the given initial value.
func NewFloat64(f float64) *Float64 {
	return &Float64{v: math.Float64bits(f)}
}

// Load atomically loads the wrapped value.
func (f *Float64) Load() float64 {
	return math.Float64frombits(atomic.LoadUint64(&f.v))
}

// Store atomically stores the passed value.
func (f *Float64) Store(s float64) {
	atomic.StoreUint64(&f.v, math.Float64bits(s))
}

// Add atomically adds s to the wrapped float64 and returns the new value.
// There is no hardware float add, so this retries a CAS loop until it wins.
func (f *Float64) Add(s float64) float64 {
	for {
		cur := f.Load()
		next := cur + s
		if f.CAS(cur, next) {
			return next
		}
	}
}

// Sub atomically subtracts s from the wrapped float64 and returns the new value.
func (f *Float64) Sub(s float64) float64 {
	return f.Add(-s)
}

// CAS is an atomic compare-and-swap. The comparison is on the exact bit
// pattern of old, not on float equality.
func (f *Float64) CAS(old, new float64) bool {
	return atomic.CompareAndSwapUint64(&f.v, math.Float64bits(old), math.Float64bits(new))
}
||||
|
||||
// Duration is an atomic wrapper around time.Duration
|
||||
// https://godoc.org/time#Duration
|
||||
type Duration struct { |
||||
v Int64 |
||||
} |
||||
|
||||
// NewDuration creates a Duration.
|
||||
func NewDuration(d time.Duration) *Duration { |
||||
return &Duration{v: *NewInt64(int64(d))} |
||||
} |
||||
|
||||
// Load atomically loads the wrapped value.
|
||||
func (d *Duration) Load() time.Duration { |
||||
return time.Duration(d.v.Load()) |
||||
} |
||||
|
||||
// Store atomically stores the passed value.
|
||||
func (d *Duration) Store(n time.Duration) { |
||||
d.v.Store(int64(n)) |
||||
} |
||||
|
||||
// Add atomically adds to the wrapped time.Duration and returns the new value.
|
||||
func (d *Duration) Add(n time.Duration) time.Duration { |
||||
return time.Duration(d.v.Add(int64(n))) |
||||
} |
||||
|
||||
// Sub atomically subtracts from the wrapped time.Duration and returns the new value.
|
||||
func (d *Duration) Sub(n time.Duration) time.Duration { |
||||
return time.Duration(d.v.Sub(int64(n))) |
||||
} |
||||
|
||||
// Swap atomically swaps the wrapped time.Duration and returns the old value.
|
||||
func (d *Duration) Swap(n time.Duration) time.Duration { |
||||
return time.Duration(d.v.Swap(int64(n))) |
||||
} |
||||
|
||||
// CAS is an atomic compare-and-swap: it stores new and reports true only
// if the wrapped value still equals old.
func (d *Duration) CAS(old, new time.Duration) bool {
	return d.v.CAS(int64(old), int64(new))
}
||||
|
||||
// Value shadows the type of the same name from sync/atomic.
// https://godoc.org/sync/atomic#Value
type Value struct{ atomic.Value }
||||
@ -0,0 +1,17 @@ |
||||
hash: f14d51408e3e0e4f73b34e4039484c78059cd7fc5f4996fdd73db20dc8d24f53 |
||||
updated: 2016-10-27T00:10:51.16960137-07:00 |
||||
imports: [] |
||||
testImports: |
||||
- name: github.com/davecgh/go-spew |
||||
version: 5215b55f46b2b919f50a1df0eaa5886afe4e3b3d |
||||
subpackages: |
||||
- spew |
||||
- name: github.com/pmezard/go-difflib |
||||
version: d8ed2627bdf02c080bf22230dbb337003b7aba2d |
||||
subpackages: |
||||
- difflib |
||||
- name: github.com/stretchr/testify |
||||
version: d77da356e56a7428ad25149ca77381849a6a5232 |
||||
subpackages: |
||||
- assert |
||||
- require |
||||
@ -0,0 +1,6 @@ |
||||
package: go.uber.org/atomic |
||||
testImport: |
||||
- package: github.com/stretchr/testify |
||||
subpackages: |
||||
- assert |
||||
- require |
||||
@ -0,0 +1,49 @@ |
||||
// Copyright (c) 2016 Uber Technologies, Inc.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
package atomic |
||||
|
||||
// String is an atomic type-safe wrapper around Value for strings.
type String struct{ v Value } // zero Value reads back as "" via Load
||||
|
||||
// NewString creates a String.
|
||||
func NewString(str string) *String { |
||||
s := &String{} |
||||
if str != "" { |
||||
s.Store(str) |
||||
} |
||||
return s |
||||
} |
||||
|
||||
// Load atomically loads the wrapped string.
|
||||
func (s *String) Load() string { |
||||
v := s.v.Load() |
||||
if v == nil { |
||||
return "" |
||||
} |
||||
return v.(string) |
||||
} |
||||
|
||||
// Store atomically stores the passed string.
//
// Note: Converting the string to an interface{} to store in the Value
// requires an allocation.
func (s *String) Store(str string) {
	s.v.Store(str)
}
||||
@ -0,0 +1,15 @@ |
||||
coverage: |
||||
range: 80..100 |
||||
round: down |
||||
precision: 2 |
||||
|
||||
status: |
||||
project: # measuring the overall project coverage |
||||
default: # context, you can create multiple ones with custom titles |
||||
enabled: yes # must be yes|true to enable this status |
||||
target: 100 # specify the target coverage for each commit status |
||||
# option: "auto" (must increase from parent commit or pull request base) |
||||
# option: "X%" a static target percentage to hit |
||||
if_not_found: success # if parent is not found report status as success, error, or failure |
||||
if_ci_failed: error # if ci fails report status as success, error, or failure |
||||
|
||||
@ -0,0 +1 @@ |
||||
/vendor |
||||
@ -0,0 +1,33 @@ |
||||
sudo: false |
||||
language: go |
||||
go_import_path: go.uber.org/multierr |
||||
|
||||
env: |
||||
global: |
||||
- GO15VENDOREXPERIMENT=1 |
||||
|
||||
go: |
||||
- 1.7 |
||||
- 1.8 |
||||
- tip |
||||
|
||||
cache: |
||||
directories: |
||||
- vendor |
||||
|
||||
before_install: |
||||
- go version |
||||
|
||||
install: |
||||
- | |
||||
set -e |
||||
make install_ci |
||||
|
||||
script: |
||||
- | |
||||
set -e |
||||
make lint |
||||
make test_ci |
||||
|
||||
after_success: |
||||
- bash <(curl -s https://codecov.io/bash) |
||||
@ -0,0 +1,28 @@ |
||||
Releases |
||||
======== |
||||
|
||||
v1.1.0 (2017-06-30) |
||||
=================== |
||||
|
||||
- Added an `Errors(error) []error` function to extract the underlying list of |
||||
errors for a multierr error. |
||||
|
||||
|
||||
v1.0.0 (2017-05-31) |
||||
=================== |
||||
|
||||
No changes since v0.2.0. This release is committing to making no breaking |
||||
changes to the current API in the 1.X series. |
||||
|
||||
|
||||
v0.2.0 (2017-04-11) |
||||
=================== |
||||
|
||||
- Repeatedly appending to the same error is now faster due to fewer |
||||
allocations. |
||||
|
||||
|
||||
v0.1.0 (2017-31-03) |
||||
=================== |
||||
|
||||
- Initial release |
||||
@ -0,0 +1,19 @@ |
||||
Copyright (c) 2017 Uber Technologies, Inc. |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
of this software and associated documentation files (the "Software"), to deal |
||||
in the Software without restriction, including without limitation the rights |
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
copies of the Software, and to permit persons to whom the Software is |
||||
furnished to do so, subject to the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be included in |
||||
all copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
||||
THE SOFTWARE. |
||||
@ -0,0 +1,74 @@ |
||||
export GO15VENDOREXPERIMENT=1
|
||||
|
||||
PACKAGES := $(shell glide nv)
|
||||
|
||||
GO_FILES := $(shell \
|
||||
find . '(' -path '*/.*' -o -path './vendor' ')' -prune \
|
||||
-o -name '*.go' -print | cut -b3-)
|
||||
|
||||
.PHONY: install |
||||
install: |
||||
glide --version || go get github.com/Masterminds/glide
|
||||
glide install
|
||||
|
||||
.PHONY: build |
||||
build: |
||||
go build -i $(PACKAGES)
|
||||
|
||||
.PHONY: test |
||||
test: |
||||
go test -cover -race $(PACKAGES)
|
||||
|
||||
.PHONY: gofmt |
||||
gofmt: |
||||
$(eval FMT_LOG := $(shell mktemp -t gofmt.XXXXX))
|
||||
@gofmt -e -s -l $(GO_FILES) > $(FMT_LOG) || true
|
||||
@[ ! -s "$(FMT_LOG)" ] || (echo "gofmt failed:" | cat - $(FMT_LOG) && false)
|
||||
|
||||
.PHONY: govet |
||||
govet: |
||||
$(eval VET_LOG := $(shell mktemp -t govet.XXXXX))
|
||||
@go vet $(PACKAGES) 2>&1 \
|
||||
| grep -v '^exit status' > $(VET_LOG) || true
|
||||
@[ ! -s "$(VET_LOG)" ] || (echo "govet failed:" | cat - $(VET_LOG) && false)
|
||||
|
||||
.PHONY: golint |
||||
golint: |
||||
@go get github.com/golang/lint/golint
|
||||
$(eval LINT_LOG := $(shell mktemp -t golint.XXXXX))
|
||||
@cat /dev/null > $(LINT_LOG)
|
||||
@$(foreach pkg, $(PACKAGES), golint $(pkg) >> $(LINT_LOG) || true;)
|
||||
@[ ! -s "$(LINT_LOG)" ] || (echo "golint failed:" | cat - $(LINT_LOG) && false)
|
||||
|
||||
.PHONY: staticcheck |
||||
staticcheck: |
||||
@go get honnef.co/go/tools/cmd/staticcheck
|
||||
$(eval STATICCHECK_LOG := $(shell mktemp -t staticcheck.XXXXX))
|
||||
@staticcheck $(PACKAGES) 2>&1 > $(STATICCHECK_LOG) || true
|
||||
@[ ! -s "$(STATICCHECK_LOG)" ] || (echo "staticcheck failed:" | cat - $(STATICCHECK_LOG) && false)
|
||||
|
||||
.PHONY: lint |
||||
lint: gofmt govet golint staticcheck |
||||
|
||||
.PHONY: cover |
||||
cover: |
||||
./scripts/cover.sh $(shell go list $(PACKAGES))
|
||||
go tool cover -html=cover.out -o cover.html
|
||||
|
||||
update-license: |
||||
@go get go.uber.org/tools/update-license
|
||||
@update-license \
|
||||
$(shell go list -json $(PACKAGES) | \
|
||||
jq -r '.Dir + "/" + (.GoFiles | .[])')
|
||||
|
||||
##############################################################################
|
||||
|
||||
.PHONY: install_ci |
||||
install_ci: install |
||||
go get github.com/wadey/gocovmerge
|
||||
go get github.com/mattn/goveralls
|
||||
go get golang.org/x/tools/cmd/cover
|
||||
|
||||
.PHONY: test_ci |
||||
test_ci: install_ci |
||||
./scripts/cover.sh $(shell go list $(PACKAGES))
|
||||
@ -0,0 +1,23 @@ |
||||
# multierr [![GoDoc][doc-img]][doc] [![Build Status][ci-img]][ci] [![Coverage Status][cov-img]][cov] |
||||
|
||||
`multierr` allows combining one or more Go `error`s together. |
||||
|
||||
## Installation |
||||
|
||||
go get -u go.uber.org/multierr |
||||
|
||||
## Status |
||||
|
||||
Stable: No breaking changes will be made before 2.0. |
||||
|
||||
------------------------------------------------------------------------------- |
||||
|
||||
Released under the [MIT License]. |
||||
|
||||
[MIT License]: LICENSE.txt |
||||
[doc-img]: https://godoc.org/go.uber.org/multierr?status.svg |
||||
[doc]: https://godoc.org/go.uber.org/multierr |
||||
[ci-img]: https://travis-ci.org/uber-go/multierr.svg?branch=master |
||||
[cov-img]: https://codecov.io/gh/uber-go/multierr/branch/master/graph/badge.svg |
||||
[ci]: https://travis-ci.org/uber-go/multierr |
||||
[cov]: https://codecov.io/gh/uber-go/multierr |
||||
@ -0,0 +1,401 @@ |
||||
// Copyright (c) 2017 Uber Technologies, Inc.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
// Package multierr allows combining one or more errors together.
|
||||
//
|
||||
// Overview
|
||||
//
|
||||
// Errors can be combined with the use of the Combine function.
|
||||
//
|
||||
// multierr.Combine(
|
||||
// reader.Close(),
|
||||
// writer.Close(),
|
||||
// conn.Close(),
|
||||
// )
|
||||
//
|
||||
// If only two errors are being combined, the Append function may be used
|
||||
// instead.
|
||||
//
|
||||
// err = multierr.Combine(reader.Close(), writer.Close())
|
||||
//
|
||||
// This makes it possible to record resource cleanup failures from deferred
|
||||
// blocks with the help of named return values.
|
||||
//
|
||||
// func sendRequest(req Request) (err error) {
|
||||
// conn, err := openConnection()
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// defer func() {
|
||||
// err = multierr.Append(err, conn.Close())
|
||||
// }()
|
||||
// // ...
|
||||
// }
|
||||
//
|
||||
// The underlying list of errors for a returned error object may be retrieved
|
||||
// with the Errors function.
|
||||
//
|
||||
// errors := multierr.Errors(err)
|
||||
// if len(errors) > 0 {
|
||||
// fmt.Println("The following errors occurred:")
|
||||
// }
|
||||
//
|
||||
// Advanced Usage
|
||||
//
|
||||
// Errors returned by Combine and Append MAY implement the following
|
||||
// interface.
|
||||
//
|
||||
// type errorGroup interface {
|
||||
// // Returns a slice containing the underlying list of errors.
|
||||
// //
|
||||
// // This slice MUST NOT be modified by the caller.
|
||||
// Errors() []error
|
||||
// }
|
||||
//
|
||||
// Note that if you need access to list of errors behind a multierr error, you
|
||||
// should prefer using the Errors function. That said, if you need cheap
|
||||
// read-only access to the underlying errors slice, you can attempt to cast
|
||||
// the error to this interface. You MUST handle the failure case gracefully
|
||||
// because errors returned by Combine and Append are not guaranteed to
|
||||
// implement this interface.
|
||||
//
|
||||
// var errors []error
|
||||
// group, ok := err.(errorGroup)
|
||||
// if ok {
|
||||
// errors = group.Errors()
|
||||
// } else {
|
||||
// errors = []error{err}
|
||||
// }
|
||||
package multierr // import "go.uber.org/multierr"
|
||||
|
||||
import ( |
||||
"bytes" |
||||
"fmt" |
||||
"io" |
||||
"strings" |
||||
"sync" |
||||
|
||||
"go.uber.org/atomic" |
||||
) |
||||
|
||||
var ( |
||||
// Separator for single-line error messages.
|
||||
_singlelineSeparator = []byte("; ") |
||||
|
||||
_newline = []byte("\n") |
||||
|
||||
// Prefix for multi-line messages
|
||||
_multilinePrefix = []byte("the following errors occurred:") |
||||
|
||||
// Prefix for the first and following lines of an item in a list of
|
||||
// multi-line error messages.
|
||||
//
|
||||
// For example, if a single item is:
|
||||
//
|
||||
// foo
|
||||
// bar
|
||||
//
|
||||
// It will become,
|
||||
//
|
||||
// - foo
|
||||
// bar
|
||||
_multilineSeparator = []byte("\n - ") |
||||
_multilineIndent = []byte(" ") |
||||
) |
||||
|
||||
// _bufferPool is a pool of bytes.Buffers, reused by Error() to build
// message strings without allocating a fresh buffer per call.
var _bufferPool = sync.Pool{
	New: func() interface{} {
		return &bytes.Buffer{}
	},
}
||||
|
||||
// errorGroup is the optional interface, described in the package
// documentation, that errors returned by Combine and Append MAY implement.
type errorGroup interface {
	// Errors returns a slice containing the underlying list of errors.
	//
	// This slice MUST NOT be modified by the caller.
	Errors() []error
}
||||
|
||||
// Errors returns a slice containing zero or more errors that the supplied
|
||||
// error is composed of. If the error is nil, the returned slice is empty.
|
||||
//
|
||||
// err := multierr.Append(r.Close(), w.Close())
|
||||
// errors := multierr.Errors(err)
|
||||
//
|
||||
// If the error is not composed of other errors, the returned slice contains
|
||||
// just the error that was passed in.
|
||||
//
|
||||
// Callers of this function are free to modify the returned slice.
|
||||
func Errors(err error) []error { |
||||
if err == nil { |
||||
return nil |
||||
} |
||||
|
||||
// Note that we're casting to multiError, not errorGroup. Our contract is
|
||||
// that returned errors MAY implement errorGroup. Errors, however, only
|
||||
// has special behavior for multierr-specific error objects.
|
||||
//
|
||||
// This behavior can be expanded in the future but I think it's prudent to
|
||||
// start with as little as possible in terms of contract and possibility
|
||||
// of misuse.
|
||||
eg, ok := err.(*multiError) |
||||
if !ok { |
||||
return []error{err} |
||||
} |
||||
|
||||
errors := eg.Errors() |
||||
result := make([]error, len(errors)) |
||||
copy(result, errors) |
||||
return result |
||||
} |
||||
|
||||
// multiError is an error that holds one or more errors.
//
// An instance of this is guaranteed to be non-empty and flattened. That is,
// none of the errors inside multiError are other multiErrors.
//
// multiError formats to a semi-colon delimited list of error messages with
// %v and with a more readable multi-line format with %+v.
type multiError struct {
	// copyNeeded guards Append's append-in-place fast path: once set, later
	// appends must copy rather than extend errors in place.
	copyNeeded atomic.Bool

	// errors is the flattened list of non-nil underlying errors.
	errors []error
}

// Compile-time check that multiError satisfies the optional errorGroup
// interface advertised in the package documentation.
var _ errorGroup = (*multiError)(nil)
||||
|
||||
// Errors returns the list of underlying errors.
|
||||
//
|
||||
// This slice MUST NOT be modified.
|
||||
func (merr *multiError) Errors() []error { |
||||
if merr == nil { |
||||
return nil |
||||
} |
||||
return merr.errors |
||||
} |
||||
|
||||
func (merr *multiError) Error() string { |
||||
if merr == nil { |
||||
return "" |
||||
} |
||||
|
||||
buff := _bufferPool.Get().(*bytes.Buffer) |
||||
buff.Reset() |
||||
|
||||
merr.writeSingleline(buff) |
||||
|
||||
result := buff.String() |
||||
_bufferPool.Put(buff) |
||||
return result |
||||
} |
||||
|
||||
func (merr *multiError) Format(f fmt.State, c rune) { |
||||
if c == 'v' && f.Flag('+') { |
||||
merr.writeMultiline(f) |
||||
} else { |
||||
merr.writeSingleline(f) |
||||
} |
||||
} |
||||
|
||||
func (merr *multiError) writeSingleline(w io.Writer) { |
||||
first := true |
||||
for _, item := range merr.errors { |
||||
if first { |
||||
first = false |
||||
} else { |
||||
w.Write(_singlelineSeparator) |
||||
} |
||||
io.WriteString(w, item.Error()) |
||||
} |
||||
} |
||||
|
||||
// writeMultiline writes the underlying errors to w in the multi-line
// format used for %+v: a header prefix followed by one "-"-bulleted,
// indented entry per error (entries are themselves formatted with %+v).
func (merr *multiError) writeMultiline(w io.Writer) {
	w.Write(_multilinePrefix)
	for _, item := range merr.errors {
		w.Write(_multilineSeparator)
		writePrefixLine(w, _multilineIndent, fmt.Sprintf("%+v", item))
	}
}
||||
|
||||
// Writes s to the writer with the given prefix added before each line after
|
||||
// the first.
|
||||
func writePrefixLine(w io.Writer, prefix []byte, s string) { |
||||
first := true |
||||
for len(s) > 0 { |
||||
if first { |
||||
first = false |
||||
} else { |
||||
w.Write(prefix) |
||||
} |
||||
|
||||
idx := strings.IndexByte(s, '\n') |
||||
if idx < 0 { |
||||
idx = len(s) - 1 |
||||
} |
||||
|
||||
io.WriteString(w, s[:idx+1]) |
||||
s = s[idx+1:] |
||||
} |
||||
} |
||||
|
||||
// inspectResult summarizes a slice of errors so fromSlice can allocate
// exactly the right amount of space for the flattened result.
type inspectResult struct {
	// Number of top-level non-nil errors
	Count int

	// Total number of errors including multiErrors
	Capacity int

	// Index of the first non-nil error in the list. Value is meaningless if
	// Count is zero.
	FirstErrorIdx int

	// Whether the list contains at least one multiError
	ContainsMultiError bool
}
||||
|
||||
// Inspects the given slice of errors so that we can efficiently allocate
|
||||
// space for it.
|
||||
func inspect(errors []error) (res inspectResult) { |
||||
first := true |
||||
for i, err := range errors { |
||||
if err == nil { |
||||
continue |
||||
} |
||||
|
||||
res.Count++ |
||||
if first { |
||||
first = false |
||||
res.FirstErrorIdx = i |
||||
} |
||||
|
||||
if merr, ok := err.(*multiError); ok { |
||||
res.Capacity += len(merr.errors) |
||||
res.ContainsMultiError = true |
||||
} else { |
||||
res.Capacity++ |
||||
} |
||||
} |
||||
return |
||||
} |
||||
|
||||
// fromSlice converts the given list of errors into a single error.
//
// nil entries are skipped, nested multiErrors are flattened, and the
// cheapest representation is chosen: nil for no errors, the error itself
// for exactly one, and a flattened multiError otherwise.
func fromSlice(errors []error) error {
	res := inspect(errors)
	switch res.Count {
	case 0:
		return nil
	case 1:
		// only one non-nil entry
		return errors[res.FirstErrorIdx]
	case len(errors):
		if !res.ContainsMultiError {
			// already flat — reuse the caller's slice directly
			return &multiError{errors: errors}
		}
	}

	// General case: copy non-nil entries (starting from the first one)
	// into a slice pre-sized by inspect, flattening nested multiErrors.
	nonNilErrs := make([]error, 0, res.Capacity)
	for _, err := range errors[res.FirstErrorIdx:] {
		if err == nil {
			continue
		}

		if nested, ok := err.(*multiError); ok {
			nonNilErrs = append(nonNilErrs, nested.errors...)
		} else {
			nonNilErrs = append(nonNilErrs, err)
		}
	}

	return &multiError{errors: nonNilErrs}
}
||||
|
||||
// Combine combines the passed errors into a single error.
//
// If zero arguments were passed or if all items are nil, a nil error is
// returned.
//
//	Combine(nil, nil)  // == nil
//
// If only a single error was passed, it is returned as-is.
//
//	Combine(err)  // == err
//
// Combine skips over nil arguments so this function may be used to combine
// together errors from operations that fail independently of each other.
//
//	multierr.Combine(
//		reader.Close(),
//		writer.Close(),
//		pipe.Close(),
//	)
//
// If any of the passed errors is a multierr error, it will be flattened along
// with the other errors.
//
//	multierr.Combine(multierr.Combine(err1, err2), err3)
//	// is the same as
//	multierr.Combine(err1, err2, err3)
//
// The returned error formats into a readable multi-line error message if
// formatted with %+v.
//
//	fmt.Sprintf("%+v", multierr.Combine(err1, err2))
func Combine(errors ...error) error {
	// All skipping/flattening behavior lives in fromSlice.
	return fromSlice(errors)
}
||||
|
||||
// Append appends the given errors together. Either value may be nil.
//
// This function is a specialization of Combine for the common case where
// there are only two errors.
//
//	err = multierr.Append(reader.Close(), writer.Close())
//
// The following pattern may also be used to record failure of deferred
// operations without losing information about the original error.
//
//	func doSomething(..) (err error) {
//		f := acquireResource()
//		defer func() {
//			err = multierr.Append(err, f.Close())
//		}()
func Append(left error, right error) error {
	// Appending to or from nil is the identity.
	switch {
	case left == nil:
		return right
	case right == nil:
		return left
	}

	if _, ok := right.(*multiError); !ok {
		if l, ok := left.(*multiError); ok && !l.copyNeeded.Swap(true) {
			// Common case where the error on the left is constantly being
			// appended to. The atomic Swap marks l as needing a copy, so
			// (assuming Swap returns the previous value, per the
			// go.uber.org/atomic convention — confirm) only the first
			// appender extends l.errors in place; later ones fall through
			// to the copying path below.
			errs := append(l.errors, right)
			return &multiError{errors: errs}
		} else if !ok {
			// Both errors are single errors.
			return &multiError{errors: []error{left, right}}
		}
	}

	// Either right or both, left and right, are multiErrors. Rely on usual
	// expensive logic.
	errors := [2]error{left, right}
	return fromSlice(errors[0:])
}
||||
@ -0,0 +1,19 @@ |
||||
hash: b53b5e9a84b9cb3cc4b2d0499e23da2feca1eec318ce9bb717ecf35bf24bf221 |
||||
updated: 2017-04-10T13:34:45.671678062-07:00 |
||||
imports: |
||||
- name: go.uber.org/atomic |
||||
version: 3b8db5e93c4c02efbc313e17b2e796b0914a01fb |
||||
testImports: |
||||
- name: github.com/davecgh/go-spew |
||||
version: 6d212800a42e8ab5c146b8ace3490ee17e5225f9 |
||||
subpackages: |
||||
- spew |
||||
- name: github.com/pmezard/go-difflib |
||||
version: d8ed2627bdf02c080bf22230dbb337003b7aba2d |
||||
subpackages: |
||||
- difflib |
||||
- name: github.com/stretchr/testify |
||||
version: 69483b4bd14f5845b5a1e55bca19e954e827f1d0 |
||||
subpackages: |
||||
- assert |
||||
- require |
||||
@ -0,0 +1,8 @@ |
||||
package: go.uber.org/multierr |
||||
import: |
||||
- package: go.uber.org/atomic |
||||
version: ^1 |
||||
testImport: |
||||
- package: github.com/stretchr/testify |
||||
subpackages: |
||||
- assert |
||||
Loading…
Reference in new issue