Promtail: Add tenant label to client drop metrics and logs (#7593)

**What this PR does / why we need it**:
Provides better visibility into per-tenant drops and retries in Promtail.

**Which issue(s) this PR fixes**:
Fixes #7570 

**Special notes for your reviewer**:
Does anything depend on these metrics having only one label, such that
this change would break it? I decided against adding a reason label
because of cardinality concerns, given how widely Promtail is deployed.

**Checklist**
- [x] Reviewed the `CONTRIBUTING.md` guide
- [x] Documentation added
- [x] Tests updated
- [x] `CHANGELOG.md` updated
- [x] Changes that require user attention or interaction to upgrade are
documented in `docs/sources/upgrading/_index.md`
pull/7793/head
Chris Hodges 4 years ago committed by GitHub
parent a63ad06509
commit e80b4deb43
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 2
      CHANGELOG.md
  2. 52
      clients/pkg/promtail/client/client.go
  3. 20
      clients/pkg/promtail/client/client_test.go

@ -99,7 +99,7 @@ Check the history of the branch FIXME.
#### Promtail
##### Enhancements
* [7593](https://github.com/grafana/loki/pull/7593) **chodges15**: Promtail: Add tenant label to client drop metrics and logs
* [7101](https://github.com/grafana/loki/pull/7101) **liguozhong**: Promtail: Add support for max stream limit.
* [7247](https://github.com/grafana/loki/pull/7247) **liguozhong**: Add config reload endpoint / signal to promtail.
* [6708](https://github.com/grafana/loki/pull/6708) **DylanGuedes**: Add compressed files support to Promtail.

@ -38,20 +38,22 @@ const (
LatencyLabel = "filename"
HostLabel = "host"
ClientLabel = "client"
TenantLabel = "tenant"
)
var UserAgent = fmt.Sprintf("promtail/%s", build.Version)
type Metrics struct {
encodedBytes *prometheus.CounterVec
sentBytes *prometheus.CounterVec
droppedBytes *prometheus.CounterVec
sentEntries *prometheus.CounterVec
droppedEntries *prometheus.CounterVec
requestDuration *prometheus.HistogramVec
batchRetries *prometheus.CounterVec
countersWithHost []*prometheus.CounterVec
streamLag *prometheus.GaugeVec
encodedBytes *prometheus.CounterVec
sentBytes *prometheus.CounterVec
droppedBytes *prometheus.CounterVec
sentEntries *prometheus.CounterVec
droppedEntries *prometheus.CounterVec
requestDuration *prometheus.HistogramVec
batchRetries *prometheus.CounterVec
countersWithHost []*prometheus.CounterVec
countersWithTenant []*prometheus.CounterVec
streamLag *prometheus.GaugeVec
}
func NewMetrics(reg prometheus.Registerer, streamLagLabels []string) *Metrics {
@ -71,7 +73,7 @@ func NewMetrics(reg prometheus.Registerer, streamLagLabels []string) *Metrics {
Namespace: "promtail",
Name: "dropped_bytes_total",
Help: "Number of bytes dropped because failed to be sent to the ingester after all retries.",
}, []string{HostLabel})
}, []string{HostLabel, TenantLabel})
m.sentEntries = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "promtail",
Name: "sent_entries_total",
@ -81,7 +83,7 @@ func NewMetrics(reg prometheus.Registerer, streamLagLabels []string) *Metrics {
Namespace: "promtail",
Name: "dropped_entries_total",
Help: "Number of log entries dropped because failed to be sent to the ingester after all retries.",
}, []string{HostLabel})
}, []string{HostLabel, TenantLabel})
m.requestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "promtail",
Name: "request_duration_seconds",
@ -91,10 +93,14 @@ func NewMetrics(reg prometheus.Registerer, streamLagLabels []string) *Metrics {
Namespace: "promtail",
Name: "batch_retries_total",
Help: "Number of times batches has had to be retried.",
}, []string{HostLabel})
}, []string{HostLabel, TenantLabel})
m.countersWithHost = []*prometheus.CounterVec{
m.encodedBytes, m.sentBytes, m.droppedBytes, m.sentEntries, m.droppedEntries, m.batchRetries,
m.encodedBytes, m.sentBytes, m.sentEntries,
}
m.countersWithTenant = []*prometheus.CounterVec{
m.droppedBytes, m.droppedEntries, m.batchRetries,
}
streamLagLabelsMerged := []string{HostLabel, ClientLabel}
@ -270,6 +276,11 @@ func (c *client) run() {
// If the batch doesn't exist yet, we create a new one with the entry
if !ok {
batches[tenantID] = newBatch(c.maxStreams, e)
// Initialize counters to 0 so the metrics are exported before the first
// occurrence of incrementing to avoid missing metrics.
for _, counter := range c.metrics.countersWithTenant {
counter.WithLabelValues(c.cfg.URL.Host, tenantID).Add(0)
}
break
}
@ -285,8 +296,9 @@ func (c *client) run() {
// The max size of the batch isn't reached, so we can add the entry
err := batch.add(e)
if err != nil {
level.Error(c.logger).Log("msg", "batch add err", "error", err)
c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host).Inc()
level.Error(c.logger).Log("msg", "batch add err", "tenant", tenantID, "error", err)
c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host, tenantID).Add(float64(len(e.Line)))
c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host, tenantID).Inc()
return
}
case <-maxWaitCheck.C:
@ -376,8 +388,8 @@ func (c *client) sendBatch(tenantID string, batch *batch) {
break
}
level.Warn(c.logger).Log("msg", "error sending batch, will retry", "status", status, "error", err)
c.metrics.batchRetries.WithLabelValues(c.cfg.URL.Host).Inc()
level.Warn(c.logger).Log("msg", "error sending batch, will retry", "status", status, "tenant", tenantID, "error", err)
c.metrics.batchRetries.WithLabelValues(c.cfg.URL.Host, tenantID).Inc()
backoff.Wait()
// Make sure it sends at least once before checking for retry.
@ -387,9 +399,9 @@ func (c *client) sendBatch(tenantID string, batch *batch) {
}
if err != nil {
level.Error(c.logger).Log("msg", "final error sending batch", "status", status, "error", err)
c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host).Add(bufBytes)
c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host).Add(float64(entriesCount))
level.Error(c.logger).Log("msg", "final error sending batch", "status", status, "tenant", tenantID, "error", err)
c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host, tenantID).Add(bufBytes)
c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host, tenantID).Add(float64(entriesCount))
}
}

@ -75,7 +75,7 @@ func TestClient_Handle(t *testing.T) {
promtail_sent_entries_total{host="__HOST__"} 3.0
# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
# TYPE promtail_dropped_entries_total counter
promtail_dropped_entries_total{host="__HOST__"} 0
promtail_dropped_entries_total{host="__HOST__", tenant=""} 0
`,
},
"batch log entries together until the batch wait time is reached": {
@ -101,7 +101,7 @@ func TestClient_Handle(t *testing.T) {
promtail_sent_entries_total{host="__HOST__"} 2.0
# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
# TYPE promtail_dropped_entries_total counter
promtail_dropped_entries_total{host="__HOST__"} 0
promtail_dropped_entries_total{host="__HOST__", tenant=""} 0
`,
},
"retry send a batch up to backoff's max retries in case the server responds with a 5xx": {
@ -127,7 +127,7 @@ func TestClient_Handle(t *testing.T) {
expectedMetrics: `
# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
# TYPE promtail_dropped_entries_total counter
promtail_dropped_entries_total{host="__HOST__"} 1.0
promtail_dropped_entries_total{host="__HOST__", tenant=""} 1.0
# HELP promtail_sent_entries_total Number of log entries sent to the ingester.
# TYPE promtail_sent_entries_total counter
promtail_sent_entries_total{host="__HOST__"} 0
@ -148,7 +148,7 @@ func TestClient_Handle(t *testing.T) {
expectedMetrics: `
# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
# TYPE promtail_dropped_entries_total counter
promtail_dropped_entries_total{host="__HOST__"} 1.0
promtail_dropped_entries_total{host="__HOST__", tenant=""} 1.0
# HELP promtail_sent_entries_total Number of log entries sent to the ingester.
# TYPE promtail_sent_entries_total counter
promtail_sent_entries_total{host="__HOST__"} 0
@ -177,7 +177,7 @@ func TestClient_Handle(t *testing.T) {
expectedMetrics: `
# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
# TYPE promtail_dropped_entries_total counter
promtail_dropped_entries_total{host="__HOST__"} 1.0
promtail_dropped_entries_total{host="__HOST__", tenant=""} 1.0
# HELP promtail_sent_entries_total Number of log entries sent to the ingester.
# TYPE promtail_sent_entries_total counter
promtail_sent_entries_total{host="__HOST__"} 0
@ -202,7 +202,7 @@ func TestClient_Handle(t *testing.T) {
promtail_sent_entries_total{host="__HOST__"} 2.0
# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
# TYPE promtail_dropped_entries_total counter
promtail_dropped_entries_total{host="__HOST__"} 0
promtail_dropped_entries_total{host="__HOST__", tenant="tenant-default"} 0
`,
},
"batch log entries together honoring the tenant ID overridden while processing the pipeline stages": {
@ -232,7 +232,9 @@ func TestClient_Handle(t *testing.T) {
promtail_sent_entries_total{host="__HOST__"} 4.0
# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
# TYPE promtail_dropped_entries_total counter
promtail_dropped_entries_total{host="__HOST__"} 0
promtail_dropped_entries_total{host="__HOST__", tenant="tenant-1"} 0
promtail_dropped_entries_total{host="__HOST__", tenant="tenant-2"} 0
promtail_dropped_entries_total{host="__HOST__", tenant="tenant-default"} 0
`,
},
}
@ -343,7 +345,7 @@ func TestClient_StopNow(t *testing.T) {
promtail_sent_entries_total{host="__HOST__"} 3.0
# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
# TYPE promtail_dropped_entries_total counter
promtail_dropped_entries_total{host="__HOST__"} 0
promtail_dropped_entries_total{host="__HOST__", tenant=""} 0
`,
},
{
@ -362,7 +364,7 @@ func TestClient_StopNow(t *testing.T) {
expectedMetrics: `
# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
# TYPE promtail_dropped_entries_total counter
promtail_dropped_entries_total{host="__HOST__"} 1.0
promtail_dropped_entries_total{host="__HOST__", tenant=""} 1.0
# HELP promtail_sent_entries_total Number of log entries sent to the ingester.
# TYPE promtail_sent_entries_total counter
promtail_sent_entries_total{host="__HOST__"} 0

Loading…
Cancel
Save