Add metrics for gcplog scrape. (#4235)

* Add metrics for gcplog scrape. Also fix the Ready() method of target
* Fix typo with help message
4 years ago · b36bc5ab32
parent b0646e7156
commit b36bc5ab32
2 changed files with 16 additions and 5 deletions
--- a/clients/pkg/promtail/targets/gcplog/metrics.go
+++ b/clients/pkg/promtail/targets/gcplog/metrics.go
@@ -7,8 +7,9 @@ type Metrics struct {
 	// reg is the Registerer used to create this set of metrics.
 	reg prometheus.Registerer

-	gcplogEntries *prometheus.CounterVec
-	gcplogErrors  *prometheus.CounterVec
+	gcplogEntries                 *prometheus.CounterVec
+	gcplogErrors                  *prometheus.CounterVec
+	gcplogTargetLastSuccessScrape *prometheus.GaugeVec
 }

 // NewMetrics creates a new set of metrics. Metrics will be registered to reg.
@@ -28,6 +29,12 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
 		Help:      "Total number of parsing errors while receiving gcplog messages",
 	}, []string{"project"})

+	m.gcplogTargetLastSuccessScrape = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "promtail",
+		Name:      "gcplog_target_last_success_scrape",
+		Help:      "Timestamp of the specific target's last successful poll",
+	}, []string{"project", "target"})
+
 	reg.MustRegister(m.gcplogEntries, m.gcplogErrors)
 	return &m
 }
--- a/clients/pkg/promtail/targets/gcplog/target.go
+++ b/clients/pkg/promtail/targets/gcplog/target.go
@@ -108,9 +108,9 @@ func (t *GcplogTarget) run() error {
 			t.msgs <- m
 		})
 		if err != nil {
-			// TODO(kavi): Add proper error propagation maybe?
-			level.Error(t.logger).Log("error", err)
+			level.Error(t.logger).Log("msg", "failed to receive pubsub messages", "error", err)
 			t.metrics.gcplogErrors.WithLabelValues(t.config.ProjectID).Inc()
+			t.metrics.gcplogTargetLastSuccessScrape.WithLabelValues(t.config.ProjectID, t.config.Subscription).SetToCurrentTime()
 		}
 	}()

@@ -138,7 +138,11 @@ func (t *GcplogTarget) Type() target.TargetType {
 }

 func (t *GcplogTarget) Ready() bool {
-	return t.ctx.Err() == nil
+	// Return true just like all other targets.
+	// Rationale is gcplog scraping shouldn't stop because of some transient timeout errors.
+	// Such a transient failure can cause promtail's readiness probe to fail, which may prevent the pod from starting.
+	// We have metrics now to track if scraping failed (`gcplog_target_last_success_scrape`).
+	return true
 }

 func (t *GcplogTarget) DiscoveredLabels() model.LabelSet {