Add per-tenant request counter metric to index gateway server (#9797)

This commit adds a counter metric `loki_index_gateway_requests_total` with labels `operation`, `tenant`, `status` for gRPC requests that are served by the index gateway. **What for?** The per-tenant RPS on the index gateway is used to derive the per-tenant shard factor. **Why tracking on the server?** Unlike tracking index gateway RPS on the client side, tracking on the server side does not yield that many series, even in multi-tenant installations with a lot of tenants, because the number of index gateway instances is relatively small compared to the number of queriers and frontends. **Special notes for your reviewer**: The previous approach of tracking requests on the client https://github.com/grafana/loki/pull/9781 has been abandoned. Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
2 years ago · a65c99d9bf
parent 8ca035ffbf
commit a65c99d9bf
4 changed files with 58 additions and 1 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -6,6 +6,7 @@

 ##### Enhancements

+* [9797](https://github.com/grafana/loki/pull/9797) **chaudum**: Add new `loki_index_gateway_requests_total` counter metric to observe per-tenant RPS
 * [9710](https://github.com/grafana/loki/pull/9710) **chaudum**: Add shuffle sharding to index gateway
 * [9573](https://github.com/grafana/loki/pull/9573) **CCOLLOT**: Lambda-Promtail: Add support for AWS CloudFront log ingestion.
 * [9497](https://github.com/grafana/loki/pull/9497) **CCOLLOT**: Lambda-Promtail: Add support for AWS CloudTrail log ingestion.
--- a/pkg/loki/loki.go
+++ b/pkg/loki/loki.go
@ -636,6 +636,7 @@ func (t *Loki) setupModuleManager() error {
 	mm.RegisterModule(Compactor, t.initCompactor)
 	mm.RegisterModule(IndexGateway, t.initIndexGateway)
 	mm.RegisterModule(IndexGatewayRing, t.initIndexGatewayRing, modules.UserInvisibleModule)
+	mm.RegisterModule(IndexGatewayInterceptors, t.initIndexGatewayInterceptors, modules.UserInvisibleModule)
 	mm.RegisterModule(QueryScheduler, t.initQueryScheduler)
 	mm.RegisterModule(QuerySchedulerRing, t.initQuerySchedulerRing, modules.UserInvisibleModule)
 	mm.RegisterModule(Analytics, t.initAnalytics)
@ -664,7 +665,7 @@ func (t *Loki) setupModuleManager() error {
 		RuleEvaluator:            {Ring, Server, Store, IngesterQuerier, Overrides, TenantConfigs, Analytics},
 		TableManager:             {Server, Analytics},
 		Compactor:                {Server, Overrides, MemberlistKV, Analytics},
-		IndexGateway:             {Server, Store, Overrides, Analytics, MemberlistKV, IndexGatewayRing},
+		IndexGateway:             {Server, Store, Overrides, Analytics, MemberlistKV, IndexGatewayRing, IndexGatewayInterceptors},
 		IngesterQuerier:          {Ring},
 		QuerySchedulerRing:       {Overrides, Server, MemberlistKV},
 		IndexGatewayRing:         {Overrides, Server, MemberlistKV},
--- a/pkg/loki/modules.go
+++ b/pkg/loki/modules.go
@ -104,6 +104,7 @@ const (
 	Compactor                string = "compactor"
 	IndexGateway             string = "index-gateway"
 	IndexGatewayRing         string = "index-gateway-ring"
+	IndexGatewayInterceptors string = "index-gateway-interceptors"
 	QueryScheduler           string = "query-scheduler"
 	QuerySchedulerRing       string = "query-scheduler-ring"
 	All                      string = "all"
@ -1242,6 +1243,15 @@ func (t *Loki) initIndexGatewayRing() (_ services.Service, err error) {
 	return t.indexGatewayRingManager, nil
 }

+// initIndexGatewayInterceptors appends the index gateway's per-tenant request
+// counting interceptor to the server's gRPC middleware chain. It never starts
+// a service of its own, so it always returns a nil service and a nil error.
+func (t *Loki) initIndexGatewayInterceptors() (services.Service, error) {
+	// Only expose per-tenant metric if index gateway runs as standalone service
+	if !t.Cfg.isModuleEnabled(IndexGateway) {
+		return nil, nil
+	}
+
+	gatewayInterceptors := indexgateway.NewServerInterceptors(prometheus.DefaultRegisterer)
+	t.Cfg.Server.GRPCMiddleware = append(
+		t.Cfg.Server.GRPCMiddleware,
+		gatewayInterceptors.PerTenantRequestCount,
+	)
+	return nil, nil
+}
+
 func (t *Loki) initQueryScheduler() (services.Service, error) {
 	s, err := scheduler.NewScheduler(t.Cfg.QueryScheduler, t.Overrides, util_log.Logger, t.querySchedulerRingManager, prometheus.DefaultRegisterer)
 	if err != nil {
--- a/pkg/storage/stores/shipper/indexgateway/grpc.go
+++ b/pkg/storage/stores/shipper/indexgateway/grpc.go
@ -0,0 +1,45 @@
+package indexgateway
+
+import (
+	"context"
+
+	"github.com/grafana/dskit/tenant"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+	"google.golang.org/grpc"
+)
+
+// ServerInterceptors bundles the gRPC server interceptors of the index gateway
+// together with the counter they record into.
+type ServerInterceptors struct {
+	// reqCount is the request counter vector created by NewServerInterceptors.
+	reqCount              *prometheus.CounterVec
+	// PerTenantRequestCount is a unary server interceptor that counts handled
+	// requests by operation, status and tenant.
+	PerTenantRequestCount grpc.UnaryServerInterceptor
+}
+
+// NewServerInterceptors creates the `loki_index_gateway_requests_total` counter
+// on the given registerer and returns the interceptors that update it.
+//
+// The returned PerTenantRequestCount interceptor always invokes the wrapped
+// handler and passes its response and error through unchanged. Requests whose
+// context carries no resolvable tenant ID are served but not counted.
+func NewServerInterceptors(r prometheus.Registerer) *ServerInterceptors {
+	counter := promauto.With(r).NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "loki",
+			Subsystem: "index_gateway",
+			Name:      "requests_total",
+			Help:      "Total amount of requests served by the index gateway",
+		},
+		[]string{"operation", "status", "tenant"},
+	)
+
+	countPerTenant := func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) {
+		tenantID, tenantErr := tenant.TenantID(ctx)
+		if tenantErr != nil {
+			// No tenant in the context: serve the request without counting it.
+			return handler(ctx, req)
+		}
+
+		resp, handlerErr := handler(ctx, req)
+
+		// The status label reflects only whether the handler returned an error.
+		outcome := "error"
+		if handlerErr == nil {
+			outcome = "success"
+		}
+		counter.WithLabelValues(info.FullMethod, outcome, tenantID).Inc()
+
+		return resp, handlerErr
+	}
+
+	return &ServerInterceptors{
+		reqCount:              counter,
+		PerTenantRequestCount: countPerTenant,
+	}
+}