From 9e7ac3d853ac452f470fba0cd72df69cf6a0326b Mon Sep 17 00:00:00 2001 From: Sandeep Sukhani Date: Mon, 19 Dec 2022 14:44:19 +0530 Subject: [PATCH] fix query-frontend request load balancing when using k8s service (#7966) **What this PR does / why we need it**: We noticed an imbalance in the requests sent to the Query Frontend pods using `query-frontend` k8s service. This seems to be caused by `query-frontend` being a headless service that resolves to each QF pod IP and leaves it up to the client to load balance the requests as mentioned [here](https://kubernetes.io/docs/concepts/services-networking/service/#headless-services). This PR fixes the issue by creating two separate services for pod IP discovery and load balancing of queries: * `query-frontend` to be used for load balancing incoming Loki queries. * `query-frontend-headless` to be used for discovering QF pod IPs from queriers to connect as workers. **Checklist** - [x] `CHANGELOG.md` updated - [x] Changes that require user attention or interaction to upgrade are documented in `docs/sources/upgrading/_index.md` --- CHANGELOG.md | 1 + docs/sources/upgrading/_index.md | 23 +++++++++++++++++++ .../ksonnet/loki/query-frontend.libsonnet | 11 +++++++-- .../ksonnet/loki/query-scheduler.libsonnet | 2 +- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1177b6f866..61afe0f8c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ * [7880](https://github.com/grafana/loki/pull/7880) **sandeepsukhani**: consider range and offset in queries while looking for schema config for query sharding. * [7937](https://github.com/grafana/loki/pull/7937) **ssncferreira**: Deprecate CLI flag `-ruler.wal-cleaer.period` and replace it with `-ruler.wal-cleaner.period`. * [7906](https://github.com/grafana/loki/pull/7906) **kavirajk**: Add API endpoint that formats LogQL expressions and support new `fmt` subcommand in `logcli` to format LogQL query. 
+* [7966](https://github.com/grafana/loki/pull/7966) **sandeepsukhani**: Fix query-frontend request load balancing when using k8s service. ##### Changes diff --git a/docs/sources/upgrading/_index.md b/docs/sources/upgrading/_index.md index 31f9c45932..15e5769df4 100644 --- a/docs/sources/upgrading/_index.md +++ b/docs/sources/upgrading/_index.md @@ -57,6 +57,29 @@ ruler: period: 5s ``` +### Querier + +#### query-frontend k8s headless service changed to load balanced service + +*Note:* This is relevant only if you are using [jsonnet for deploying Loki in Kubernetes](https://grafana.com/docs/loki/latest/installation/tanka/). + +The `query-frontend` k8s service was previously headless and was used for two purposes: +* Distributing the Loki query requests amongst all the available Query Frontend pods. +* Discovering the IPs of Query Frontend pods so that Queriers can connect to them as workers. + +The problem here is that a headless service does not support load balancing and leaves it up to the client to balance the load. +Additionally, a load-balanced service does not let us discover the IPs of the underlying pods. + +To meet both these requirements, we have made the following changes: +* Changed the existing `query-frontend` k8s service from headless to load-balanced to have a fair load distribution on all the Query Frontend instances. +* Added `query-frontend-headless` to discover QF pod IPs from queriers to connect as workers. + +If you are deploying Loki with Query Scheduler by setting [query_scheduler_enabled](https://github.com/grafana/loki/blob/cc4ab7487ab3cd3b07c63601b074101b0324083b/production/ksonnet/loki/config.libsonnet#L18) config to `true`, then there is nothing to do here for this change. +If you are not using Query Scheduler, then to avoid any issues on the Read path until the rollout finishes, we recommend following the steps below: +* Create just the `query-frontend-headless` service without applying any changes to the `query-frontend` service. 
+* Roll out the changes to `queriers`. +* Roll out the rest of the changes. + ## 2.7.0 ### Loki diff --git a/production/ksonnet/loki/query-frontend.libsonnet b/production/ksonnet/loki/query-frontend.libsonnet index 081c5acb89..2694715880 100644 --- a/production/ksonnet/loki/query-frontend.libsonnet +++ b/production/ksonnet/loki/query-frontend.libsonnet @@ -42,7 +42,11 @@ local k = import 'ksonnet-util/kausal.libsonnet'; local service = k.core.v1.service, - query_frontend_service: + // A headless service for discovering IPs of each query-frontend pod. + // It leaves it up to the client to do any load-balancing of requests, + // so if the intention is to use the k8s service for load balancing, + // it is advised to use the below `query-frontend` service instead. + query_frontend_headless_service: $.util.grpclbServiceFor($.query_frontend_deployment) + // Make sure that query frontend worker, running in the querier, do resolve // each query-frontend pod IP and NOT the service IP. To make it, we do NOT @@ -52,6 +56,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; // Query frontend will not become ready until at least one querier connects // which creates a chicken and egg scenario if we don't publish the // query-frontend address before it's ready. 
- service.mixin.spec.withPublishNotReadyAddresses(true), + service.mixin.spec.withPublishNotReadyAddresses(true) + + service.mixin.metadata.withName('query-frontend-headless'), + query_frontend_service: + k.util.serviceFor($.query_frontend_deployment, $._config.service_ignored_labels), } diff --git a/production/ksonnet/loki/query-scheduler.libsonnet b/production/ksonnet/loki/query-scheduler.libsonnet index bb63b08c12..1c221940d6 100644 --- a/production/ksonnet/loki/query-scheduler.libsonnet +++ b/production/ksonnet/loki/query-scheduler.libsonnet @@ -20,7 +20,7 @@ local k = import 'ksonnet-util/kausal.libsonnet'; max_outstanding_per_tenant: max_outstanding, }, frontend_worker+: { - frontend_address: 'query-frontend.%s.svc.cluster.local.:9095' % $._config.namespace, + frontend_address: 'query-frontend-headless.%s.svc.cluster.local.:9095' % $._config.namespace, }, }, },