diff --git a/CHANGELOG.md b/CHANGELOG.md index 378fe530ce..1c343c50ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -76,6 +76,10 @@ #### Jsonnet * [7923](https://github.com/grafana/loki/pull/7923) **manohar-koukuntla**: Add zone aware ingesters in jsonnet deployment +##### Fixes + +* [8247](https://github.com/grafana/loki/pull/8247) **Whyeasy** fix usage of cluster label within Mixin. + #### Build * [7938](https://github.com/grafana/loki/pull/7938) **ssncferreira**: Add DroneCI pipeline step to validate configuration flags documentation generation. diff --git a/production/loki-mixin-compiled-ssd/alerts.yaml b/production/loki-mixin-compiled-ssd/alerts.yaml index b1e979ed16..77f285b99c 100644 --- a/production/loki-mixin-compiled-ssd/alerts.yaml +++ b/production/loki-mixin-compiled-ssd/alerts.yaml @@ -26,7 +26,7 @@ groups: message: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. expr: | - namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 for: 15m labels: severity: critical diff --git a/production/loki-mixin-compiled-ssd/dashboards/loki-reads.json b/production/loki-mixin-compiled-ssd/dashboards/loki-reads.json index 6cae30f081..cbec8fde99 100644 --- a/production/loki-mixin-compiled-ssd/dashboards/loki-reads.json +++ b/production/loki-mixin-compiled-ssd/dashboards/loki-reads.json @@ -142,7 +142,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} 99th Percentile", @@ -150,7 +150,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} 50th Percentile", @@ -158,7 +158,7 @@ "step": 10 }, { - "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}) by (route) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} Average", diff --git a/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json b/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json index ebaf33ea42..17d54cf81f 100644 --- a/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json +++ b/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json @@ -142,7 +142,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "99th Percentile", @@ -150,7 +150,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "50th Percentile", @@ -158,7 +158,7 @@ "step": 10 }, { - "expr": "1e3 * sum(job:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"}) / sum(job:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})", + "expr": "1e3 * sum(cluster_job:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}) / sum(cluster_job:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", diff --git a/production/loki-mixin-compiled-ssd/rules.yaml b/production/loki-mixin-compiled-ssd/rules.yaml index c876cd90de..2a54ed4fb2 100644 --- a/production/loki-mixin-compiled-ssd/rules.yaml +++ b/production/loki-mixin-compiled-ssd/rules.yaml @@ -2,48 +2,52 @@ groups: - name: loki_rules rules: - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, job)) - record: job:loki_request_duration_seconds:99quantile + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, job)) - record: job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m])) - by (job) - record: job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job) - record: job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) - record: job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job) - record: job:loki_request_duration_seconds_count:sum_rate + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) + by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, job, route)) - record: job_route:loki_request_duration_seconds:99quantile + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, job, route)) - record: job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m])) - by (job, route) - record: job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route) - record: job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) - record: job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job, route) - record: job_route:loki_request_duration_seconds_count:sum_rate + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, namespace, job, route)) - record: namespace_job_route:loki_request_duration_seconds:99quantile + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, namespace, job, route)) - record: namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) - record: namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, - route) - record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route) - record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) - record: namespace_job_route:loki_request_duration_seconds_count:sum_rate + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate diff --git a/production/loki-mixin-compiled/alerts.yaml b/production/loki-mixin-compiled/alerts.yaml index b1e979ed16..77f285b99c 100644 --- a/production/loki-mixin-compiled/alerts.yaml +++ b/production/loki-mixin-compiled/alerts.yaml @@ -26,7 +26,7 @@ groups: message: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. expr: | - namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 for: 15m labels: severity: critical diff --git a/production/loki-mixin-compiled/dashboards/loki-reads.json b/production/loki-mixin-compiled/dashboards/loki-reads.json index 8dd046d4c9..9f7ce574f6 100644 --- a/production/loki-mixin-compiled/dashboards/loki-reads.json +++ b/production/loki-mixin-compiled/dashboards/loki-reads.json @@ -142,7 +142,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} 99th Percentile", @@ -150,7 +150,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} 50th Percentile", @@ -158,7 +158,7 @@ "step": 10 }, { - "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}) by (route) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} Average", @@ -330,7 +330,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} 99th Percentile", @@ -338,7 +338,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} 50th Percentile", @@ -346,7 +346,7 @@ "step": 10 }, { - "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}) by (route) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} Average", @@ -518,7 +518,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} 99th Percentile", @@ -526,7 +526,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} 50th Percentile", @@ -534,7 +534,7 @@ "step": 10 }, { - "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"}) by (route) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} Average", @@ -706,7 +706,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} 99th Percentile", @@ -714,7 +714,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} 50th Percentile", @@ -722,7 +722,7 @@ "step": 10 }, { - "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"}) by (route) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ route }} Average", diff --git a/production/loki-mixin-compiled/dashboards/loki-writes.json b/production/loki-mixin-compiled/dashboards/loki-writes.json index 54652992dd..15b5fd29eb 100644 --- a/production/loki-mixin-compiled/dashboards/loki-writes.json +++ b/production/loki-mixin-compiled/dashboards/loki-writes.json @@ -142,7 +142,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/distributor\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "99th Percentile", @@ -150,7 +150,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/distributor\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "50th Percentile", @@ -158,7 +158,7 @@ "step": 10 }, { - "expr": "1e3 * sum(job:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/distributor\", cluster=~\"$cluster\"}) / sum(job:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/distributor\", cluster=~\"$cluster\"})", + "expr": "1e3 * sum(cluster_job:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}) / sum(cluster_job:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", @@ -330,7 +330,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester-zone.*\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=\"/logproto.Pusher/Push\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "99th Percentile", @@ -338,7 +338,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester-zone.*\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=\"/logproto.Pusher/Push\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "50th Percentile", @@ -346,7 +346,7 @@ "step": 10 }, { - "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/ingester-zone.*\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"}) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/ingester-zone.*\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=\"/logproto.Pusher/Push\"}) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=\"/logproto.Pusher/Push\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", @@ -518,7 +518,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=\"/logproto.Pusher/Push\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "99th Percentile", @@ -526,7 +526,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=\"/logproto.Pusher/Push\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "50th Percentile", @@ -534,7 +534,7 @@ "step": 10 }, { - "expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/ingester\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"}) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/ingester\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=\"/logproto.Pusher/Push\"}) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=\"/logproto.Pusher/Push\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", diff --git a/production/loki-mixin-compiled/rules.yaml b/production/loki-mixin-compiled/rules.yaml index c876cd90de..2a54ed4fb2 100644 --- a/production/loki-mixin-compiled/rules.yaml +++ b/production/loki-mixin-compiled/rules.yaml @@ -2,48 +2,52 @@ groups: - name: loki_rules rules: - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, job)) - record: job:loki_request_duration_seconds:99quantile + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, job)) - record: job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m])) - by (job) - record: job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job) - record: job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) - record: job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job) - record: job:loki_request_duration_seconds_count:sum_rate + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) + by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, job, route)) - record: job_route:loki_request_duration_seconds:99quantile + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, job, route)) - record: job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m])) - by (job, route) - record: job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route) - record: job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) - record: job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job, route) - record: job_route:loki_request_duration_seconds_count:sum_rate + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, namespace, job, route)) - record: namespace_job_route:loki_request_duration_seconds:99quantile + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, namespace, job, route)) - record: namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) - record: namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, - route) - record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route) - record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) - record: namespace_job_route:loki_request_duration_seconds_count:sum_rate + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate diff --git a/production/loki-mixin/alerts.libsonnet b/production/loki-mixin/alerts.libsonnet index bb0234fd7d..0045cc194b 100644 --- a/production/loki-mixin/alerts.libsonnet +++ b/production/loki-mixin/alerts.libsonnet @@ -39,8 +39,8 @@ { alert: 'LokiRequestLatency', expr: ||| - namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 - |||, + %s_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + ||| % $._config.per_cluster_label, 'for': '15m', labels: { severity: 'critical', diff --git a/production/loki-mixin/dashboards/loki-reads.libsonnet b/production/loki-mixin/dashboards/loki-reads.libsonnet index 7d4a37fbb5..aac0b2bbe6 100644 --- a/production/loki-mixin/dashboards/loki-reads.libsonnet +++ b/production/loki-mixin/dashboards/loki-reads.libsonnet @@ -56,8 +56,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-reads.json'].matchers.cortexgateway + [utils.selector.re('route', http_routes)], - extra_selectors=dashboards['loki-reads.json'].clusterMatchers, + dashboards['loki-reads.json'].clusterMatchers + dashboards['loki-reads.json'].matchers.cortexgateway + [utils.selector.re('route', http_routes)], sum_by=['route'] ) ) @@ -72,8 +71,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-reads.json'].matchers.queryFrontend + [utils.selector.re('route', http_routes)], - extra_selectors=dashboards['loki-reads.json'].clusterMatchers, + dashboards['loki-reads.json'].clusterMatchers + dashboards['loki-reads.json'].matchers.queryFrontend + [utils.selector.re('route', http_routes)], sum_by=['route'] ) ) @@ -89,8 +87,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-reads.json'].matchers.querier + [utils.selector.re('route', http_routes)], - extra_selectors=dashboards['loki-reads.json'].clusterMatchers, + dashboards['loki-reads.json'].clusterMatchers + dashboards['loki-reads.json'].matchers.querier + [utils.selector.re('route', http_routes)], sum_by=['route'] ) ) @@ -106,8 +103,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-reads.json'].matchers.ingester + [utils.selector.re('route', grpc_routes)], - extra_selectors=dashboards['loki-reads.json'].clusterMatchers, + dashboards['loki-reads.json'].clusterMatchers + dashboards['loki-reads.json'].matchers.ingester + [utils.selector.re('route', grpc_routes)], sum_by=['route'] ) ) @@ -124,8 +120,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-reads.json'].matchers.ingesterZoneAware + [utils.selector.re('route', grpc_routes)], - extra_selectors=dashboards['loki-reads.json'].clusterMatchers, + dashboards['loki-reads.json'].clusterMatchers + dashboards['loki-reads.json'].matchers.ingesterZoneAware + [utils.selector.re('route', grpc_routes)], sum_by=['route'] ) ) @@ -153,7 +148,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_bigtable_request_duration_seconds', - dashboards['loki-reads.json'].matchers.querier + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')] + dashboards['loki-reads.json'].clusterMatchers + dashboards['loki-reads.json'].matchers.querier + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')] ) ) ) diff --git a/production/loki-mixin/dashboards/loki-writes.libsonnet b/production/loki-mixin/dashboards/loki-writes.libsonnet index d1e9899389..b51096b645 100644 --- a/production/loki-mixin/dashboards/loki-writes.libsonnet +++ b/production/loki-mixin/dashboards/loki-writes.libsonnet @@ -49,8 +49,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-writes.json'].matchers.cortexgateway + [utils.selector.re('route', 'api_prom_push|loki_api_v1_push')], - extra_selectors=dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.cortexgateway + [utils.selector.re('route', 'api_prom_push|loki_api_v1_push')], ) ) ) @@ -64,8 +63,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-writes.json'].matchers.distributor, - extra_selectors=dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.distributor, ) ) ) @@ -80,8 +78,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-writes.json'].matchers.ingester_zone + [utils.selector.eq('route', '/logproto.Pusher/Push')], - extra_selectors=dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.ingester_zone + [utils.selector.eq('route', '/logproto.Pusher/Push')], ) ) ) @@ -97,8 +94,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-writes.json'].matchers.ingester + [utils.selector.eq('route', '/logproto.Pusher/Push')], - extra_selectors=dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.ingester + [utils.selector.eq('route', '/logproto.Pusher/Push')], ) ) ) @@ -125,7 +121,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_bigtable_request_duration_seconds', - dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.ingester + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')] + dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.ingester + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')] ) ) ) diff --git a/production/loki-mixin/recording_rules.libsonnet b/production/loki-mixin/recording_rules.libsonnet index 9717a9a8aa..2feda5cac6 100644 --- a/production/loki-mixin/recording_rules.libsonnet +++ b/production/loki-mixin/recording_rules.libsonnet @@ -5,9 +5,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; groups+: [{ name: 'loki_rules', rules: - utils.histogramRules('loki_request_duration_seconds', ['job']) + - utils.histogramRules('loki_request_duration_seconds', ['job', 'route']) + - utils.histogramRules('loki_request_duration_seconds', ['namespace', 'job', 'route']), + utils.histogramRules('loki_request_duration_seconds', [$._config.per_cluster_label, 'job']) + + utils.histogramRules('loki_request_duration_seconds', [$._config.per_cluster_label, 'job', 'route']) + + utils.histogramRules('loki_request_duration_seconds', [$._config.per_cluster_label, 'namespace', 'job', 'route']), }], }, }