Sharding optimizations (#10101)

A few bugfixes and more sharding optimizations:
* Fix a bug on `<aggr> by|without ()` groupings which removed the grouping while downstreaming.
* Add shardable implementations for `max` and `min`, plus operation-specific merge strategies which enable many more types of sharded requests, even when label reduction is performed at the edge.
2 years ago · 9097f1ff42
parent 42b8a6cbca
commit 9097f1ff42
5 changed files with 352 additions and 254 deletions
--- a/pkg/logql/shardmapper.go
+++ b/pkg/logql/shardmapper.go
@ -180,96 +180,125 @@ func (m ShardMapper) mapSampleExpr(expr syntax.SampleExpr, r *downstreamRecorder
 	return head, bytesPerShard, nil
 }

+// turn a vector aggr into a wrapped+sharded variant,
+// used as a subroutine in mapping
+func (m ShardMapper) wrappedShardedVectorAggr(expr *syntax.VectorAggregationExpr, r *downstreamRecorder) (*syntax.VectorAggregationExpr, uint64, error) {
+	sharded, bytesPerShard, err := m.mapSampleExpr(expr, r)
+	if err != nil {
+		return nil, 0, err
+	}
+	return &syntax.VectorAggregationExpr{
+		Left:      sharded,
+		Grouping:  expr.Grouping,
+		Params:    expr.Params,
+		Operation: expr.Operation,
+	}, bytesPerShard, nil
+}
+
 // technically, std{dev,var} are also parallelizable if there is no cross-shard merging
 // in descendent nodes in the AST. This optimization is currently avoided for simplicity.
 func (m ShardMapper) mapVectorAggregationExpr(expr *syntax.VectorAggregationExpr, r *downstreamRecorder) (syntax.SampleExpr, uint64, error) {
-	// if this AST contains unshardable operations, don't shard this at this level,
-	// but attempt to shard a child node.
-	if !expr.Shardable() {
-		subMapped, bytesPerShard, err := m.Map(expr.Left, r)
-		if err != nil {
-			return nil, 0, err
-		}
-		sampleExpr, ok := subMapped.(syntax.SampleExpr)
-		if !ok {
-			return nil, 0, badASTMapping(subMapped)
-		}
+	if expr.Shardable() {

-		return &syntax.VectorAggregationExpr{
-			Left:      sampleExpr,
-			Grouping:  expr.Grouping,
-			Params:    expr.Params,
-			Operation: expr.Operation,
-		}, bytesPerShard, nil
+		switch expr.Operation {

-	}
+		case syntax.OpTypeSum:
+			// sum(x) -> sum(sum(x, shard=1) ++ sum(x, shard=2)...)
+			return m.wrappedShardedVectorAggr(expr, r)

-	switch expr.Operation {
-	case syntax.OpTypeSum:
-		// sum(x) -> sum(sum(x, shard=1) ++ sum(x, shard=2)...)
-		sharded, bytesPerShard, err := m.mapSampleExpr(expr, r)
-		if err != nil {
-			return nil, 0, err
-		}
-		return &syntax.VectorAggregationExpr{
-			Left:      sharded,
-			Grouping:  expr.Grouping,
-			Params:    expr.Params,
-			Operation: expr.Operation,
-		}, bytesPerShard, nil
-
-	case syntax.OpTypeAvg:
-		// avg(x) -> sum(x)/count(x)
-		lhs, lhsBytesPerShard, err := m.mapVectorAggregationExpr(&syntax.VectorAggregationExpr{
-			Left:      expr.Left,
-			Grouping:  expr.Grouping,
-			Operation: syntax.OpTypeSum,
-		}, r)
-		if err != nil {
-			return nil, 0, err
-		}
-		rhs, rhsBytesPerShard, err := m.mapVectorAggregationExpr(&syntax.VectorAggregationExpr{
-			Left:      expr.Left,
-			Grouping:  expr.Grouping,
-			Operation: syntax.OpTypeCount,
-		}, r)
-		if err != nil {
-			return nil, 0, err
-		}
+		case syntax.OpTypeMin, syntax.OpTypeMax:
+			if syntax.ReducesLabels(expr) {
+				// skip sharding optimizations at this level. If labels are reduced,
+				// the same series may exist on multiple shards and must be aggregated
+				// together before a max|min is applied
+				break
+			}
+			// max(x) -> max(max(x, shard=1) ++ max(x, shard=2)...)
+			// min(x) -> min(min(x, shard=1) ++ min(x, shard=2)...)
+			return m.wrappedShardedVectorAggr(expr, r)
+
+		case syntax.OpTypeAvg:
+			// avg(x) -> sum(x)/count(x), which is parallelizable
+			lhs, lhsBytesPerShard, err := m.mapVectorAggregationExpr(&syntax.VectorAggregationExpr{
+				Left:      expr.Left,
+				Grouping:  expr.Grouping,
+				Operation: syntax.OpTypeSum,
+			}, r)
+			if err != nil {
+				return nil, 0, err
+			}

-		// We take the maximum bytes per shard of both sides of the operation
-		bytesPerShard := uint64(math.Max(int(lhsBytesPerShard), int(rhsBytesPerShard)))
+			rhs, rhsBytesPerShard, err := m.mapVectorAggregationExpr(&syntax.VectorAggregationExpr{
+				Left:      expr.Left,
+				Grouping:  expr.Grouping,
+				Operation: syntax.OpTypeCount,
+			}, r)
+			if err != nil {
+				return nil, 0, err
+			}

-		return &syntax.BinOpExpr{
-			SampleExpr: lhs,
-			RHS:        rhs,
-			Op:         syntax.OpTypeDiv,
-		}, bytesPerShard, nil
+			// We take the maximum bytes per shard of both sides of the operation
+			bytesPerShard := uint64(math.Max(int(lhsBytesPerShard), int(rhsBytesPerShard)))
+
+			return &syntax.BinOpExpr{
+				SampleExpr: lhs,
+				RHS:        rhs,
+				Op:         syntax.OpTypeDiv,
+			}, bytesPerShard, nil
+
+		case syntax.OpTypeCount:
+			if syntax.ReducesLabels(expr) {
+				// skip sharding optimizations at this level. If labels are reduced,
+				// the same series may exist on multiple shards and must be aggregated
+				// together before a count is applied
+				break
+			}

-	case syntax.OpTypeCount:
-		// count(x) -> sum(count(x, shard=1) ++ count(x, shard=2)...)
-		sharded, bytesPerShard, err := m.mapSampleExpr(expr, r)
-		if err != nil {
-			return nil, 0, err
-		}
-		return &syntax.VectorAggregationExpr{
-			Left:      sharded,
-			Grouping:  expr.Grouping,
-			Operation: syntax.OpTypeSum,
-		}, bytesPerShard, nil
-	default:
-		// this should not be reachable. If an operation is shardable it should
-		// have an optimization listed.
-		level.Warn(util_log.Logger).Log(
-			"msg", "unexpected operation which appears shardable, ignoring",
-			"operation", expr.Operation,
-		)
-		exprStats, err := m.shards.GetStats(expr)
-		if err != nil {
-			return nil, 0, err
+			// count(x) -> sum(count(x, shard=1) ++ count(x, shard=2)...)
+			sharded, bytesPerShard, err := m.mapSampleExpr(expr, r)
+			if err != nil {
+				return nil, 0, err
+			}
+			return &syntax.VectorAggregationExpr{
+				Left:      sharded,
+				Grouping:  expr.Grouping,
+				Operation: syntax.OpTypeSum,
+			}, bytesPerShard, nil
+		default:
+			// this should not be reachable. If an operation is shardable it should
+			// have an optimization listed. Nonetheless, we log this as a warning
+			// and return the original expression unsharded.
+			level.Warn(util_log.Logger).Log(
+				"msg", "unexpected operation which appears shardable, ignoring",
+				"operation", expr.Operation,
+			)
+			exprStats, err := m.shards.GetStats(expr)
+			if err != nil {
+				return nil, 0, err
+			}
+			return expr, exprStats.Bytes, nil
 		}
-		return expr, exprStats.Bytes, nil
+
 	}
+
+	// if this AST contains unshardable operations, don't shard this at this level,
+	// but attempt to shard a child node.
+	subMapped, bytesPerShard, err := m.Map(expr.Left, r)
+	if err != nil {
+		return nil, 0, err
+	}
+	sampleExpr, ok := subMapped.(syntax.SampleExpr)
+	if !ok {
+		return nil, 0, badASTMapping(subMapped)
+	}
+
+	return &syntax.VectorAggregationExpr{
+		Left:      sampleExpr,
+		Grouping:  expr.Grouping,
+		Params:    expr.Params,
+		Operation: expr.Operation,
+	}, bytesPerShard, nil
+
 }

 func (m ShardMapper) mapLabelReplaceExpr(expr *syntax.LabelReplaceExpr, r *downstreamRecorder) (syntax.SampleExpr, uint64, error) {
@ -283,52 +312,77 @@ func (m ShardMapper) mapLabelReplaceExpr(expr *syntax.LabelReplaceExpr, r *downs
 }

 func (m ShardMapper) mapRangeAggregationExpr(expr *syntax.RangeAggregationExpr, r *downstreamRecorder) (syntax.SampleExpr, uint64, error) {
-	if hasLabelModifier(expr) {
-		// if an expr can modify labels this means multiple shards can return the same labelset.
-		// When this happens the merge strategy needs to be different from a simple concatenation.
-		// For instance for rates we need to sum data from different shards but same series.
-		// Since we currently support only concatenation as merge strategy, we skip those queries.
+	if !expr.Shardable() {
 		exprStats, err := m.shards.GetStats(expr)
 		if err != nil {
 			return nil, 0, err
 		}
-
 		return expr, exprStats.Bytes, nil
 	}

 	switch expr.Operation {
-	case syntax.OpRangeTypeCount, syntax.OpRangeTypeRate, syntax.OpRangeTypeBytesRate, syntax.OpRangeTypeBytes:
-		// count_over_time(x) -> count_over_time(x, shard=1) ++ count_over_time(x, shard=2)...
-		// rate(x) -> rate(x, shard=1) ++ rate(x, shard=2)...
-		// same goes for bytes_rate and bytes_over_time
-		return m.mapSampleExpr(expr, r)
+
+	case syntax.OpRangeTypeCount, syntax.OpRangeTypeRate, syntax.OpRangeTypeBytes, syntax.OpRangeTypeBytesRate, syntax.OpRangeTypeSum, syntax.OpRangeTypeMax, syntax.OpRangeTypeMin:
+		// if the expr can reduce labels, it can cause the same labelset to
+		// exist on separate shards and we'll need to merge the results
+		// accordingly. If it does not reduce labels and has no special grouping
+		// aggregation, we can shard it as normal via concatenation.
+		potentialConflict := syntax.ReducesLabels(expr)
+		if !potentialConflict && (expr.Grouping == nil || expr.Grouping.Noop()) {
+			return m.mapSampleExpr(expr, r)
+		}
+
+		// These functions require a different merge strategy than the default
+		// concatenation.
+		// This is because the same label sets may exist on multiple shards when label-reducing parsing is applied or when
+		// grouping by some subset of the labels. In this case, the resulting vector may have multiple values for the same
+		// series and we need to combine them appropriately given a particular operation.
+		mergeMap := map[string]string{
+			// all these may be summed
+			syntax.OpRangeTypeCount:     syntax.OpTypeSum,
+			syntax.OpRangeTypeRate:      syntax.OpTypeSum,
+			syntax.OpRangeTypeBytes:     syntax.OpTypeSum,
+			syntax.OpRangeTypeBytesRate: syntax.OpTypeSum,
+			syntax.OpRangeTypeSum:       syntax.OpTypeSum,
+
+			// min & max require taking the min|max of the shards
+			syntax.OpRangeTypeMin: syntax.OpTypeMin,
+			syntax.OpRangeTypeMax: syntax.OpTypeMax,
+		}
+
+		// range aggregation groupings default to `without ()` behavior
+		// so we explicitly set the wrapping vector aggregation to this
+		// for parity when it's not explicitly set
+		grouping := expr.Grouping
+		if grouping == nil {
+			grouping = &syntax.Grouping{Without: true}
+		}
+
+		mapped, bytes, err := m.mapSampleExpr(expr, r)
+		// max_over_time(_) -> max without() (max_over_time(_) ++ max_over_time(_)...)
+		// max_over_time(_) by (foo) -> max by (foo) (max_over_time(_) by (foo) ++ max_over_time(_) by (foo)...)
+		merger, ok := mergeMap[expr.Operation]
+		if !ok {
+			return nil, 0, fmt.Errorf(
+				"error while finding merge operation for %s", expr.Operation,
+			)
+		}
+		return &syntax.VectorAggregationExpr{
+			Left:      mapped,
+			Grouping:  grouping,
+			Operation: merger,
+		}, bytes, err
+
 	default:
-		// This part of the query is not shardable, so the bytesPerShard is the bytes for all the log matchers in expr
+		// don't shard if there's not an appropriate optimization
 		exprStats, err := m.shards.GetStats(expr)
 		if err != nil {
 			return nil, 0, err
 		}
-
 		return expr, exprStats.Bytes, nil
 	}
 }

-// hasLabelModifier tells if an expression contains pipelines that can modify stream labels
-// parsers introduce new labels but does not alter original one for instance.
-func hasLabelModifier(expr *syntax.RangeAggregationExpr) bool {
-	switch ex := expr.Left.Left.(type) {
-	case *syntax.MatchersExpr:
-		return false
-	case *syntax.PipelineExpr:
-		for _, p := range ex.MultiStages {
-			if _, ok := p.(*syntax.LabelFmtExpr); ok {
-				return true
-			}
-		}
-	}
-	return false
-}
-
 func badASTMapping(got syntax.Expr) error {
 	return fmt.Errorf("bad AST mapping: expected SampleExpr, but got (%T)", got)
 }
--- a/pkg/logql/shardmapper_test.go
+++ b/pkg/logql/shardmapper_test.go
@ -154,30 +154,91 @@ func TestMappingStrings(t *testing.T) {
 		{
 			in: `sum(max(rate({foo="bar"}[5m])))`,
 			out: `sum(max(
-				downstream<rate({foo="bar"}[5m]), shard=0_of_2>
-				++ downstream<rate({foo="bar"}[5m]), shard=1_of_2>
+				downstream<max(rate({foo="bar"}[5m])), shard=0_of_2>
+				++ downstream<max(rate({foo="bar"}[5m])), shard=1_of_2>
 			))`,
 		},
 		{
-			in:  `sum(max(rate({foo="bar"} | json | label_format foo=bar [5m])))`,
-			out: `sum(max(rate({foo="bar"} | json | label_format foo=bar [5m])))`,
+			in: `max without (env) (rate({foo="bar"}[5m]))`,
+			out: `max without (env) (
+				downstream<max without (env)(rate({foo="bar"}[5m])), shard=0_of_2> ++ downstream<max without (env)(rate({foo="bar"}[5m])), shard=1_of_2>
+			)`,
+		},
+		{
+			in: `sum(max(rate({foo="bar"} | json | label_format foo=bar [5m])))`,
+			out: `sum(
+				max(
+					sum without() (
+						downstream<rate({foo="bar"}|json|label_formatfoo=bar[5m]),shard=0_of_2>
+						++
+						downstream<rate({foo="bar"}|json|label_formatfoo=bar[5m]),shard=1_of_2>
+					)
+				)
+			)`,
+		},
+		{
+			in: `max(sum by (abc) (rate({foo="bar"} | json | label_format bazz=buzz [5m])))`,
+			out: `max(
+				sum by (abc) (
+					downstream<sumby(abc)(rate({foo="bar"}|json|label_formatbazz=buzz[5m])),shard=0_of_2>
+					++
+					downstream<sumby(abc)(rate({foo="bar"}|json|label_formatbazz=buzz[5m])),shard=1_of_2>
+				)
+			)`,
 		},
 		{
-			in:  `rate({foo="bar"} | json | label_format foo=bar [5m])`,
-			out: `rate({foo="bar"} | json | label_format foo=bar [5m])`,
+			in: `rate({foo="bar"} | json | label_format foo=bar [5m])`,
+			out: `sum without()(
+				downstream<rate({foo="bar"}|json|label_formatfoo=bar[5m]),shard=0_of_2>
+				++
+				downstream<rate({foo="bar"}|json|label_formatfoo=bar[5m]),shard=1_of_2>
+			)`,
 		},
 		{
 			in: `count(rate({foo="bar"} | json [5m]))`,
-			out: `count(
-				downstream<rate({foo="bar"} | json [5m]), shard=0_of_2>
-				++ downstream<rate({foo="bar"} | json [5m]), shard=1_of_2>
+			out: `sum(
+				downstream<count(rate({foo="bar"}|json[5m])),shard=0_of_2>
+				++
+				downstream<count(rate({foo="bar"}|json[5m])),shard=1_of_2>
 			)`,
 		},
 		{
 			in: `avg(rate({foo="bar"} | json [5m]))`,
-			out: `avg(
-				downstream<rate({foo="bar"} | json [5m]), shard=0_of_2>
-				++ downstream<rate({foo="bar"} | json [5m]), shard=1_of_2>
+			out: `(
+				sum(
+					downstream<sum(rate({foo="bar"}|json[5m])),shard=0_of_2>++downstream<sum(rate({foo="bar"}|json[5m])),shard=1_of_2>
+				)
+				/
+				sum(
+					downstream<count(rate({foo="bar"}|json[5m])),shard=0_of_2>++downstream<count(rate({foo="bar"}|json[5m])),shard=1_of_2>
+				)
+			)`,
+		},
+		{
+			in: `count(rate({foo="bar"} | json | keep foo [5m]))`,
+			out: `count(
+				sum without()(
+					downstream<rate({foo="bar"}|json|keepfoo[5m]),shard=0_of_2>
+					++
+					downstream<rate({foo="bar"}|json|keepfoo[5m]),shard=1_of_2>
+				)
+			)`,
+		},
+		{
+			// renaming reduces the labelset and must be reaggregated before counting
+			in: `count(rate({foo="bar"} | json | label_format foo=bar [5m]))`,
+			out: `count(
+				sum without() (
+					downstream<rate({foo="bar"}|json|label_formatfoo=bar[5m]),shard=0_of_2>
+					++
+					downstream<rate({foo="bar"}|json|label_formatfoo=bar[5m]),shard=1_of_2>
+				)
+			)`,
+		},
+		{
+			in: `sum without () (rate({job="foo"}[5m]))`,
+			out: `sumwithout()(
+				downstream<sumwithout()(rate({job="foo"}[5m])),shard=0_of_2>++downstream<sumwithout()(rate({job="foo"}[5m])),shard=1_of_2>
 			)`,
 		},
 		{
@ -223,9 +284,12 @@ func TestMappingStrings(t *testing.T) {
 				)`,
 		},
 		{
-			// Ensure we don't try to shard expressions that include label reformatting.
-			in:  `sum(count_over_time({foo="bar"} | logfmt | label_format bar=baz | bar="buz" [5m]))`,
-			out: `sum(count_over_time({foo="bar"} | logfmt | label_format bar=baz | bar="buz" [5m]))`,
+			in: `sum(count_over_time({foo="bar"} | logfmt | label_format bar=baz | bar="buz" [5m])) by (bar)`,
+			out: `sum by (bar) (
+				downstream<sum by (bar) (count_over_time({foo="bar"}|logfmt|label_formatbar=baz|bar="buz"[5m])),shard=0_of_2>
+				++
+				downstream<sum by (bar) (count_over_time({foo="bar"}|logfmt|label_formatbar=baz|bar="buz"[5m])),shard=1_of_2>
+			)`,
 		},
 		{
 			in: `sum by (cluster) (rate({foo="bar"} [5m])) + ignoring(machine) sum by (cluster,machine) (rate({foo="bar"} [5m]))`,
@ -255,6 +319,14 @@ func TestMappingStrings(t *testing.T) {
 				)
 			)`,
 		},
+		{
+			in: `max_over_time({foo="ugh"} | unwrap baz [1m]) by ()`,
+			out: `max(
+				downstream<max_over_time({foo="ugh"}|unwrapbaz[1m])by(),shard=0_of_2>
+				++
+				downstream<max_over_time({foo="ugh"}|unwrapbaz[1m])by(),shard=1_of_2>
+			)`,
+		},
 		{
 			in:  `avg(avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
 			out: `avg(avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
@ -554,51 +626,6 @@ func TestMapping(t *testing.T) {
 				},
 			},
 		},
-		{
-			in: `max without (env) (rate({foo="bar"}[5m]))`,
-			expr: &syntax.VectorAggregationExpr{
-				Grouping: &syntax.Grouping{
-					Without: true,
-					Groups:  []string{"env"},
-				},
-				Operation: syntax.OpTypeMax,
-				Left: &ConcatSampleExpr{
-					DownstreamSampleExpr: DownstreamSampleExpr{
-						shard: &astmapper.ShardAnnotation{
-							Shard: 0,
-							Of:    2,
-						},
-						SampleExpr: &syntax.RangeAggregationExpr{
-							Operation: syntax.OpRangeTypeRate,
-							Left: &syntax.LogRange{
-								Left: &syntax.MatchersExpr{
-									Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "foo", "bar")},
-								},
-								Interval: 5 * time.Minute,
-							},
-						},
-					},
-					next: &ConcatSampleExpr{
-						DownstreamSampleExpr: DownstreamSampleExpr{
-							shard: &astmapper.ShardAnnotation{
-								Shard: 1,
-								Of:    2,
-							},
-							SampleExpr: &syntax.RangeAggregationExpr{
-								Operation: syntax.OpRangeTypeRate,
-								Left: &syntax.LogRange{
-									Left: &syntax.MatchersExpr{
-										Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "foo", "bar")},
-									},
-									Interval: 5 * time.Minute,
-								},
-							},
-						},
-						next: nil,
-					},
-				},
-			},
-		},
 		{
 			in: `count(rate({foo="bar"}[5m]))`,
 			expr: &syntax.VectorAggregationExpr{
@ -871,53 +898,6 @@ func TestMapping(t *testing.T) {
 				},
 			},
 		},
-		// sum(max) should not shard the maxes
-		{
-			in: `sum(max(rate({foo="bar"}[5m])))`,
-			expr: &syntax.VectorAggregationExpr{
-				Grouping:  &syntax.Grouping{},
-				Operation: syntax.OpTypeSum,
-				Left: &syntax.VectorAggregationExpr{
-					Grouping:  &syntax.Grouping{},
-					Operation: syntax.OpTypeMax,
-					Left: &ConcatSampleExpr{
-						DownstreamSampleExpr: DownstreamSampleExpr{
-							shard: &astmapper.ShardAnnotation{
-								Shard: 0,
-								Of:    2,
-							},
-							SampleExpr: &syntax.RangeAggregationExpr{
-								Operation: syntax.OpRangeTypeRate,
-								Left: &syntax.LogRange{
-									Left: &syntax.MatchersExpr{
-										Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "foo", "bar")},
-									},
-									Interval: 5 * time.Minute,
-								},
-							},
-						},
-						next: &ConcatSampleExpr{
-							DownstreamSampleExpr: DownstreamSampleExpr{
-								shard: &astmapper.ShardAnnotation{
-									Shard: 1,
-									Of:    2,
-								},
-								SampleExpr: &syntax.RangeAggregationExpr{
-									Operation: syntax.OpRangeTypeRate,
-									Left: &syntax.LogRange{
-										Left: &syntax.MatchersExpr{
-											Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "foo", "bar")},
-										},
-										Interval: 5 * time.Minute,
-									},
-								},
-							},
-							next: nil,
-						},
-					},
-				},
-			},
-		},
 		// max(count) should shard the count, but not the max
 		{
 			in: `max(count(rate({foo="bar"}[5m])))`,
--- a/pkg/logql/syntax/ast.go
+++ b/pkg/logql/syntax/ast.go
@ -669,7 +669,11 @@ func newLabelFmtExpr(fmts []log.LabelFmt) *LabelFmtExpr {
 	}
 }

-func (e *LabelFmtExpr) Shardable() bool { return false }
+func (e *LabelFmtExpr) Shardable() bool {
+	// While LabelFmt is shardable in certain cases, it is not always,
+	// but this is left to the shardmapper to determine
+	return true
+}

 func (e *LabelFmtExpr) Walk(f WalkFn) { f(e) }

@ -1219,28 +1223,30 @@ type Grouping struct {
 func (g Grouping) String() string {
 	var sb strings.Builder

-	if g.Groups == nil {
-		return ""
-	}
-
 	if g.Without {
 		sb.WriteString(" without ")
 	} else {
 		sb.WriteString(" by ")
 	}

-	if len(g.Groups) > 0 {
-		sb.WriteString("(")
-		sb.WriteString(strings.Join(g.Groups, ","))
-		sb.WriteString(")")
-	}
-	if len(g.Groups) == 0 {
-		sb.WriteString("()")
-	}
+	sb.WriteString("(")
+	sb.WriteString(strings.Join(g.Groups, ","))
+	sb.WriteString(")")

 	return sb.String()
 }

+// whether grouping doesn't change the result
+func (g Grouping) Noop() bool {
+	return len(g.Groups) == 0 && g.Without
+}
+
+// whether grouping reduces the result to a single value
+// with no labels
+func (g Grouping) Singleton() bool {
+	return len(g.Groups) == 0 && !g.Without
+}
+
 // VectorAggregationExpr all vector aggregation expressions support grouping by/without label(s),
 // therefore the Grouping struct can never be nil.
 type VectorAggregationExpr struct {
@ -1340,33 +1346,60 @@ func (e *VectorAggregationExpr) String() string {
 			params = []string{e.Left.String()}
 		}
 	}
-	return formatOperation(e.Operation, e.Grouping, params...)
+	return formatVectorOperation(e.Operation, e.Grouping, params...)
 }

 // impl SampleExpr
 func (e *VectorAggregationExpr) Shardable() bool {
-	if e.Operation == OpTypeCount || e.Operation == OpTypeAvg {
-		if !e.Left.Shardable() {
-			return false
-		}
+	if !shardableOps[e.Operation] || !e.Left.Shardable() {
+		return false
+	}
+
+	switch e.Operation {
+
+	case OpTypeCount, OpTypeAvg:
 		// count is shardable if labels are not mutated
-		// otherwise distinct values can be counted twice per shard
-		shardable := true
-		e.Left.Walk(func(e interface{}) {
-			switch e.(type) {
-			// LabelParserExpr is normally shardable, but not in this case.
-			// TODO(owen-d): I think LabelParserExpr is shardable
-			// for avg, but not for count. Let's refactor to make this
-			// cleaner. For now I'm disallowing sharding on both.
-			case *LabelParserExpr:
-				shardable = false
-			case *LogfmtParserExpr:
-				shardable = false
-			}
-		})
+		// otherwise distinct values can be present in multiple shards and
+		// counted twice.
+		// avg is similar since it's remapped to sum/count.
+		// TODO(owen-d): this is hard to figure out; we should refactor to
+		// make these relationships clearer, safer, and more extensible.
+		shardable := !ReducesLabels(e.Left)
+
 		return shardable
+
+	case OpTypeMax, OpTypeMin:
+		// max(<range_aggr>) can be sharded by pushing down the max|min aggregation,
+		// but max(<vector_aggr>) cannot. It needs to perform the
+		// aggregation on the total result set, and then pick the max|min.
+		// For instance, `max(max_over_time)` or `max(rate)` can turn into
+		// `max( max(rate(shard1)) ++ max(rate(shard2)) ... etc)`,
+		// but you can’t do
+		// `max( max(sum(rate(shard1))) ++ max(sum(rate(shard2))) ... etc)`
+		// because it’s only taking the maximum from each shard,
+		// but we actually need to sum all the shards then put the max on top
+		if _, ok := e.Left.(*RangeAggregationExpr); ok {
+			return true
+		}
+		return false
+
+	case OpTypeSum:
+		// sum can shard & merge vector & range aggregations, but only if
+		// the resulting computation is commutative and associative.
+		// This does not apply to min & max, because while `min(min(min))`
+		// satisfies the above, sum( sum(min(shard1)) ++ sum(min(shard2)) )
+		// does not
+		if child, ok := e.Left.(*VectorAggregationExpr); ok {
+			switch child.Operation {
+			case OpTypeMin, OpTypeMax:
+				return false
+			}
+		}
+		return true
+
 	}
-	return shardableOps[e.Operation] && e.Left.Shardable()
+
+	return true
 }

 func (e *VectorAggregationExpr) Walk(f WalkFn) {
@ -1823,7 +1856,7 @@ func (e *LiteralExpr) Value() (float64, error) {

 // helper used to impl Stringer for vector and range aggregations
 // nolint:interfacer
-func formatOperation(op string, grouping *Grouping, params ...string) string {
+func formatVectorOperation(op string, grouping *Grouping, params ...string) string {
 	nonEmptyParams := make([]string, 0, len(params))
 	for _, p := range params {
 		if p != "" {
@ -1833,7 +1866,7 @@ func formatOperation(op string, grouping *Grouping, params ...string) string {

 	var sb strings.Builder
 	sb.WriteString(op)
-	if grouping != nil {
+	if grouping != nil && !grouping.Singleton() {
 		sb.WriteString(grouping.String())
 	}
 	sb.WriteString("(")
@ -1921,7 +1954,9 @@ func (e *LabelReplaceExpr) String() string {
 	return sb.String()
 }

-// shardableOps lists the operations which may be sharded.
+// shardableOps lists the operations which may be sharded, but are not
+// guaranteed to be. See the `Shardable()` implementations
+// on the respective expr types for more details.
 // topk, botk, max, & min all must be concatenated and then evaluated in order to avoid
 // potential data loss due to series distribution across shards.
 // For example, grouping by `cluster` for a `max` operation may yield
@ -1944,6 +1979,8 @@ var shardableOps = map[string]bool{
 	// avg is only marked as shardable because we remap it into sum/count.
 	OpTypeAvg:   true,
 	OpTypeCount: true,
+	OpTypeMax:   true,
+	OpTypeMin:   true,

 	// range vector ops
 	OpRangeTypeCount:     true,
@ -2027,3 +2064,30 @@ func (e *VectorExpr) Pipeline() (log.Pipeline, error)         { return log.NewNo
 func (e *VectorExpr) Matchers() []*labels.Matcher             { return nil }
 func (e *VectorExpr) MatcherGroups() ([]MatcherRange, error)  { return nil, e.err }
 func (e *VectorExpr) Extractor() (log.SampleExtractor, error) { return nil, nil }
+
+func ReducesLabels(e Expr) (conflict bool) {
+	e.Walk(func(e interface{}) {
+		switch expr := e.(type) {
+		// Technically, any parser that mutates labels could cause the query
+		// to be non-shardable _if_ the total (inherent+extracted) labels
+		// exist on two different shards, but this is incredibly unlikely
+		// for parsers which add new labels so I (owen-d) am preferring
+		// to continue sharding in those cases and only prevent sharding
+		// when using `drop` or `keep` which reduce labels to a smaller subset
+		// more likely to collide across shards.
+		case *KeepLabelsExpr, *DropLabelsExpr:
+			conflict = true
+		case *LabelFmtExpr:
+			// TODO(owen-d): renaming is shardable in many cases, but will
+			// likely require a `sum without ()` wrapper to combine the
+			// same extracted labelsets executed on different shards
+			for _, f := range expr.Formats {
+				if f.Rename {
+					conflict = true
+					break
+				}
+			}
+		}
+	})
+	return
+}
--- a/pkg/logql/syntax/ast_test.go
+++ b/pkg/logql/syntax/ast_test.go
@ -711,7 +711,7 @@ func TestGroupingString(t *testing.T) {
 		Groups:  nil,
 		Without: false,
 	}
-	require.Equal(t, "", g.String())
+	require.Equal(t, " by ()", g.String())

 	g = Grouping{
 		Groups:  []string{"a", "b"},
@ -729,5 +729,5 @@ func TestGroupingString(t *testing.T) {
 		Groups:  nil,
 		Without: true,
 	}
-	require.Equal(t, "", g.String())
+	require.Equal(t, " without ()", g.String())
 }
--- a/pkg/querier/queryrange/querysharding_test.go
+++ b/pkg/querier/queryrange/querysharding_test.go
@ -208,7 +208,7 @@ func Test_astMapper_QuerySizeLimits(t *testing.T) {
 		},
 		{
 			desc:                     "Non shardable query too big",
-			query:                    `sum_over_time({app="foo"} |= "foo" | unwrap foo [1h])`,
+			query:                    `avg_over_time({job="foo"} | json busy="utilization" | unwrap busy [5m])`,
 			maxQuerierBytesSize:      10,
 			err:                      fmt.Sprintf(limErrQuerierTooManyBytesUnshardableTmpl, "100 B", "10 B"),
 			expectedStatsHandlerHits: 1,