fix(promql): histogram_quantile and histogram_fraction NaN observed in native histogram (#16724)

* fix(promql): histogram_quantile NaN observed in native histogram Fixes: #16578 See the issue for detailed explanation. When a histogram had only NaN observations and no normal observations, we returned 0 from the quantile, which is completely wrong. If there were normal observations but we went over them, we returned the upper bound of the existing buckets, however that contradicts expectations on histogram_fraction. Now we return NaN if the quantile is calculated to be over all normal observations, falling into NaNs (in a virtual +Inf bucket). We also return info level annotations if we see any NaN observations. The annotation calls out if we returned NaN or even if we took the virtual +Inf bucket into account. Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com> * fix(promql): histogram_fraction NaN observed in native histogram Fixes: #16580 According to the specification we should not take NaN observations into account when calculating the fraction. This commit fixes that and adds an info level annotation to let the user know about this. Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>
1 day ago · 5b7ff92d95
parent 31e158b749
commit 5b7ff92d95
5 changed files with 167 additions and 29 deletions
--- a/docs/querying/functions.md
+++ b/docs/querying/functions.md
@ -293,6 +293,10 @@ boundaries, the function uses interpolation to estimate the fraction. With the
 resulting uncertainty, it becomes irrelevant if the boundaries are inclusive or
 exclusive.

+Special case for native histograms with standard exponential buckets:
+`NaN` observations are considered outside of any buckets in this case.
+`histogram_fraction(-Inf, +Inf, b)` effectively returns the fraction of
+non-`NaN` observations and may therefore be less than 1.

 ## `histogram_quantile()`

@ -385,9 +389,16 @@ Special cases for classic histograms:
  is applied within that bucket. Otherwise, the upper bound of the lowest
  bucket is returned for quantiles located in the lowest bucket.

-Special cases for native histograms (relevant for the exact interpolation
-happening within the zero bucket):
+Special cases for native histograms:

+* If a native histogram with standard exponential buckets has `NaN`
+  observations and the quantile falls into one of the existing exponential
+  buckets, the result is skewed towards higher values due to `NaN`
+  observations treated as `+Inf`. This is flagged with an info level
+  annotation.
+* If a native histogram with standard exponential buckets has `NaN`
+  observations and the quantile falls above all of the existing exponential
+  buckets, `NaN` is returned. This is flagged with an info level annotation.
 * A zero bucket with finite width is assumed to contain no negative
  observations if the histogram has observations in positive buckets, but none
  in negative buckets.
--- a/promql/functions.go
+++ b/promql/functions.go
@ -1364,9 +1364,11 @@ func funcHistogramFraction(vals []parser.Value, args parser.Expressions, enh *Ev
 		if !enh.enableDelayedNameRemoval {
 			sample.Metric = sample.Metric.DropReserved(schema.IsMetadataLabel)
 		}
+		hf, hfAnnos := HistogramFraction(lower, upper, sample.H, sample.Metric.Get(model.MetricNameLabel), args[0].PositionRange())
+		annos.Merge(hfAnnos)
 		enh.Out = append(enh.Out, Sample{
 			Metric:   sample.Metric,
-			F:        HistogramFraction(lower, upper, sample.H),
+			F:        hf,
 			DropName: true,
 		})
 	}
@ -1410,9 +1412,11 @@ func funcHistogramQuantile(vals []parser.Value, args parser.Expressions, enh *Ev
 		if !enh.enableDelayedNameRemoval {
 			sample.Metric = sample.Metric.DropReserved(schema.IsMetadataLabel)
 		}
+		hq, hqAnnos := HistogramQuantile(q, sample.H, sample.Metric.Get(model.MetricNameLabel), args[0].PositionRange())
+		annos.Merge(hqAnnos)
 		enh.Out = append(enh.Out, Sample{
 			Metric:   sample.Metric,
-			F:        HistogramQuantile(q, sample.H),
+			F:        hq,
 			DropName: true,
 		})
 	}
--- a/promql/promqltest/testdata/native_histograms.test
+++ b/promql/promqltest/testdata/native_histograms.test
@ -44,6 +44,7 @@ eval instant at 1m histogram_fraction(1, 2, single_histogram)

 # We expect all values to fall in the range 0 < x <= 8.
 eval instant at 1m histogram_fraction(0, 8, single_histogram)
+    expect no_info
 	{} 1

 # Median is 1.414213562373095 (2**2**-1, or sqrt(2)) due to
@ -51,6 +52,7 @@ eval instant at 1m histogram_fraction(0, 8, single_histogram)
 # 2 is assumed where the bucket boundary would be if we increased the
 # resolution of the histogram by one step.
 eval instant at 1m histogram_quantile(0.5, single_histogram)
+	expect no_info
 	{} 1.414213562373095

 clear
@ -1328,3 +1330,46 @@ eval range from 0s to 60s step 15s last_over_time({__name__="http_request_durati
    {__name__="http_request_duration_seconds", pod="nginx-1"} {{count:3 sum:14 buckets:[1 2]}}x4

 clear
+
+# Test native histogram quantile and fraction when the native histogram with exponential
+# buckets has NaN observations.
+load 1m
+    histogram_nan{case="100% NaNs"} {{schema:0 count:0 sum:0}} {{schema:0 count:3 sum:NaN}}
+    histogram_nan{case="20% NaNs"} {{schema:0 count:0 sum:0}} {{schema:0 count:15 sum:NaN buckets:[12]}}
+
+eval instant at 1m histogram_quantile(1, histogram_nan)
+    expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan"
+    {case="100% NaNs"} NaN
+    {case="20% NaNs"} NaN
+
+eval instant at 1m histogram_quantile(0.81, histogram_nan)
+    expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan"
+    {case="100% NaNs"} NaN
+    {case="20% NaNs"} NaN
+
+eval instant at 1m histogram_quantile(0.8, histogram_nan{case="100% NaNs"})
+    expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan"
+    {case="100% NaNs"} NaN
+
+eval instant at 1m histogram_quantile(0.8, histogram_nan{case="20% NaNs"})
+    expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is skewed higher for metric name "histogram_nan"
+    {case="20% NaNs"} 1
+
+eval instant at 1m histogram_quantile(0.4, histogram_nan{case="100% NaNs"})
+    expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan"
+    {case="100% NaNs"} NaN
+
+# histogram_quantile and histogram_fraction equivalence if quantile is not NaN
+eval instant at 1m histogram_quantile(0.4, histogram_nan{case="20% NaNs"})
+    expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is skewed higher for metric name "histogram_nan"
+    {case="20% NaNs"} 0.7071067811865475
+
+eval instant at 1m histogram_fraction(-Inf, 0.7071067811865475, histogram_nan)
+    expect info msg: PromQL info: input to histogram_fraction has NaN observations, which are excluded from all fractions for metric name "histogram_nan"
+    {case="100% NaNs"} 0.0
+    {case="20% NaNs"} 0.4
+
+eval instant at 1m histogram_fraction(-Inf, +Inf, histogram_nan)
+    expect info msg: PromQL info: input to histogram_fraction has NaN observations, which are excluded from all fractions for metric name "histogram_nan"
+    {case="100% NaNs"} 0.0
+    {case="20% NaNs"} 0.8
--- a/promql/quantile.go
+++ b/promql/quantile.go
@ -20,7 +20,9 @@ import (

 	"github.com/prometheus/prometheus/model/histogram"
 	"github.com/prometheus/prometheus/model/labels"
+	"github.com/prometheus/prometheus/promql/parser/posrange"
 	"github.com/prometheus/prometheus/util/almost"
+	"github.com/prometheus/prometheus/util/annotations"
 )

 // smallDeltaTolerance is the threshold for relative deltas between classic
@ -195,24 +197,34 @@ func BucketQuantile(q float64, buckets Buckets) (float64, bool, bool) {
 //
 // If q is NaN, NaN is returned.
 //
+// If the native histogram has NaN observations and the quantile falls into
+// an existing bucket, then an additional info level annotation is returned
+// informing the user about possible skew to higher values as NaNs are
+// considered +Inf in this case.
+//
+// If the native histogram has NaN observations and the quantile falls above
+// all existing buckets, then NaN is returned along with an additional info
+// level annotation informing the user that this has happened.
+//
 // HistogramQuantile is for calculating the histogram_quantile() of native
 // histograms. See also: BucketQuantile for classic histograms.
 //
 // HistogramQuantile is exported as it may be used by other PromQL engine
 // implementations.
-func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
+func HistogramQuantile(q float64, h *histogram.FloatHistogram, metricName string, pos posrange.PositionRange) (float64, annotations.Annotations) {
 	if q < 0 {
-		return math.Inf(-1)
+		return math.Inf(-1), nil
 	}
 	if q > 1 {
-		return math.Inf(+1)
+		return math.Inf(+1), nil
 	}

 	if h.Count == 0 || math.IsNaN(q) {
-		return math.NaN()
+		return math.NaN(), nil
 	}

 	var (
+		annos  annotations.Annotations
 		bucket histogram.Bucket[float64]
 		count  float64
 		it     histogram.BucketIterator[float64]
@ -255,12 +267,12 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
 		if bucket.Lower == math.Inf(-1) {
 			// first bucket, with lower bound -Inf
 			if bucket.Upper <= 0 {
-				return bucket.Upper
+				return bucket.Upper, annos
 			}
 			bucket.Lower = 0
 		} else if bucket.Upper == math.Inf(1) {
 			// last bucket, with upper bound +Inf
-			return bucket.Lower
+			return bucket.Lower, annos
 		}
 	}
 	// Due to numerical inaccuracies, we could end up with a higher count
@ -270,21 +282,43 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
 	}
 	// We could have hit the highest bucket without even reaching the rank
 	// (this should only happen if the histogram contains observations of
-	// the value NaN), in which case we simply return the upper limit of the
-	// highest explicit bucket.
+	// the value NaN, in which case Sum is also NaN), in which case we simply
+	// return NaN.
+	// See https://github.com/prometheus/prometheus/issues/16578
 	if count < rank {
-		return bucket.Upper
+		if math.IsNaN(h.Sum) {
+			return math.NaN(), annos.Add(annotations.NewNativeHistogramQuantileNaNResultInfo(metricName, pos))
+		}
+		// This should not happen. Either NaNs are in the +Inf bucket (NHCB) and
+		// then count >= rank, or Sum is set to NaN. Might be a precision issue
+		// or something wrong with the histogram, fall back to returning the
+		// upper bound of the highest explicit bucket.
+		return bucket.Upper, annos
 	}

 	// NaN observations increase h.Count but not the total number of
 	// observations in the buckets. Therefore, we have to use the forward
-	// iterator to find percentiles. We recognize histograms containing NaN
-	// observations by checking if their h.Sum is NaN.
+	// iterator to find percentiles.
 	if math.IsNaN(h.Sum) || q < 0.5 {
 		rank -= count - bucket.Count
 	} else {
 		rank = count - rank
 	}
+	// We recognize histograms containing NaN observations by checking if their
+	// h.Sum is NaN and total number of observations is higher than the h.Count.
+	// If the latter is lost in precision, then the skew isn't worth reporting
+	// anyway. If the number is not greater, then the histogram observed -Inf
+	// and +Inf at some point, which made the Sum == NaN.
+	if math.IsNaN(h.Sum) {
+		// Detect if h.Count is greater than sum of buckets.
+		for it.Next() {
+			bucket = it.At()
+			count += bucket.Count
+		}
+		if count < h.Count {
+			annos.Add(annotations.NewNativeHistogramQuantileNaNSkewInfo(metricName, pos))
+		}
+	}

 	// The fraction of how far we are into the current bucket.
 	fraction := rank / bucket.Count
@ -292,7 +326,7 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
 	// Return linear interpolation for custom buckets and for quantiles that
 	// end up in the zero bucket.
 	if h.UsesCustomBuckets() || (bucket.Lower <= 0 && bucket.Upper >= 0) {
-		return bucket.Lower + (bucket.Upper-bucket.Lower)*fraction
+		return bucket.Lower + (bucket.Upper-bucket.Lower)*fraction, annos
 	}

 	// For exponential buckets, we interpolate on a logarithmic scale. On a
@ -305,10 +339,10 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
 	logLower := math.Log2(math.Abs(bucket.Lower))
 	logUpper := math.Log2(math.Abs(bucket.Upper))
 	if bucket.Lower > 0 { // Positive bucket.
-		return math.Exp2(logLower + (logUpper-logLower)*fraction)
+		return math.Exp2(logLower + (logUpper-logLower)*fraction), annos
 	}
 	// Otherwise, we are in a negative bucket and have to mirror things.
-	return -math.Exp2(logUpper + (logLower-logUpper)*(1-fraction))
+	return -math.Exp2(logUpper + (logLower-logUpper)*(1-fraction)), annos
 }

 // HistogramFraction calculates the fraction of observations between the
@ -343,23 +377,29 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
 //
 // If lower >= upper and the histogram has at least 1 observation, zero is returned.
 //
+// If the histogram has NaN observations, these are not considered in any bucket
+// thus histogram_fraction(-Inf, +Inf, v) might be less than 1.0. The function
+// returns an info level annotation in this case.
+//
 // HistogramFraction is exported as it may be used by other PromQL engine
 // implementations.
-func HistogramFraction(lower, upper float64, h *histogram.FloatHistogram) float64 {
+func HistogramFraction(lower, upper float64, h *histogram.FloatHistogram, metricName string, pos posrange.PositionRange) (float64, annotations.Annotations) {
 	if h.Count == 0 || math.IsNaN(lower) || math.IsNaN(upper) {
-		return math.NaN()
+		return math.NaN(), nil
 	}
 	if lower >= upper {
-		return 0
+		return 0, nil
 	}

 	var (
-		rank, lowerRank, upperRank float64
-		lowerSet, upperSet         bool
-		it                         = h.AllBucketIterator()
+		count, rank, lowerRank, upperRank float64
+		lowerSet, upperSet                bool
+		it                                = h.AllBucketIterator()
+		annos                             annotations.Annotations
 	)
 	for it.Next() {
 		b := it.At()
+		count += b.Count
 		zeroBucket := false

 		// interpolateLinearly is used for custom buckets to be
@ -438,14 +478,28 @@ func HistogramFraction(lower, upper float64, h *histogram.FloatHistogram) float6
 		}
 		rank += b.Count
 	}
-	if !lowerSet || lowerRank > h.Count {
-		lowerRank = h.Count
+	if math.IsNaN(h.Sum) {
+		// There might be NaN observations, so we need to adjust
+		// the count to only include non `NaN` observations.
+		for it.Next() {
+			b := it.At()
+			count += b.Count
+		}
+		if count < h.Count {
+			annos.Add(annotations.NewNativeHistogramFractionNaNsInfo(metricName, pos))
+		}
+	} else {
+		count = h.Count
+	}
+
+	if !lowerSet || lowerRank > count {
+		lowerRank = count
 	}
-	if !upperSet || upperRank > h.Count {
-		upperRank = h.Count
+	if !upperSet || upperRank > count {
+		upperRank = count
 	}

-	return (upperRank - lowerRank) / h.Count
+	return (upperRank - lowerRank) / h.Count, annos
 }

 // BucketFraction is a version of HistogramFraction for classic histograms.
--- a/util/annotations/annotations.go
+++ b/util/annotations/annotations.go
@ -152,6 +152,9 @@ var (
 	IncompatibleTypesInBinOpInfo            = fmt.Errorf("%w: incompatible sample types encountered for binary operator", PromQLInfo)
 	HistogramIgnoredInAggregationInfo       = fmt.Errorf("%w: ignored histogram in", PromQLInfo)
 	HistogramIgnoredInMixedRangeInfo        = fmt.Errorf("%w: ignored histograms in a range containing both floats and histograms for metric name", PromQLInfo)
+	NativeHistogramQuantileNaNResultInfo    = fmt.Errorf("%w: input to histogram_quantile has NaN observations, result is NaN for metric name", PromQLInfo)
+	NativeHistogramQuantileNaNSkewInfo      = fmt.Errorf("%w: input to histogram_quantile has NaN observations, result is skewed higher for metric name", PromQLInfo)
+	NativeHistogramFractionNaNsInfo         = fmt.Errorf("%w: input to histogram_fraction has NaN observations, which are excluded from all fractions for metric name", PromQLInfo)
 )

 type annoErr struct {
@ -324,3 +327,24 @@ func NewIncompatibleBucketLayoutInBinOpWarning(operator string, pos posrange.Pos
 		Err:           fmt.Errorf("%w %s", IncompatibleBucketLayoutInBinOpWarning, operator),
 	}
 }
+
+func NewNativeHistogramQuantileNaNResultInfo(metricName string, pos posrange.PositionRange) error {
+	return annoErr{
+		PositionRange: pos,
+		Err:           fmt.Errorf("%w %q", NativeHistogramQuantileNaNResultInfo, metricName),
+	}
+}
+
+func NewNativeHistogramQuantileNaNSkewInfo(metricName string, pos posrange.PositionRange) error {
+	return annoErr{
+		PositionRange: pos,
+		Err:           fmt.Errorf("%w %q", NativeHistogramQuantileNaNSkewInfo, metricName),
+	}
+}
+
+func NewNativeHistogramFractionNaNsInfo(metricName string, pos posrange.PositionRange) error {
+	return annoErr{
+		PositionRange: pos,
+		Err:           fmt.Errorf("%w %q", NativeHistogramFractionNaNsInfo, metricName),
+	}
+}