fix(promql): histogram_quantile and histogram_fraction NaN observed in native histogram (#16724)

* fix(promql): histogram_quantile NaN observed in native histogram

Fixes: #16578

See the issue for detailed explanation.
When a histogram had only NaN observations and no normal observations,
we returned 0 from the quantile, which is completely wrong. If there were
normal observations but we went over them, we returned the upper bound of
the existing buckets, however that contradicts expectations on
histogram_fraction. Now we return NaN if the quantile is calculated to be
over all normal observations, falling into NaNs (in a virtual +Inf bucket).

We also return info level annotations if we see any NaN observations.
The annotation calls out if we returned NaN or even if we took the
virtual +Inf bucket into account.

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>

* fix(promql): histogram_fraction NaN observed in native histogram

Fixes: #16580

According to the specification we should not take NaN observations
into account when calculating the fraction. This commit fixes that
and adds an info level annotation to let the user know about this.

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>
pull/16777/head
George Krajcsovits 1 day ago committed by GitHub
parent 31e158b749
commit 5b7ff92d95
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 15
      docs/querying/functions.md
  2. 8
      promql/functions.go
  3. 45
      promql/promqltest/testdata/native_histograms.test
  4. 104
      promql/quantile.go
  5. 24
      util/annotations/annotations.go

@ -293,6 +293,10 @@ boundaries, the function uses interpolation to estimate the fraction. With the
resulting uncertainty, it becomes irrelevant if the boundaries are inclusive or
exclusive.
Special case for native histograms with standard exponential buckets:
`NaN` observations are considered outside of any buckets in this case.
`histogram_fraction(-Inf, +Inf, b)` effectively returns the fraction of
non-`NaN` observations and may therefore be less than 1.
## `histogram_quantile()`
@ -385,9 +389,16 @@ Special cases for classic histograms:
is applied within that bucket. Otherwise, the upper bound of the lowest
bucket is returned for quantiles located in the lowest bucket.
Special cases for native histograms (relevant for the exact interpolation
happening within the zero bucket):
Special cases for native histograms:
* If a native histogram with standard exponential buckets has `NaN`
observations and the quantile falls into one of the existing exponential
buckets, the result is skewed towards higher values due to `NaN`
observations treated as `+Inf`. This is flagged with an info level
annotation.
* If a native histogram with standard exponential buckets has `NaN`
observations and the quantile falls above all of the existing exponential
buckets, `NaN` is returned. This is flagged with an info level annotation.
* A zero bucket with finite width is assumed to contain no negative
observations if the histogram has observations in positive buckets, but none
in negative buckets.

@ -1364,9 +1364,11 @@ func funcHistogramFraction(vals []parser.Value, args parser.Expressions, enh *Ev
if !enh.enableDelayedNameRemoval {
sample.Metric = sample.Metric.DropReserved(schema.IsMetadataLabel)
}
hf, hfAnnos := HistogramFraction(lower, upper, sample.H, sample.Metric.Get(model.MetricNameLabel), args[0].PositionRange())
annos.Merge(hfAnnos)
enh.Out = append(enh.Out, Sample{
Metric: sample.Metric,
F: HistogramFraction(lower, upper, sample.H),
F: hf,
DropName: true,
})
}
@ -1410,9 +1412,11 @@ func funcHistogramQuantile(vals []parser.Value, args parser.Expressions, enh *Ev
if !enh.enableDelayedNameRemoval {
sample.Metric = sample.Metric.DropReserved(schema.IsMetadataLabel)
}
hq, hqAnnos := HistogramQuantile(q, sample.H, sample.Metric.Get(model.MetricNameLabel), args[0].PositionRange())
annos.Merge(hqAnnos)
enh.Out = append(enh.Out, Sample{
Metric: sample.Metric,
F: HistogramQuantile(q, sample.H),
F: hq,
DropName: true,
})
}

@ -44,6 +44,7 @@ eval instant at 1m histogram_fraction(1, 2, single_histogram)
# We expect all values to fall in the range 0 < x <= 8.
eval instant at 1m histogram_fraction(0, 8, single_histogram)
expect no_info
{} 1
# Median is 1.414213562373095 (2**2**-1, or sqrt(2)) due to
@ -51,6 +52,7 @@ eval instant at 1m histogram_fraction(0, 8, single_histogram)
# 2 is assumed where the bucket boundary would be if we increased the
# resolution of the histogram by one step.
eval instant at 1m histogram_quantile(0.5, single_histogram)
expect no_info
{} 1.414213562373095
clear
@ -1328,3 +1330,46 @@ eval range from 0s to 60s step 15s last_over_time({__name__="http_request_durati
{__name__="http_request_duration_seconds", pod="nginx-1"} {{count:3 sum:14 buckets:[1 2]}}x4
clear
# Test native histogram quantile and fraction when the native histogram with exponential
# buckets has NaN observations.
load 1m
histogram_nan{case="100% NaNs"} {{schema:0 count:0 sum:0}} {{schema:0 count:3 sum:NaN}}
histogram_nan{case="20% NaNs"} {{schema:0 count:0 sum:0}} {{schema:0 count:15 sum:NaN buckets:[12]}}
eval instant at 1m histogram_quantile(1, histogram_nan)
expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan"
{case="100% NaNs"} NaN
{case="20% NaNs"} NaN
eval instant at 1m histogram_quantile(0.81, histogram_nan)
expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan"
{case="100% NaNs"} NaN
{case="20% NaNs"} NaN
eval instant at 1m histogram_quantile(0.8, histogram_nan{case="100% NaNs"})
expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan"
{case="100% NaNs"} NaN
eval instant at 1m histogram_quantile(0.8, histogram_nan{case="20% NaNs"})
expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is skewed higher for metric name "histogram_nan"
{case="20% NaNs"} 1
eval instant at 1m histogram_quantile(0.4, histogram_nan{case="100% NaNs"})
expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan"
{case="100% NaNs"} NaN
# histogram_quantile and histogram_fraction equivalence if quantile is not NaN
eval instant at 1m histogram_quantile(0.4, histogram_nan{case="20% NaNs"})
expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is skewed higher for metric name "histogram_nan"
{case="20% NaNs"} 0.7071067811865475
eval instant at 1m histogram_fraction(-Inf, 0.7071067811865475, histogram_nan)
expect info msg: PromQL info: input to histogram_fraction has NaN observations, which are excluded from all fractions for metric name "histogram_nan"
{case="100% NaNs"} 0.0
{case="20% NaNs"} 0.4
eval instant at 1m histogram_fraction(-Inf, +Inf, histogram_nan)
expect info msg: PromQL info: input to histogram_fraction has NaN observations, which are excluded from all fractions for metric name "histogram_nan"
{case="100% NaNs"} 0.0
{case="20% NaNs"} 0.8

@ -20,7 +20,9 @@ import (
"github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/promql/parser/posrange"
"github.com/prometheus/prometheus/util/almost"
"github.com/prometheus/prometheus/util/annotations"
)
// smallDeltaTolerance is the threshold for relative deltas between classic
@ -195,24 +197,34 @@ func BucketQuantile(q float64, buckets Buckets) (float64, bool, bool) {
//
// If q is NaN, NaN is returned.
//
// If the native histogram has NaN observations and the quantile falls into
// an existing bucket, then an additional info level annotation is returned
// informing the user about possible skew to higher values as NaNs are
// considered +Inf in this case.
//
// If the native histogram has NaN observations and the quantile falls above
// all existing buckets, then NaN is returned along with an additional info
// level annotation informing the user that this has happened.
//
// HistogramQuantile is for calculating the histogram_quantile() of native
// histograms. See also: BucketQuantile for classic histograms.
//
// HistogramQuantile is exported as it may be used by other PromQL engine
// implementations.
func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
func HistogramQuantile(q float64, h *histogram.FloatHistogram, metricName string, pos posrange.PositionRange) (float64, annotations.Annotations) {
if q < 0 {
return math.Inf(-1)
return math.Inf(-1), nil
}
if q > 1 {
return math.Inf(+1)
return math.Inf(+1), nil
}
if h.Count == 0 || math.IsNaN(q) {
return math.NaN()
return math.NaN(), nil
}
var (
annos annotations.Annotations
bucket histogram.Bucket[float64]
count float64
it histogram.BucketIterator[float64]
@ -255,12 +267,12 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
if bucket.Lower == math.Inf(-1) {
// first bucket, with lower bound -Inf
if bucket.Upper <= 0 {
return bucket.Upper
return bucket.Upper, annos
}
bucket.Lower = 0
} else if bucket.Upper == math.Inf(1) {
// last bucket, with upper bound +Inf
return bucket.Lower
return bucket.Lower, annos
}
}
// Due to numerical inaccuracies, we could end up with a higher count
@ -270,21 +282,43 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
}
// We could have hit the highest bucket without even reaching the rank
// (this should only happen if the histogram contains observations of
// the value NaN), in which case we simply return the upper limit of the
// highest explicit bucket.
// the value NaN, in which case Sum is also NaN), in which case we simply
// return NaN.
// See https://github.com/prometheus/prometheus/issues/16578
if count < rank {
return bucket.Upper
if math.IsNaN(h.Sum) {
return math.NaN(), annos.Add(annotations.NewNativeHistogramQuantileNaNResultInfo(metricName, pos))
}
// This should not happen. Either NaNs are in the +Inf bucket (NHCB) and
// then count >= rank, or Sum is set to NaN. Might be a precision issue
// or something wrong with the histogram, fall back to returning the
// upper bound of the highest explicit bucket.
return bucket.Upper, annos
}
// NaN observations increase h.Count but not the total number of
// observations in the buckets. Therefore, we have to use the forward
// iterator to find percentiles. We recognize histograms containing NaN
// observations by checking if their h.Sum is NaN.
// iterator to find percentiles.
if math.IsNaN(h.Sum) || q < 0.5 {
rank -= count - bucket.Count
} else {
rank = count - rank
}
// We recognize histograms containing NaN observations by checking if their
// h.Sum is NaN and total number of observations is higher than the h.Count.
// If the latter is lost in precision, then the skew isn't worth reporting
// anyway. If the number is not greater, then the histogram observed -Inf
// and +Inf at some point, which made the Sum == NaN.
if math.IsNaN(h.Sum) {
// Detect if h.Count is greater than sum of buckets.
for it.Next() {
bucket = it.At()
count += bucket.Count
}
if count < h.Count {
annos.Add(annotations.NewNativeHistogramQuantileNaNSkewInfo(metricName, pos))
}
}
// The fraction of how far we are into the current bucket.
fraction := rank / bucket.Count
@ -292,7 +326,7 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
// Return linear interpolation for custom buckets and for quantiles that
// end up in the zero bucket.
if h.UsesCustomBuckets() || (bucket.Lower <= 0 && bucket.Upper >= 0) {
return bucket.Lower + (bucket.Upper-bucket.Lower)*fraction
return bucket.Lower + (bucket.Upper-bucket.Lower)*fraction, annos
}
// For exponential buckets, we interpolate on a logarithmic scale. On a
@ -305,10 +339,10 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
logLower := math.Log2(math.Abs(bucket.Lower))
logUpper := math.Log2(math.Abs(bucket.Upper))
if bucket.Lower > 0 { // Positive bucket.
return math.Exp2(logLower + (logUpper-logLower)*fraction)
return math.Exp2(logLower + (logUpper-logLower)*fraction), annos
}
// Otherwise, we are in a negative bucket and have to mirror things.
return -math.Exp2(logUpper + (logLower-logUpper)*(1-fraction))
return -math.Exp2(logUpper + (logLower-logUpper)*(1-fraction)), annos
}
// HistogramFraction calculates the fraction of observations between the
@ -343,23 +377,29 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
//
// If lower >= upper and the histogram has at least 1 observation, zero is returned.
//
// If the histogram has NaN observations, these are not considered in any bucket
// thus histogram_fraction(-Inf, +Inf, v) might be less than 1.0. The function
// returns an info level annotation in this case.
//
// HistogramFraction is exported as it may be used by other PromQL engine
// implementations.
func HistogramFraction(lower, upper float64, h *histogram.FloatHistogram) float64 {
func HistogramFraction(lower, upper float64, h *histogram.FloatHistogram, metricName string, pos posrange.PositionRange) (float64, annotations.Annotations) {
if h.Count == 0 || math.IsNaN(lower) || math.IsNaN(upper) {
return math.NaN()
return math.NaN(), nil
}
if lower >= upper {
return 0
return 0, nil
}
var (
rank, lowerRank, upperRank float64
lowerSet, upperSet bool
it = h.AllBucketIterator()
count, rank, lowerRank, upperRank float64
lowerSet, upperSet bool
it = h.AllBucketIterator()
annos annotations.Annotations
)
for it.Next() {
b := it.At()
count += b.Count
zeroBucket := false
// interpolateLinearly is used for custom buckets to be
@ -438,14 +478,28 @@ func HistogramFraction(lower, upper float64, h *histogram.FloatHistogram) float6
}
rank += b.Count
}
if !lowerSet || lowerRank > h.Count {
lowerRank = h.Count
if math.IsNaN(h.Sum) {
// There might be NaN observations, so we need to adjust
// the count to only include non `NaN` observations.
for it.Next() {
b := it.At()
count += b.Count
}
if count < h.Count {
annos.Add(annotations.NewNativeHistogramFractionNaNsInfo(metricName, pos))
}
} else {
count = h.Count
}
if !lowerSet || lowerRank > count {
lowerRank = count
}
if !upperSet || upperRank > h.Count {
upperRank = h.Count
if !upperSet || upperRank > count {
upperRank = count
}
return (upperRank - lowerRank) / h.Count
return (upperRank - lowerRank) / h.Count, annos
}
// BucketFraction is a version of HistogramFraction for classic histograms.

@ -152,6 +152,9 @@ var (
IncompatibleTypesInBinOpInfo = fmt.Errorf("%w: incompatible sample types encountered for binary operator", PromQLInfo)
HistogramIgnoredInAggregationInfo = fmt.Errorf("%w: ignored histogram in", PromQLInfo)
HistogramIgnoredInMixedRangeInfo = fmt.Errorf("%w: ignored histograms in a range containing both floats and histograms for metric name", PromQLInfo)
NativeHistogramQuantileNaNResultInfo = fmt.Errorf("%w: input to histogram_quantile has NaN observations, result is NaN for metric name", PromQLInfo)
NativeHistogramQuantileNaNSkewInfo = fmt.Errorf("%w: input to histogram_quantile has NaN observations, result is skewed higher for metric name", PromQLInfo)
NativeHistogramFractionNaNsInfo = fmt.Errorf("%w: input to histogram_fraction has NaN observations, which are excluded from all fractions for metric name", PromQLInfo)
)
type annoErr struct {
@ -324,3 +327,24 @@ func NewIncompatibleBucketLayoutInBinOpWarning(operator string, pos posrange.Pos
Err: fmt.Errorf("%w %s", IncompatibleBucketLayoutInBinOpWarning, operator),
}
}
func NewNativeHistogramQuantileNaNResultInfo(metricName string, pos posrange.PositionRange) error {
return annoErr{
PositionRange: pos,
Err: fmt.Errorf("%w %q", NativeHistogramQuantileNaNResultInfo, metricName),
}
}
func NewNativeHistogramQuantileNaNSkewInfo(metricName string, pos posrange.PositionRange) error {
return annoErr{
PositionRange: pos,
Err: fmt.Errorf("%w %q", NativeHistogramQuantileNaNSkewInfo, metricName),
}
}
func NewNativeHistogramFractionNaNsInfo(metricName string, pos posrange.PositionRange) error {
return annoErr{
PositionRange: pos,
Err: fmt.Errorf("%w %q", NativeHistogramFractionNaNsInfo, metricName),
}
}

Loading…
Cancel
Save