loki/vendor/cloud.google.com/go/bigtable/metrics.go

/*
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package bigtable

import (
	"context"
	"errors"
	"fmt"
	"os"
	"time"

	"cloud.google.com/go/bigtable/internal"
	"github.com/google/uuid"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/metric"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
	"google.golang.org/api/option"
)

// String constants shared by the built-in metrics code in this file: the
// meter name, attribute (label) keys, instrument names and units.
const (
	// Name passed to MeterProvider.Meter when the built-in instruments are created.
	builtInMetricsMeterName = "bigtable.googleapis.com/internal/client/"

	// NOTE(review): the four constants below are not referenced in this part of
	// the file. Judging by their names they are a metric name prefix and the
	// metadata keys / value prefix used to read location and server timing
	// from a response — confirm against their call sites.
	metricsPrefix         = "bigtable/"
	locationMDKey         = "x-goog-ext-425905942-bin"
	serverTimingMDKey     = "server-timing"
	serverTimingValPrefix = "gfet4t7; dur="

	// Monitored resource labels
	// They are attached to every data point as ordinary attributes
	// (see newBuiltinMetricsTracerFactory and toOtelMetricAttrs).
	monitoredResLabelKeyProject  = "project_id"
	monitoredResLabelKeyInstance = "instance"
	monitoredResLabelKeyTable    = "table"
	monitoredResLabelKeyCluster  = "cluster"
	monitoredResLabelKeyZone     = "zone"

	// Metric labels
	// app_profile, client_name and client_uid are fixed per client; method is
	// set on every metric; status and streaming are added per metric according
	// to metricsDetails. 'tag' is not used in this part of the file.
	metricLabelKeyAppProfile         = "app_profile"
	metricLabelKeyMethod             = "method"
	metricLabelKeyStatus             = "status"
	metricLabelKeyTag                = "tag"
	metricLabelKeyStreamingOperation = "streaming"
	metricLabelKeyClientName         = "client_name"
	metricLabelKeyClientUID          = "client_uid"

	// Metric names
	// Used as instrument names in createInstruments and as keys of metricsDetails.
	metricNameOperationLatencies = "operation_latencies"
	metricNameAttemptLatencies   = "attempt_latencies"
	metricNameServerLatencies    = "server_latencies"
	metricNameRetryCount         = "retry_count"
	metricNameDebugTags          = "debug_tags"

	// Metric units
	// Passed to metric.WithUnit: "ms" for the latency histograms, "1" for the counters.
	metricUnitMS    = "ms"
	metricUnitCount = "1"
)

// These are effectively constant, but for testing purposes they are mutable
var (
	// Duration between two metric exports; used as the interval of the
	// periodic reader built in builtInMeterProviderOptions.
	defaultSamplePeriod = time.Minute

	// Prefix for error messages produced by the metrics code.
	metricsErrorPrefix = "bigtable-metrics: "

	// Value of the 'client_name' attribute attached to every metric.
	clientName = fmt.Sprintf("go-bigtable/%v", internal.Version)

	// Explicit bucket boundaries shared by the three latency histograms,
	// which are all created with unit "ms" in createInstruments.
	bucketBounds = []float64{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0, 10.0, 13.0, 16.0, 20.0, 25.0, 30.0, 40.0,
		50.0, 65.0, 80.0, 100.0, 130.0, 160.0, 200.0, 250.0, 300.0, 400.0, 500.0, 650.0,
		800.0, 1000.0, 2000.0, 5000.0, 10000.0, 20000.0, 50000.0, 100000.0, 200000.0,
		400000.0, 800000.0, 1600000.0, 3200000.0}

	// All the built-in metrics have the same attributes except 'tag', 'status' and 'streaming'.
	// These attributes need to be added to only a few of the metrics.
	//
	// recordedPerAttempt selects where the 'status' value comes from: the
	// current attempt when true, the whole operation when false.
	// There is no entry for 'debug_tags', so toOtelMetricAttrs returns an
	// error when asked for that metric.
	metricsDetails = map[string]metricInfo{
		metricNameOperationLatencies: {
			additionalAttrs: []string{
				metricLabelKeyStatus,
				metricLabelKeyStreamingOperation,
			},
			recordedPerAttempt: false,
		},
		metricNameAttemptLatencies: {
			additionalAttrs: []string{
				metricLabelKeyStatus,
				metricLabelKeyStreamingOperation,
			},
			recordedPerAttempt: true,
		},
		metricNameServerLatencies: {
			additionalAttrs: []string{
				metricLabelKeyStatus,
				metricLabelKeyStreamingOperation,
			},
			recordedPerAttempt: true,
		},
		metricNameRetryCount: {
			additionalAttrs: []string{
				metricLabelKeyStatus,
			},
			recordedPerAttempt: true,
		},
	}

	// Generates unique client ID in the format go-<random UUID>@<hostname>
	//
	// The hostname comes from os.Hostname; if that call fails the error is
	// returned and no ID is generated.
	// NOTE(review): a hostname lookup failure is surfaced to the caller rather
	// than replaced by a placeholder such as "localhost" — confirm this is the
	// intended behaviour.
	generateClientUID = func() (string, error) {
		hostname, err := os.Hostname()
		if err != nil {
			return "", err
		}
		return "go-" + uuid.NewString() + "@" + hostname, nil
	}

	// GCM exporter should use the same options as Bigtable client
	// createExporterOptions takes Bigtable client options and returns exporter options
	// (by default the options are passed through unchanged).
	// Overwritten in tests
	createExporterOptions = func(btOpts ...option.ClientOption) []option.ClientOption {
		return btOpts
	}
)

// metricInfo describes how the attribute list of one built-in metric is
// assembled by toOtelMetricAttrs.
//
// additionalAttrs lists the attribute keys added on top of the attributes
// common to every metric. recordedPerAttempt selects the source of the
// 'status' value: the current attempt when true, the operation when false.
type metricInfo struct {
	additionalAttrs    []string
	recordedPerAttempt bool
}

// builtinMetricsTracerFactory holds the per-client state of the built-in
// metrics: the client-wide attributes and the instruments. It is built by
// newBuiltinMetricsTracerFactory, and createBuiltinMetricsTracer copies its
// fields into a tracer for each operation.
type builtinMetricsTracerFactory struct {
	// Whether built-in metrics are on. It is set to true only when
	// newBuiltinMetricsTracerFactory creates the default meter provider.
	enabled bool

	// To be called on client close
	// It is a no-op unless the default meter provider was created.
	shutdown func()

	// attributes that are specific to a client instance and
	// do not change across different function calls on client
	clientAttributes []attribute.KeyValue

	// Instruments populated by createInstruments. They are left at their
	// zero value when the factory is returned before instruments are created
	// (for example for NoopMetricsProvider).
	operationLatencies metric.Float64Histogram
	serverLatencies    metric.Float64Histogram
	attemptLatencies   metric.Float64Histogram
	retryCount         metric.Int64Counter
	debugTags          metric.Int64Counter
}

// newBuiltinMetricsTracerFactory builds the per-client factory for built-in metrics.
//
// project, instance and appProfile, together with a freshly generated client
// UID and the client name, become the attributes shared by every metric
// recorded through the returned factory.
//
// metricsProvider selects the behaviour:
//   - nil: a default meter provider is created from opts, the instruments are
//     created and the factory is enabled;
//   - NoopMetricsProvider: a disabled factory is returned;
//   - anything else: a disabled factory and an error are returned.
//
// If the client UID cannot be generated the result is (nil, err). On every
// other error a disabled factory with a no-op shutdown is returned alongside
// the error.
func newBuiltinMetricsTracerFactory(ctx context.Context, project, instance, appProfile string, metricsProvider MetricsProvider, opts ...option.ClientOption) (*builtinMetricsTracerFactory, error) {
	clientUID, err := generateClientUID()
	if err != nil {
		return nil, err
	}

	// Start out disabled with a no-op shutdown; both are switched only after
	// the default provider and all instruments have been set up successfully.
	tracerFactory := &builtinMetricsTracerFactory{
		enabled: false,
		clientAttributes: []attribute.KeyValue{
			attribute.String(monitoredResLabelKeyProject, project),
			attribute.String(monitoredResLabelKeyInstance, instance),
			attribute.String(metricLabelKeyAppProfile, appProfile),
			attribute.String(metricLabelKeyClientUID, clientUID),
			attribute.String(metricLabelKeyClientName, clientName),
		},
		shutdown: func() {},
	}

	if metricsProvider != nil {
		switch metricsProvider.(type) {
		case NoopMetricsProvider:
			return tracerFactory, nil
		default:
			return tracerFactory, errors.New("unknown MetricsProvider type")
		}
	}

	// Create default meter provider
	mpOptions, err := builtInMeterProviderOptions(project, opts...)
	if err != nil {
		return tracerFactory, err
	}
	meterProvider := sdkmetric.NewMeterProvider(mpOptions...)

	// Create meter and instruments
	meter := meterProvider.Meter(builtInMetricsMeterName, metric.WithInstrumentationVersion(internal.Version))
	if err := tracerFactory.createInstruments(meter); err != nil {
		// The provider already owns a periodic reader and an exporter. Release
		// them here, because the factory is handed back disabled and its
		// shutdown hook stays a no-op. The instrument error is the one worth
		// reporting, so a failure of this best-effort cleanup is dropped.
		_ = meterProvider.Shutdown(ctx)
		return tracerFactory, err
	}

	tracerFactory.enabled = true
	// shutdown has no way to report an error, so the result is discarded.
	// NOTE(review): the hook reuses the ctx given at construction time; if that
	// ctx is cancelled before the client is closed, Shutdown may return without
	// flushing — confirm callers pass a ctx that outlives the client.
	tracerFactory.shutdown = func() { _ = meterProvider.Shutdown(ctx) }
	return tracerFactory, nil
}

// builtInMeterProviderOptions returns the options for the default meter
// provider: a single periodic reader that exports through a monitoring
// exporter every defaultSamplePeriod.
//
// The exporter is created for project with the client options opts after they
// have been passed through createExporterOptions. An error from creating the
// exporter is returned unchanged together with a nil option list.
func builtInMeterProviderOptions(project string, opts ...option.ClientOption) ([]sdkmetric.Option, error) {
	exporter, err := newMonitoringExporter(context.Background(), project, createExporterOptions(opts...)...)
	if err != nil {
		return nil, err
	}

	reader := sdkmetric.NewPeriodicReader(exporter, sdkmetric.WithInterval(defaultSamplePeriod))
	return []sdkmetric.Option{sdkmetric.WithReader(reader)}, nil
}

// createInstruments creates the five built-in instruments on meter and stores
// them on the factory, in this order: operation_latencies, attempt_latencies,
// server_latencies, retry_count, debug_tags.
//
// It stops at the first instrument that cannot be created and returns that
// error; instruments later in the list are then left untouched.
func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) error {
	// The latency histograms differ only in name and description: they share
	// the millisecond unit and the explicit bucket boundaries.
	latencyHistogram := func(name, description string) (metric.Float64Histogram, error) {
		return meter.Float64Histogram(
			name,
			metric.WithDescription(description),
			metric.WithUnit(metricUnitMS),
			metric.WithExplicitBucketBoundaries(bucketBounds...),
		)
	}
	// Likewise both counters share the count unit.
	counter := func(name, description string) (metric.Int64Counter, error) {
		return meter.Int64Counter(
			name,
			metric.WithDescription(description),
			metric.WithUnit(metricUnitCount),
		)
	}

	var err error

	if tf.operationLatencies, err = latencyHistogram(
		metricNameOperationLatencies,
		"Total time until final operation success or failure, including retries and backoff.",
	); err != nil {
		return err
	}

	if tf.attemptLatencies, err = latencyHistogram(
		metricNameAttemptLatencies,
		"Client observed latency per RPC attempt.",
	); err != nil {
		return err
	}

	if tf.serverLatencies, err = latencyHistogram(
		metricNameServerLatencies,
		"The latency measured from the moment that the RPC entered the Google data center until the RPC was completed.",
	); err != nil {
		return err
	}

	if tf.retryCount, err = counter(
		metricNameRetryCount,
		"The number of additional RPCs sent after the initial attempt.",
	); err != nil {
		return err
	}

	tf.debugTags, err = counter(
		metricNameDebugTags,
		"A counter of internal client events used for debugging.",
	)
	return err
}

// builtinMetricsTracer is created once per operation (see
// createBuiltinMetricsTracer).
// It carries the metric instruments, the attribute values and the other data
// needed to obtain and record the built-in metrics for that operation.
type builtinMetricsTracer struct {
	// ctx is the context handed to createBuiltinMetricsTracer;
	// builtInEnabled is a copy of the factory's enabled flag.
	ctx            context.Context
	builtInEnabled bool

	// attributes that are specific to a client instance and
	// do not change across different operations on client
	clientAttributes []attribute.KeyValue

	// Instruments copied from the factory.
	instrumentOperationLatencies metric.Float64Histogram
	instrumentServerLatencies    metric.Float64Histogram
	instrumentAttemptLatencies   metric.Float64Histogram
	instrumentRetryCount         metric.Int64Counter
	instrumentDebugTags          metric.Int64Counter

	// Values reported as the 'table', 'method' and 'streaming' attributes.
	// method is filled in by setMethod.
	tableName   string
	method      string
	isStreaming bool

	// State of the operation being traced, including its current attempt.
	currOp opTracer
}

// setMethod records the method of the traced operation, qualified with the
// "Bigtable." prefix; the stored value is what toOtelMetricAttrs reports as
// the 'method' attribute.
func (mt *builtinMetricsTracer) setMethod(m string) {
	const servicePrefix = "Bigtable."
	mt.method = servicePrefix + m
}

// opTracer holds the state needed to record metrics for a whole operation,
// retries included.
// An operation is the logical unit behind a single method invocation on the
// client; completing it may take several attempts (RPCs) and backoff.
type opTracer struct {
	// Counter advanced by incrementAttemptCount.
	attemptCount int64

	// Set through setStartTime.
	startTime time.Time

	// gRPC status code of last completed attempt
	status string

	// State of the attempt currently associated with the operation.
	currAttempt attemptTracer
}

// setStartTime stores t as the operation's start time.
func (o *opTracer) setStartTime(t time.Time) { o.startTime = t }

// setStatus stores status as the operation's status.
func (o *opTracer) setStatus(status string) { o.status = status }

// incrementAttemptCount adds one to the operation's attempt counter.
func (o *opTracer) incrementAttemptCount() { o.attemptCount++ }

// attemptTracer holds the state needed to record metrics for one attempt of
// an operation. An attempt corresponds to a single RPC attempt.
type attemptTracer struct {
	// Set through setStartTime.
	startTime time.Time
	// Cluster and zone recorded for this attempt.
	clusterID string
	zoneID    string

	// gRPC status code
	status string

	// Server latency in ms
	serverLatency float64

	// Error seen while getting server latency from headers
	serverLatencyErr error
}

// setStartTime stores t as the attempt's start time.
func (a *attemptTracer) setStartTime(t time.Time) { a.startTime = t }

// setClusterID stores the cluster recorded for the attempt.
func (a *attemptTracer) setClusterID(clusterID string) { a.clusterID = clusterID }

// setZoneID stores the zone recorded for the attempt.
func (a *attemptTracer) setZoneID(zoneID string) { a.zoneID = zoneID }

// setStatus stores the attempt's status.
func (a *attemptTracer) setStatus(status string) { a.status = status }

// setServerLatency stores the attempt's server latency.
func (a *attemptTracer) setServerLatency(latency float64) { a.serverLatency = latency }

// setServerLatencyErr stores the error met while obtaining the server
// latency; a nil err clears a previously stored one.
func (a *attemptTracer) setServerLatencyErr(err error) { a.serverLatencyErr = err }

// createBuiltinMetricsTracer returns the tracer for one operation on
// tableName. The tracer receives ctx, the factory's enabled flag, client
// attributes and instruments, and the isStreaming flag. Its method is left
// empty; it is set separately through setMethod.
func (tf *builtinMetricsTracerFactory) createBuiltinMetricsTracer(ctx context.Context, tableName string, isStreaming bool) builtinMetricsTracer {
	tracer := builtinMetricsTracer{
		ctx:            ctx,
		builtInEnabled: tf.enabled,

		clientAttributes: tf.clientAttributes,

		instrumentOperationLatencies: tf.operationLatencies,
		instrumentServerLatencies:    tf.serverLatencies,
		instrumentAttemptLatencies:   tf.attemptLatencies,
		instrumentRetryCount:         tf.retryCount,
		instrumentDebugTags:          tf.debugTags,

		tableName:   tableName,
		isStreaming: isStreaming,
	}

	// The operation starts now, but no attempt has started yet: only the
	// operation's start time is set and currAttempt keeps its zero value.
	tracer.currOp.startTime = time.Now()

	return tracer
}

// toOtelMetricAttrs builds the OpenTelemetry attribute list for one data
// point of metricName from the values captured during the operation / attempt.
//
// The list holds, in order: method, table, cluster, zone, the client
// attributes, and then the extra attributes that metricsDetails lists for the
// metric ('status' and/or 'streaming').
//
// An error is returned when metricName has no entry in metricsDetails or when
// that entry names an attribute this function does not know. In both cases
// the attributes collected so far are returned along with the error.
func (mt *builtinMetricsTracer) toOtelMetricAttrs(metricName string) ([]attribute.KeyValue, error) {
	lastAttempt := &mt.currOp.currAttempt

	// Attributes common to all metrics.
	// The table, cluster and zone resource labels travel as ordinary otel
	// attributes: they are used to build the monitored resource, and the
	// exporter does not add them to the Google Cloud Monitoring metric labels.
	// Cluster and zone always come from the last attempt, whether the metric
	// is attempt specific or operation specific.
	attrs := append(
		[]attribute.KeyValue{
			attribute.String(metricLabelKeyMethod, mt.method),
			attribute.String(monitoredResLabelKeyTable, mt.tableName),
			attribute.String(monitoredResLabelKeyCluster, lastAttempt.clusterID),
			attribute.String(monitoredResLabelKeyZone, lastAttempt.zoneID),
		},
		mt.clientAttributes...,
	)

	info, known := metricsDetails[metricName]
	if !known {
		return attrs, fmt.Errorf("unable to create attributes list for unknown metric: %v", metricName)
	}

	// Per-attempt metrics report the attempt's status, the others the
	// operation's status.
	var status string
	if info.recordedPerAttempt {
		status = lastAttempt.status
	} else {
		status = mt.currOp.status
	}

	// Metric specific attributes.
	for _, key := range info.additionalAttrs {
		if key == metricLabelKeyStatus {
			attrs = append(attrs, attribute.String(metricLabelKeyStatus, status))
			continue
		}
		if key == metricLabelKeyStreamingOperation {
			attrs = append(attrs, attribute.Bool(metricLabelKeyStreamingOperation, mt.isStreaming))
			continue
		}
		return attrs, fmt.Errorf("unknown additional attribute: %v", key)
	}

	return attrs, nil
}