mirror of https://github.com/grafana/loki
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
436 lines
13 KiB
436 lines
13 KiB
/*
|
|
Copyright 2024 Google LLC
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package bigtable
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"time"
|
|
|
|
"cloud.google.com/go/bigtable/internal"
|
|
"github.com/google/uuid"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/metric"
|
|
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
|
|
"google.golang.org/api/option"
|
|
)
|
|
|
|
// Meter name, metric name prefix and header-related keys.
// NOTE(review): "MD" presumably stands for gRPC metadata — confirm at the
// call sites that read these keys.
const (
	builtInMetricsMeterName = "bigtable.googleapis.com/internal/client/"

	metricsPrefix         = "bigtable/"
	locationMDKey         = "x-goog-ext-425905942-bin"
	serverTimingMDKey     = "server-timing"
	serverTimingValPrefix = "gfet4t7; dur="
)

// Label keys of the monitored resource.
const (
	monitoredResLabelKeyProject  = "project_id"
	monitoredResLabelKeyInstance = "instance"
	monitoredResLabelKeyTable    = "table"
	monitoredResLabelKeyCluster  = "cluster"
	monitoredResLabelKeyZone     = "zone"
)

// Label keys attached to the metrics themselves.
const (
	metricLabelKeyAppProfile         = "app_profile"
	metricLabelKeyMethod             = "method"
	metricLabelKeyStatus             = "status"
	metricLabelKeyTag                = "tag"
	metricLabelKeyStreamingOperation = "streaming"
	metricLabelKeyClientName         = "client_name"
	metricLabelKeyClientUID          = "client_uid"
)

// Names of the built-in metrics.
const (
	metricNameOperationLatencies = "operation_latencies"
	metricNameAttemptLatencies   = "attempt_latencies"
	metricNameServerLatencies    = "server_latencies"
	metricNameRetryCount         = "retry_count"
	metricNameDebugTags          = "debug_tags"
)

// Units reported for the built-in metrics.
const (
	metricUnitMS    = "ms"
	metricUnitCount = "1"
)
|
|
|
|
// These are effectively constant, but for testing purposes they are mutable
|
|
var (
|
|
// duration between two metric exports
|
|
defaultSamplePeriod = time.Minute
|
|
|
|
metricsErrorPrefix = "bigtable-metrics: "
|
|
|
|
clientName = fmt.Sprintf("go-bigtable/%v", internal.Version)
|
|
|
|
bucketBounds = []float64{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0, 10.0, 13.0, 16.0, 20.0, 25.0, 30.0, 40.0,
|
|
50.0, 65.0, 80.0, 100.0, 130.0, 160.0, 200.0, 250.0, 300.0, 400.0, 500.0, 650.0,
|
|
800.0, 1000.0, 2000.0, 5000.0, 10000.0, 20000.0, 50000.0, 100000.0, 200000.0,
|
|
400000.0, 800000.0, 1600000.0, 3200000.0}
|
|
|
|
// All the built-in metrics have same attributes except 'tag', 'status' and 'streaming'
|
|
// These attributes need to be added to only few of the metrics
|
|
metricsDetails = map[string]metricInfo{
|
|
metricNameOperationLatencies: {
|
|
additionalAttrs: []string{
|
|
metricLabelKeyStatus,
|
|
metricLabelKeyStreamingOperation,
|
|
},
|
|
recordedPerAttempt: false,
|
|
},
|
|
metricNameAttemptLatencies: {
|
|
additionalAttrs: []string{
|
|
metricLabelKeyStatus,
|
|
metricLabelKeyStreamingOperation,
|
|
},
|
|
recordedPerAttempt: true,
|
|
},
|
|
metricNameServerLatencies: {
|
|
additionalAttrs: []string{
|
|
metricLabelKeyStatus,
|
|
metricLabelKeyStreamingOperation,
|
|
},
|
|
recordedPerAttempt: true,
|
|
},
|
|
metricNameRetryCount: {
|
|
additionalAttrs: []string{
|
|
metricLabelKeyStatus,
|
|
},
|
|
recordedPerAttempt: true,
|
|
},
|
|
}
|
|
|
|
// Generates unique client ID in the format go-<random UUID>@<hostname>
|
|
generateClientUID = func() (string, error) {
|
|
hostname := "localhost"
|
|
hostname, err := os.Hostname()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return "go-" + uuid.NewString() + "@" + hostname, nil
|
|
}
|
|
|
|
// GCM exporter should use the same options as Bigtable client
|
|
// createExporterOptions takes Bigtable client options and returns exporter options
|
|
// Overwritten in tests
|
|
createExporterOptions = func(btOpts ...option.ClientOption) []option.ClientOption {
|
|
return btOpts
|
|
}
|
|
)
|
|
|
|
// metricInfo describes how a single built-in metric differs from the others.
type metricInfo struct {
	// additionalAttrs lists the label keys this metric carries on top of the
	// attributes common to all metrics.
	additionalAttrs []string

	// recordedPerAttempt is true when the metric is recorded for every
	// attempt rather than once per operation.
	recordedPerAttempt bool
}
|
|
|
|
type builtinMetricsTracerFactory struct {
|
|
enabled bool
|
|
|
|
// To be called on client close
|
|
shutdown func()
|
|
|
|
// attributes that are specific to a client instance and
|
|
// do not change across different function calls on client
|
|
clientAttributes []attribute.KeyValue
|
|
|
|
operationLatencies metric.Float64Histogram
|
|
serverLatencies metric.Float64Histogram
|
|
attemptLatencies metric.Float64Histogram
|
|
retryCount metric.Int64Counter
|
|
debugTags metric.Int64Counter
|
|
}
|
|
|
|
func newBuiltinMetricsTracerFactory(ctx context.Context, project, instance, appProfile string, metricsProvider MetricsProvider, opts ...option.ClientOption) (*builtinMetricsTracerFactory, error) {
|
|
clientUID, err := generateClientUID()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
tracerFactory := &builtinMetricsTracerFactory{
|
|
enabled: false,
|
|
clientAttributes: []attribute.KeyValue{
|
|
attribute.String(monitoredResLabelKeyProject, project),
|
|
attribute.String(monitoredResLabelKeyInstance, instance),
|
|
attribute.String(metricLabelKeyAppProfile, appProfile),
|
|
attribute.String(metricLabelKeyClientUID, clientUID),
|
|
attribute.String(metricLabelKeyClientName, clientName),
|
|
},
|
|
shutdown: func() {},
|
|
}
|
|
|
|
var meterProvider *sdkmetric.MeterProvider
|
|
if metricsProvider == nil {
|
|
// Create default meter provider
|
|
mpOptions, err := builtInMeterProviderOptions(project, opts...)
|
|
if err != nil {
|
|
return tracerFactory, err
|
|
}
|
|
meterProvider = sdkmetric.NewMeterProvider(mpOptions...)
|
|
|
|
tracerFactory.enabled = true
|
|
tracerFactory.shutdown = func() { meterProvider.Shutdown(ctx) }
|
|
} else {
|
|
switch metricsProvider.(type) {
|
|
case NoopMetricsProvider:
|
|
tracerFactory.enabled = false
|
|
return tracerFactory, nil
|
|
default:
|
|
tracerFactory.enabled = false
|
|
return tracerFactory, errors.New("unknown MetricsProvider type")
|
|
}
|
|
}
|
|
|
|
// Create meter and instruments
|
|
meter := meterProvider.Meter(builtInMetricsMeterName, metric.WithInstrumentationVersion(internal.Version))
|
|
err = tracerFactory.createInstruments(meter)
|
|
return tracerFactory, err
|
|
}
|
|
|
|
func builtInMeterProviderOptions(project string, opts ...option.ClientOption) ([]sdkmetric.Option, error) {
|
|
allOpts := createExporterOptions(opts...)
|
|
defaultExporter, err := newMonitoringExporter(context.Background(), project, allOpts...)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return []sdkmetric.Option{sdkmetric.WithReader(
|
|
sdkmetric.NewPeriodicReader(
|
|
defaultExporter,
|
|
sdkmetric.WithInterval(defaultSamplePeriod),
|
|
),
|
|
)}, nil
|
|
}
|
|
|
|
func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) error {
|
|
var err error
|
|
|
|
// Create operation_latencies
|
|
tf.operationLatencies, err = meter.Float64Histogram(
|
|
metricNameOperationLatencies,
|
|
metric.WithDescription("Total time until final operation success or failure, including retries and backoff."),
|
|
metric.WithUnit(metricUnitMS),
|
|
metric.WithExplicitBucketBoundaries(bucketBounds...),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Create attempt_latencies
|
|
tf.attemptLatencies, err = meter.Float64Histogram(
|
|
metricNameAttemptLatencies,
|
|
metric.WithDescription("Client observed latency per RPC attempt."),
|
|
metric.WithUnit(metricUnitMS),
|
|
metric.WithExplicitBucketBoundaries(bucketBounds...),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Create server_latencies
|
|
tf.serverLatencies, err = meter.Float64Histogram(
|
|
metricNameServerLatencies,
|
|
metric.WithDescription("The latency measured from the moment that the RPC entered the Google data center until the RPC was completed."),
|
|
metric.WithUnit(metricUnitMS),
|
|
metric.WithExplicitBucketBoundaries(bucketBounds...),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Create retry_count
|
|
tf.retryCount, err = meter.Int64Counter(
|
|
metricNameRetryCount,
|
|
metric.WithDescription("The number of additional RPCs sent after the initial attempt."),
|
|
metric.WithUnit(metricUnitCount),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Create debug_tags
|
|
tf.debugTags, err = meter.Int64Counter(
|
|
metricNameDebugTags,
|
|
metric.WithDescription("A counter of internal client events used for debugging."),
|
|
metric.WithUnit(metricUnitCount),
|
|
)
|
|
return err
|
|
}
|
|
|
|
// builtinMetricsTracer is created one per operation
|
|
// It is used to store metric instruments, attribute values
|
|
// and other data required to obtain and record them
|
|
type builtinMetricsTracer struct {
|
|
ctx context.Context
|
|
builtInEnabled bool
|
|
|
|
// attributes that are specific to a client instance and
|
|
// do not change across different operations on client
|
|
clientAttributes []attribute.KeyValue
|
|
|
|
instrumentOperationLatencies metric.Float64Histogram
|
|
instrumentServerLatencies metric.Float64Histogram
|
|
instrumentAttemptLatencies metric.Float64Histogram
|
|
instrumentRetryCount metric.Int64Counter
|
|
instrumentDebugTags metric.Int64Counter
|
|
|
|
tableName string
|
|
method string
|
|
isStreaming bool
|
|
|
|
currOp opTracer
|
|
}
|
|
|
|
func (b *builtinMetricsTracer) setMethod(m string) {
|
|
b.method = "Bigtable." + m
|
|
}
|
|
|
|
// opTracer is used to record metrics for the entire operation, including retries.
|
|
// Operation is a logical unit that represents a single method invocation on client.
|
|
// The method might require multiple attempts/rpcs and backoff logic to complete
|
|
type opTracer struct {
|
|
attemptCount int64
|
|
|
|
startTime time.Time
|
|
|
|
// gRPC status code of last completed attempt
|
|
status string
|
|
|
|
currAttempt attemptTracer
|
|
}
|
|
|
|
func (o *opTracer) setStartTime(t time.Time) {
|
|
o.startTime = t
|
|
}
|
|
|
|
func (o *opTracer) setStatus(status string) {
|
|
o.status = status
|
|
}
|
|
|
|
func (o *opTracer) incrementAttemptCount() {
|
|
o.attemptCount++
|
|
}
|
|
|
|
// attemptTracer records metrics for one individual attempt of an operation.
// An attempt corresponds to an attempt of an RPC.
type attemptTracer struct {
	// startTime is the time recorded as the start of the attempt.
	startTime time.Time

	// clusterID and zoneID are set through setClusterID and setZoneID.
	clusterID string
	zoneID    string

	// status is the gRPC status code of the attempt.
	status string

	// serverLatency is the server latency in ms.
	serverLatency float64

	// serverLatencyErr is the error seen while getting the server latency
	// from headers.
	serverLatencyErr error
}

// setStartTime records t as the start time of the attempt.
func (a *attemptTracer) setStartTime(t time.Time) {
	a.startTime = t
}

// setClusterID records the cluster ID for the attempt.
func (a *attemptTracer) setClusterID(clusterID string) {
	a.clusterID = clusterID
}

// setZoneID records the zone ID for the attempt.
func (a *attemptTracer) setZoneID(zoneID string) {
	a.zoneID = zoneID
}

// setStatus records the status of the attempt.
func (a *attemptTracer) setStatus(status string) {
	a.status = status
}

// setServerLatency records the server latency of the attempt.
func (a *attemptTracer) setServerLatency(latency float64) {
	a.serverLatency = latency
}

// setServerLatencyErr records the error seen while obtaining the server latency.
func (a *attemptTracer) setServerLatencyErr(err error) {
	a.serverLatencyErr = err
}
|
|
|
|
func (tf *builtinMetricsTracerFactory) createBuiltinMetricsTracer(ctx context.Context, tableName string, isStreaming bool) builtinMetricsTracer {
|
|
// Operation has started but not the attempt.
|
|
// So, create only operation tracer and not attempt tracer
|
|
currOpTracer := opTracer{}
|
|
currOpTracer.setStartTime(time.Now())
|
|
|
|
return builtinMetricsTracer{
|
|
ctx: ctx,
|
|
builtInEnabled: tf.enabled,
|
|
|
|
currOp: currOpTracer,
|
|
clientAttributes: tf.clientAttributes,
|
|
|
|
instrumentOperationLatencies: tf.operationLatencies,
|
|
instrumentServerLatencies: tf.serverLatencies,
|
|
instrumentAttemptLatencies: tf.attemptLatencies,
|
|
instrumentRetryCount: tf.retryCount,
|
|
instrumentDebugTags: tf.debugTags,
|
|
|
|
tableName: tableName,
|
|
isStreaming: isStreaming,
|
|
}
|
|
}
|
|
|
|
// toOtelMetricAttrs:
|
|
// - converts metric attributes values captured throughout the operation / attempt
|
|
// to OpenTelemetry attributes format,
|
|
// - combines these with common client attributes and returns
|
|
func (mt *builtinMetricsTracer) toOtelMetricAttrs(metricName string) ([]attribute.KeyValue, error) {
|
|
// Create attribute key value pairs for attributes common to all metricss
|
|
attrKeyValues := []attribute.KeyValue{
|
|
attribute.String(metricLabelKeyMethod, mt.method),
|
|
|
|
// Add resource labels to otel metric labels.
|
|
// These will be used for creating the monitored resource but exporter
|
|
// will not add them to Google Cloud Monitoring metric labels
|
|
attribute.String(monitoredResLabelKeyTable, mt.tableName),
|
|
|
|
// Irrespective of whether metric is attempt specific or operation specific,
|
|
// use last attempt's cluster and zone
|
|
attribute.String(monitoredResLabelKeyCluster, mt.currOp.currAttempt.clusterID),
|
|
attribute.String(monitoredResLabelKeyZone, mt.currOp.currAttempt.zoneID),
|
|
}
|
|
attrKeyValues = append(attrKeyValues, mt.clientAttributes...)
|
|
|
|
// Get metric details
|
|
mDetails, found := metricsDetails[metricName]
|
|
if !found {
|
|
return attrKeyValues, fmt.Errorf("unable to create attributes list for unknown metric: %v", metricName)
|
|
}
|
|
|
|
status := mt.currOp.status
|
|
if mDetails.recordedPerAttempt {
|
|
status = mt.currOp.currAttempt.status
|
|
}
|
|
|
|
// Add additional attributes to metrics
|
|
for _, attrKey := range mDetails.additionalAttrs {
|
|
switch attrKey {
|
|
case metricLabelKeyStatus:
|
|
attrKeyValues = append(attrKeyValues, attribute.String(metricLabelKeyStatus, status))
|
|
case metricLabelKeyStreamingOperation:
|
|
attrKeyValues = append(attrKeyValues, attribute.Bool(metricLabelKeyStreamingOperation, mt.isStreaming))
|
|
default:
|
|
return attrKeyValues, fmt.Errorf("unknown additional attribute: %v", attrKey)
|
|
}
|
|
}
|
|
|
|
return attrKeyValues, nil
|
|
}
|
|
|