mirror of https://github.com/grafana/loki
chore(dataobj): Create initial dataobj builder (#16011)
Co-authored-by: Cyril Tovena <cyril.tovena@gmail.com>
Co-authored-by: Robert Fratto <robertfratto@gmail.com>
parent 4b44b59ee7
commit ca4c025ad0
@@ -0,0 +1,32 @@
package consumer

import (
    "errors"
    "flag"

    "github.com/grafana/loki/v3/pkg/dataobj"
)

type Config struct {
    dataobj.BuilderConfig
    TenantID string `yaml:"tenant_id"`
    // StorageBucketPrefix is the prefix to use for the storage bucket.
    StorageBucketPrefix string `yaml:"storage_bucket_prefix"`
}

func (cfg *Config) Validate() error {
    if cfg.TenantID == "" {
        return errors.New("tenantID is required")
    }
    return cfg.BuilderConfig.Validate()
}

func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
    cfg.RegisterFlagsWithPrefix("dataobj-consumer.", f)
}

func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
    cfg.BuilderConfig.RegisterFlagsWithPrefix(prefix, f)
    f.StringVar(&cfg.TenantID, prefix+"tenant-id", "fake", "The tenant ID to use for the data object builder.")
    f.StringVar(&cfg.StorageBucketPrefix, prefix+"storage-bucket-prefix", "dataobj/", "The prefix to use for the storage bucket.")
}
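
A minimal sketch of how this Config might be wired up and validated; the main package, the flag set name, and the import path are assumptions for illustration, not part of this change:

package main

import (
    "flag"
    "fmt"
    "os"

    // Assumed import path for the consumer package above.
    "github.com/grafana/loki/v3/pkg/dataobj/consumer"
)

func main() {
    var cfg consumer.Config
    fs := flag.NewFlagSet("example", flag.ExitOnError)
    // Registers -dataobj-consumer.tenant-id and -dataobj-consumer.storage-bucket-prefix,
    // plus the flags of the embedded BuilderConfig.
    cfg.RegisterFlags(fs)
    _ = fs.Parse(os.Args[1:])

    if err := cfg.Validate(); err != nil {
        fmt.Fprintln(os.Stderr, "invalid config:", err)
        os.Exit(1)
    }
}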
@@ -0,0 +1,117 @@
package consumer

import (
    "time"

    "go.uber.org/atomic"

    "github.com/prometheus/client_golang/prometheus"
)

type partitionOffsetMetrics struct {
    currentOffset prometheus.GaugeFunc
    lastOffset    atomic.Int64

    // Error counters
    flushFailures  prometheus.Counter
    commitFailures prometheus.Counter
    appendFailures prometheus.Counter

    // Processing delay histogram
    processingDelay prometheus.Histogram
}

func newPartitionOffsetMetrics() *partitionOffsetMetrics {
    p := &partitionOffsetMetrics{
        flushFailures: prometheus.NewCounter(prometheus.CounterOpts{
            Name: "loki_dataobj_consumer_flush_failures_total",
            Help: "Total number of flush failures",
        }),
        commitFailures: prometheus.NewCounter(prometheus.CounterOpts{
            Name: "loki_dataobj_consumer_commit_failures_total",
            Help: "Total number of commit failures",
        }),
        appendFailures: prometheus.NewCounter(prometheus.CounterOpts{
            Name: "loki_dataobj_consumer_append_failures_total",
            Help: "Total number of append failures",
        }),
        processingDelay: prometheus.NewHistogram(prometheus.HistogramOpts{
            Name:                            "loki_dataobj_consumer_processing_delay_seconds",
            Help:                            "Time difference between record timestamp and processing time in seconds",
            Buckets:                         prometheus.DefBuckets,
            NativeHistogramBucketFactor:     1.1,
            NativeHistogramMaxBucketNumber:  100,
            NativeHistogramMinResetDuration: 0,
        }),
    }

    p.currentOffset = prometheus.NewGaugeFunc(
        prometheus.GaugeOpts{
            Name: "loki_dataobj_consumer_current_offset",
            Help: "The last consumed offset for this partition",
        },
        p.getCurrentOffset,
    )

    return p
}

func (p *partitionOffsetMetrics) getCurrentOffset() float64 {
    return float64(p.lastOffset.Load())
}

func (p *partitionOffsetMetrics) register(reg prometheus.Registerer) error {
    collectors := []prometheus.Collector{
        p.currentOffset,
        p.flushFailures,
        p.commitFailures,
        p.appendFailures,
        p.processingDelay,
    }

    for _, collector := range collectors {
        if err := reg.Register(collector); err != nil {
            if _, ok := err.(prometheus.AlreadyRegisteredError); !ok {
                return err
            }
        }
    }
    return nil
}

func (p *partitionOffsetMetrics) unregister(reg prometheus.Registerer) {
    collectors := []prometheus.Collector{
        p.currentOffset,
        p.flushFailures,
        p.commitFailures,
        p.appendFailures,
        p.processingDelay,
    }

    for _, collector := range collectors {
        reg.Unregister(collector)
    }
}

func (p *partitionOffsetMetrics) updateOffset(offset int64) {
    p.lastOffset.Store(offset)
}

func (p *partitionOffsetMetrics) incFlushFailures() {
    p.flushFailures.Inc()
}

func (p *partitionOffsetMetrics) incCommitFailures() {
    p.commitFailures.Inc()
}

func (p *partitionOffsetMetrics) incAppendFailures() {
    p.appendFailures.Inc()
}
func (p *partitionOffsetMetrics) observeProcessingDelay(recordTimestamp time.Time) {
    // Only observe when the record carries a valid (non-zero) timestamp.
    if !recordTimestamp.IsZero() {
        p.processingDelay.Observe(time.Since(recordTimestamp).Seconds())
    }
}
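
The register helper above tolerates double registration by type-asserting prometheus.AlreadyRegisteredError. A wrap-tolerant equivalent using errors.As, shown here as a sketch with a hypothetical helper name, not part of this change:

package consumer

import (
    "errors"

    "github.com/prometheus/client_golang/prometheus"
)

// registerTolerant registers collectors, ignoring duplicate registrations.
// Hypothetical helper; the code above inlines the same check per collector.
func registerTolerant(reg prometheus.Registerer, collectors ...prometheus.Collector) error {
    for _, c := range collectors {
        var already prometheus.AlreadyRegisteredError
        if err := reg.Register(c); err != nil && !errors.As(err, &already) {
            return err
        }
    }
    return nil
}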
@@ -0,0 +1,201 @@
package consumer

import (
    "bytes"
    "context"
    "strconv"
    "sync"
    "time"

    "github.com/go-kit/log"
    "github.com/go-kit/log/level"
    "github.com/grafana/dskit/backoff"
    "github.com/prometheus/client_golang/prometheus"
    "github.com/thanos-io/objstore"
    "github.com/twmb/franz-go/pkg/kgo"

    "github.com/grafana/loki/v3/pkg/dataobj"
    "github.com/grafana/loki/v3/pkg/dataobj/metastore"
    "github.com/grafana/loki/v3/pkg/kafka"
)

type partitionProcessor struct {
    // Kafka client and topic/partition info
    client    *kgo.Client
    topic     string
    partition int32
    tenantID  []byte
    // Processing pipeline
    records chan *kgo.Record
    builder *dataobj.Builder
    decoder *kafka.Decoder

    // Builder initialization
    builderOnce      sync.Once
    builderCfg       dataobj.BuilderConfig
    bucket           objstore.Bucket
    metastoreManager *metastore.Manager
    // Metrics
    metrics *partitionOffsetMetrics

    // Control and coordination
    ctx    context.Context
    cancel context.CancelFunc
    wg     sync.WaitGroup
    reg    prometheus.Registerer
    logger log.Logger
}

func newPartitionProcessor(ctx context.Context, client *kgo.Client, builderCfg dataobj.BuilderConfig, bucket objstore.Bucket, tenantID string, topic string, partition int32, logger log.Logger, reg prometheus.Registerer) *partitionProcessor {
    ctx, cancel := context.WithCancel(ctx)
    decoder, err := kafka.NewDecoder()
    if err != nil {
        panic(err)
    }
    reg = prometheus.WrapRegistererWith(prometheus.Labels{
        "partition": strconv.Itoa(int(partition)),
    }, reg)

    metrics := newPartitionOffsetMetrics()
    if err := metrics.register(reg); err != nil {
        level.Error(logger).Log("msg", "failed to register partition metrics", "err", err)
    }

    metastoreManager, err := metastore.NewMetastoreManager(bucket, tenantID, logger, reg)
    if err != nil {
        level.Error(logger).Log("msg", "failed to create metastore manager", "err", err)
        cancel()
        return nil
    }

    return &partitionProcessor{
        client:           client,
        logger:           log.With(logger, "topic", topic, "partition", partition),
        topic:            topic,
        partition:        partition,
        records:          make(chan *kgo.Record, 1000),
        ctx:              ctx,
        cancel:           cancel,
        decoder:          decoder,
        reg:              reg,
        builderCfg:       builderCfg,
        bucket:           bucket,
        tenantID:         []byte(tenantID),
        metrics:          metrics,
        metastoreManager: metastoreManager,
    }
}

func (p *partitionProcessor) start() {
    p.wg.Add(1)
    go func() {
        defer p.wg.Done()
        defer close(p.records)

        level.Info(p.logger).Log("msg", "started partition processor")
        for {
            select {
            case <-p.ctx.Done():
                level.Info(p.logger).Log("msg", "stopping partition processor")
                return
            case record := <-p.records:
                p.processRecord(record)
            }
        }
    }()
}

func (p *partitionProcessor) stop() {
    p.cancel()
    p.wg.Wait()
    if p.builder != nil {
        p.builder.UnregisterMetrics(p.reg)
    }
    p.metrics.unregister(p.reg)
}

func (p *partitionProcessor) initBuilder() error {
    var initErr error
    p.builderOnce.Do(func() {
        builder, err := dataobj.NewBuilder(p.builderCfg, p.bucket, string(p.tenantID))
        if err != nil {
            initErr = err
            return
        }
        if err := builder.RegisterMetrics(p.reg); err != nil {
            initErr = err
            return
        }
        p.builder = builder
    })
    return initErr
}

func (p *partitionProcessor) processRecord(record *kgo.Record) {
    // Update offset metric at the end of processing
    defer p.metrics.updateOffset(record.Offset)

    // Observe processing delay
    p.metrics.observeProcessingDelay(record.Timestamp)

    // Initialize builder if this is the first record
    if err := p.initBuilder(); err != nil {
        level.Error(p.logger).Log("msg", "failed to initialize builder", "err", err)
        return
    }

    // todo: handle multi-tenant
    if !bytes.Equal(record.Key, p.tenantID) {
        return
    }
    stream, err := p.decoder.DecodeWithoutLabels(record.Value)
    if err != nil {
        level.Error(p.logger).Log("msg", "failed to decode record", "err", err)
        return
    }

    if err := p.builder.Append(stream); err != nil {
        if err != dataobj.ErrBufferFull {
            level.Error(p.logger).Log("msg", "failed to append stream", "err", err)
            p.metrics.incAppendFailures()
            return
        }

        backoff := backoff.New(p.ctx, backoff.Config{
            MinBackoff: 100 * time.Millisecond,
            MaxBackoff: 10 * time.Second,
        })

        var flushResult dataobj.FlushResult
        for backoff.Ongoing() {
            flushResult, err = p.builder.Flush(p.ctx)
            if err == nil {
                break
            }
            level.Error(p.logger).Log("msg", "failed to flush builder", "err", err)
            p.metrics.incFlushFailures()
            backoff.Wait()
        }

        if err := p.metastoreManager.UpdateMetastore(p.ctx, flushResult); err != nil {
            level.Error(p.logger).Log("msg", "failed to update metastore", "err", err)
            return
        }

        backoff.Reset()
        for backoff.Ongoing() {
            err = p.client.CommitRecords(p.ctx, record)
            if err == nil {
                break
            }
            level.Error(p.logger).Log("msg", "failed to commit records", "err", err)
            p.metrics.incCommitFailures()
            backoff.Wait()
        }

        if err := p.builder.Append(stream); err != nil {
            level.Error(p.logger).Log("msg", "failed to append stream after flushing", "err", err)
            p.metrics.incAppendFailures()
        }
    }
}
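
Note that processRecord commits the offset only after the flush and the metastore update succeed, so a crash before the commit appears to result in reprocessing from the last committed offset (at-least-once) rather than data loss. One detail: the sentinel comparison err != dataobj.ErrBufferFull does not match wrapped errors; a wrap-tolerant form, shown as a sketch with a hypothetical helper and not part of this change, would use errors.Is:

package consumer

import (
    "errors"

    "github.com/grafana/loki/v3/pkg/dataobj"
)

// isBufferFull reports whether err is (or wraps) dataobj.ErrBufferFull.
// Hypothetical helper illustrating the wrap-tolerant check.
func isBufferFull(err error) bool {
    return errors.Is(err, dataobj.ErrBufferFull)
}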
@@ -0,0 +1,218 @@
package consumer

import (
    "context"
    "errors"
    "strconv"
    "sync"
    "time"

    "github.com/go-kit/log"
    "github.com/go-kit/log/level"
    "github.com/grafana/dskit/ring"
    "github.com/grafana/dskit/services"
    "github.com/prometheus/client_golang/prometheus"
    "github.com/thanos-io/objstore"
    "github.com/twmb/franz-go/pkg/kgo"

    "github.com/grafana/loki/v3/pkg/kafka"
    "github.com/grafana/loki/v3/pkg/kafka/client"
    "github.com/grafana/loki/v3/pkg/kafka/partitionring/consumer"
)

const (
    groupName = "dataobj-consumer"
)

type Service struct {
    services.Service

    logger log.Logger
    reg    prometheus.Registerer
    client *consumer.Client

    cfg    Config
    bucket objstore.Bucket

    // Partition management
    partitionMtx      sync.RWMutex
    partitionHandlers map[string]map[int32]*partitionProcessor
}

func New(kafkaCfg kafka.Config, cfg Config, bucket objstore.Bucket, instanceID string, partitionRing ring.PartitionRingReader, reg prometheus.Registerer, logger log.Logger) *Service {
    if cfg.StorageBucketPrefix != "" {
        bucket = objstore.NewPrefixedBucket(bucket, cfg.StorageBucketPrefix)
    }
    s := &Service{
        logger:            log.With(logger, "component", groupName),
        cfg:               cfg,
        bucket:            bucket,
        partitionHandlers: make(map[string]map[int32]*partitionProcessor),
        reg:               reg,
    }

    client, err := consumer.NewGroupClient(
        kafkaCfg,
        partitionRing,
        groupName,
        client.NewReaderClientMetrics(groupName, reg),
        logger,
        kgo.InstanceID(instanceID),
        kgo.SessionTimeout(3*time.Minute),
        kgo.RebalanceTimeout(5*time.Minute),
        kgo.OnPartitionsAssigned(s.handlePartitionsAssigned),
        kgo.OnPartitionsRevoked(func(_ context.Context, _ *kgo.Client, m map[string][]int32) {
            s.handlePartitionsRevoked(m)
        }),
    )
    if err != nil {
        level.Error(logger).Log("msg", "failed to create consumer", "err", err)
        return nil
    }
    s.client = client
    s.Service = services.NewBasicService(nil, s.run, s.stopping)
    return s
}

func (s *Service) handlePartitionsAssigned(ctx context.Context, client *kgo.Client, partitions map[string][]int32) {
    level.Info(s.logger).Log("msg", "partitions assigned", "partitions", formatPartitionsMap(partitions))
    s.partitionMtx.Lock()
    defer s.partitionMtx.Unlock()

    for topic, parts := range partitions {
        if _, ok := s.partitionHandlers[topic]; !ok {
            s.partitionHandlers[topic] = make(map[int32]*partitionProcessor)
        }

        for _, partition := range parts {
            processor := newPartitionProcessor(ctx, client, s.cfg.BuilderConfig, s.bucket, s.cfg.TenantID, topic, partition, s.logger, s.reg)
            s.partitionHandlers[topic][partition] = processor
            processor.start()
        }
    }
}

func (s *Service) handlePartitionsRevoked(partitions map[string][]int32) {
    level.Info(s.logger).Log("msg", "partitions revoked", "partitions", formatPartitionsMap(partitions))
    s.partitionMtx.Lock()
    defer s.partitionMtx.Unlock()

    var wg sync.WaitGroup
    for topic, parts := range partitions {
        if handlers, ok := s.partitionHandlers[topic]; ok {
            for _, partition := range parts {
                if processor, exists := handlers[partition]; exists {
                    wg.Add(1)
                    go func(p *partitionProcessor) {
                        defer wg.Done()
                        p.stop()
                    }(processor)
                    delete(handlers, partition)
                }
            }
            if len(handlers) == 0 {
                delete(s.partitionHandlers, topic)
            }
        }
    }
    wg.Wait()
}

func (s *Service) run(ctx context.Context) error {
    for {
        fetches := s.client.PollRecords(ctx, -1)
        if fetches.IsClientClosed() || ctx.Err() != nil {
            return nil
        }
        if errs := fetches.Errors(); len(errs) > 0 {
            var multiErr error
            for _, err := range errs {
                multiErr = errors.Join(multiErr, err.Err)
            }
            level.Error(s.logger).Log("msg", "error fetching records", "err", multiErr.Error())
            continue
        }
        if fetches.Empty() {
            continue
        }

        fetches.EachPartition(func(ftp kgo.FetchTopicPartition) {
            s.partitionMtx.RLock()
            handlers, ok := s.partitionHandlers[ftp.Topic]
            if !ok {
                s.partitionMtx.RUnlock()
                return
            }
            processor, ok := handlers[ftp.Partition]
            s.partitionMtx.RUnlock()
            if !ok {
                return
            }

            // Collect all records for this partition
            records := ftp.Records
            if len(records) == 0 {
                return
            }

            for _, record := range records {
                select {
                case <-processor.ctx.Done():
                    return
                case processor.records <- record:
                    // Record sent successfully
                }
            }
        })
    }
}

func (s *Service) stopping(failureCase error) error {
    s.partitionMtx.Lock()
    defer s.partitionMtx.Unlock()

    var wg sync.WaitGroup
    for _, handlers := range s.partitionHandlers {
        for _, processor := range handlers {
            wg.Add(1)
            go func(p *partitionProcessor) {
                defer wg.Done()
                p.stop()
            }(processor)
        }
    }
    wg.Wait()
    // Only close the client once all partitions have been stopped. This ensures
    // all records have been processed and their offsets committed before closing.
    s.client.Close()
    level.Info(s.logger).Log("msg", "consumer stopped")
    return failureCase
}

// formatInt32Slice formats an []int32 slice as "[1,2,3]".
func formatInt32Slice(slice []int32) string {
    if len(slice) == 0 {
        return "[]"
    }
    result := "["
    for i, v := range slice {
        if i > 0 {
            result += ","
        }
        result += strconv.Itoa(int(v))
    }
    result += "]"
    return result
}

// formatPartitionsMap formats a map[string][]int32 into a readable string.
func formatPartitionsMap(partitions map[string][]int32) string {
    var result string
    for topic, parts := range partitions {
        if len(result) > 0 {
            result += ", "
        }
        result += topic + "=" + formatInt32Slice(parts)
    }
    return result
}
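
A minimal sketch of driving the Service through its dskit lifecycle; runConsumer and its inputs are hypothetical, not part of this change:

package consumer

import (
    "context"
    "errors"

    "github.com/go-kit/log"
    "github.com/grafana/dskit/ring"
    "github.com/grafana/dskit/services"
    "github.com/prometheus/client_golang/prometheus"
    "github.com/thanos-io/objstore"

    "github.com/grafana/loki/v3/pkg/kafka"
)

// runConsumer is a hypothetical wrapper showing the expected lifecycle.
func runConsumer(ctx context.Context, kafkaCfg kafka.Config, cfg Config, bucket objstore.Bucket, partitionRing ring.PartitionRingReader, logger log.Logger) error {
    svc := New(kafkaCfg, cfg, bucket, "consumer-1", partitionRing, prometheus.DefaultRegisterer, logger)
    if svc == nil {
        // New logs and returns nil when the Kafka group client cannot be created.
        return errors.New("failed to create dataobj consumer service")
    }
    if err := services.StartAndAwaitRunning(ctx, svc); err != nil {
        return err
    }
    <-ctx.Done()
    return services.StopAndAwaitTerminated(context.Background(), svc)
}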
@@ -0,0 +1,175 @@
package metastore

import (
    "bytes"
    "context"
    "fmt"
    "io"
    "sync"
    "time"

    "github.com/go-kit/log"
    "github.com/go-kit/log/level"
    "github.com/grafana/dskit/backoff"
    "github.com/prometheus/client_golang/prometheus"
    "github.com/thanos-io/objstore"

    "github.com/grafana/loki/v3/pkg/dataobj"
    "github.com/grafana/loki/v3/pkg/logproto"
)

const (
    metastoreWindowSize = 12 * time.Hour
)

var (
    // Define our own builder config because metastore objects are significantly smaller.
    metastoreBuilderCfg = dataobj.BuilderConfig{
        SHAPrefixSize:     2,
        TargetObjectSize:  32 * 1024 * 1024,
        TargetPageSize:    4 * 1024 * 1024,
        BufferSize:        32 * 1024 * 1024, // 8x page size
        TargetSectionSize: 4 * 1024 * 1024,  // object size / 8
    }
)

type Manager struct {
    metastoreBuilder *dataobj.Builder
    tenantID         string
    metrics          *metastoreMetrics
    bucket           objstore.Bucket
    logger           log.Logger
    backoff          *backoff.Backoff

    builderOnce sync.Once
}

func NewMetastoreManager(bucket objstore.Bucket, tenantID string, logger log.Logger, reg prometheus.Registerer) (*Manager, error) {
    metrics := newMetastoreMetrics()
    if err := metrics.register(reg); err != nil {
        return nil, err
    }

    return &Manager{
        bucket:   bucket,
        metrics:  metrics,
        logger:   logger,
        tenantID: tenantID,
        backoff: backoff.New(context.TODO(), backoff.Config{
            MinBackoff: 50 * time.Millisecond,
            MaxBackoff: 10 * time.Second,
        }),
        builderOnce: sync.Once{},
    }, nil
}

func (m *Manager) initBuilder() error {
    var initErr error
    m.builderOnce.Do(func() {
        metastoreBuilder, err := dataobj.NewBuilder(metastoreBuilderCfg, m.bucket, m.tenantID)
        if err != nil {
            initErr = err
            return
        }
        m.metastoreBuilder = metastoreBuilder
    })
    return initErr
}

func (m *Manager) UpdateMetastore(ctx context.Context, flushResult dataobj.FlushResult) error {
    var err error
    start := time.Now()
    defer m.metrics.observeMetastoreProcessing(start)

    // Initialize builder if this is the first call for this partition
    if err := m.initBuilder(); err != nil {
        return err
    }

    minTimestamp, maxTimestamp := flushResult.MinTimestamp, flushResult.MaxTimestamp

    // Work our way through the metastore objects window by window, updating & creating them as needed.
    // Each one handles its own retries in order to keep making progress in the event of a failure.
    minMetastoreWindow := minTimestamp.Truncate(metastoreWindowSize)
    maxMetastoreWindow := maxTimestamp.Truncate(metastoreWindowSize)
    for metastoreWindow := minMetastoreWindow; metastoreWindow.Compare(maxMetastoreWindow) <= 0; metastoreWindow = metastoreWindow.Add(metastoreWindowSize) {
        metastorePath := fmt.Sprintf("tenant-%s/metastore/%s.store", m.tenantID, metastoreWindow.Format(time.RFC3339))
        m.backoff.Reset()
        for m.backoff.Ongoing() {
            err = m.bucket.GetAndReplace(ctx, metastorePath, func(existing io.Reader) (io.Reader, error) {
                buf, err := io.ReadAll(existing)
                if err != nil {
                    return nil, err
                }

                m.metastoreBuilder.Reset()

                if len(buf) > 0 {
                    replayStart := time.Now()
                    object := dataobj.FromReaderAt(bytes.NewReader(buf), int64(len(buf)))
                    if err := m.readFromExisting(ctx, object); err != nil {
                        return nil, err
                    }
                    m.metrics.observeMetastoreReplay(replayStart)
                }

                encodingStart := time.Now()

                ls := fmt.Sprintf("{__start__=\"%d\", __end__=\"%d\", __path__=\"%s\"}", minTimestamp.UnixNano(), maxTimestamp.UnixNano(), flushResult.Path)
                err = m.metastoreBuilder.Append(logproto.Stream{
                    Labels:  ls,
                    Entries: []logproto.Entry{{Line: ""}},
                })
                if err != nil {
                    return nil, err
                }

                newMetastore, err := m.metastoreBuilder.FlushToBuffer()
                if err != nil {
                    return nil, err
                }
                m.metrics.observeMetastoreEncoding(encodingStart)
                return newMetastore, nil
            })
            if err == nil {
                level.Info(m.logger).Log("msg", "successfully merged & updated metastore", "metastore", metastorePath)
                break
            }
            level.Error(m.logger).Log("msg", "failed to get and replace metastore object", "err", err, "metastore", metastorePath)
            m.metrics.incMetastoreWriteFailures()
            m.backoff.Wait()
        }
        // Reset at the end too so we don't leave our memory hanging around between calls.
        m.metastoreBuilder.Reset()
    }
    return err
}

func (m *Manager) readFromExisting(ctx context.Context, object *dataobj.Object) error {
    // Fetch sections
    si, err := object.Metadata(ctx)
    if err != nil {
        return err
    }

    // Read streams from existing metastore object and write them to the builder for the new object
    streams := make([]dataobj.Stream, 100)
    for i := 0; i < si.StreamsSections; i++ {
        streamsReader := dataobj.NewStreamsReader(object, i)
        for n, err := streamsReader.Read(ctx, streams); n > 0; n, err = streamsReader.Read(ctx, streams) {
            if err != nil && err != io.EOF {
                return err
            }
            for _, stream := range streams[:n] {
                err = m.metastoreBuilder.Append(logproto.Stream{
                    Labels:  stream.Labels.String(),
                    Entries: []logproto.Entry{{Line: ""}},
                })
                if err != nil {
                    return err
                }
            }
        }
    }
    return nil
}
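
UpdateMetastore above buckets each flush into 12-hour windows and keeps one object per window at tenant-<tenantID>/metastore/<window>.store. A reader could derive the candidate paths for a query time range with the same scheme; metastorePathsForRange is a hypothetical helper mirroring that logic, not part of this change:

package metastore

import (
    "fmt"
    "time"
)

// metastorePathsForRange returns the metastore object paths whose 12-hour
// windows overlap [start, end], using the same truncation and path format
// as UpdateMetastore.
func metastorePathsForRange(tenantID string, start, end time.Time) []string {
    var paths []string
    minWindow := start.Truncate(metastoreWindowSize)
    maxWindow := end.Truncate(metastoreWindowSize)
    for w := minWindow; !w.After(maxWindow); w = w.Add(metastoreWindowSize) {
        paths = append(paths, fmt.Sprintf("tenant-%s/metastore/%s.store", tenantID, w.Format(time.RFC3339)))
    }
    return paths
}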
@@ -0,0 +1,106 @@
package metastore

import (
    "context"
    "fmt"
    "testing"
    "time"

    "github.com/go-kit/log"
    "github.com/prometheus/client_golang/prometheus"
    "github.com/stretchr/testify/require"

    "github.com/grafana/dskit/backoff"
    "github.com/thanos-io/objstore"

    "github.com/grafana/loki/v3/pkg/dataobj"
)

func BenchmarkWriteMetastores(t *testing.B) {
    ctx := context.Background()
    bucket := objstore.NewInMemBucket()
    tenantID := "test-tenant"

    m, err := NewMetastoreManager(bucket, tenantID, log.NewNopLogger(), prometheus.DefaultRegisterer)
    require.NoError(t, err)

    // Set limits for the test
    m.backoff = backoff.New(context.TODO(), backoff.Config{
        MinBackoff: 10 * time.Millisecond,
        MaxBackoff: 100 * time.Millisecond,
        MaxRetries: 3,
    })

    // Add test data spanning multiple metastore windows
    now := time.Date(2025, 1, 1, 15, 0, 0, 0, time.UTC)

    flushResults := make([]dataobj.FlushResult, 1000)
    for i := 0; i < 1000; i++ {
        flushResults[i] = dataobj.FlushResult{
            Path:         fmt.Sprintf("test-dataobj-path-%d", i),
            MinTimestamp: now.Add(-1 * time.Hour).Add(time.Duration(i) * time.Millisecond),
            MaxTimestamp: now,
        }
    }

    t.ResetTimer()
    t.ReportAllocs()
    for i := 0; i < t.N; i++ {
        // Test writing metastores
        err = m.UpdateMetastore(ctx, flushResults[i%len(flushResults)])
        require.NoError(t, err)
    }

    require.Len(t, bucket.Objects(), 1)
}

func TestWriteMetastores(t *testing.T) {
    ctx := context.Background()
    bucket := objstore.NewInMemBucket()
    tenantID := "test-tenant"

    m, err := NewMetastoreManager(bucket, tenantID, log.NewNopLogger(), prometheus.DefaultRegisterer)
    require.NoError(t, err)

    // Set limits for the test
    m.backoff = backoff.New(context.TODO(), backoff.Config{
        MinBackoff: 10 * time.Millisecond,
        MaxBackoff: 100 * time.Millisecond,
        MaxRetries: 3,
    })

    // Add test data spanning multiple metastore windows
    now := time.Date(2025, 1, 1, 15, 0, 0, 0, time.UTC)

    flushResult := dataobj.FlushResult{
        Path:         "test-dataobj-path",
        MinTimestamp: now.Add(-1 * time.Hour),
        MaxTimestamp: now,
    }

    require.Len(t, bucket.Objects(), 0)

    // Test writing metastores
    err = m.UpdateMetastore(ctx, flushResult)
    require.NoError(t, err)

    require.Len(t, bucket.Objects(), 1)
    var originalSize int
    for _, obj := range bucket.Objects() {
        originalSize = len(obj)
    }

    flushResult2 := dataobj.FlushResult{
        Path:         "different-test-dataobj-path",
        MinTimestamp: now.Add(-15 * time.Minute),
        MaxTimestamp: now,
    }

    err = m.UpdateMetastore(ctx, flushResult2)
    require.NoError(t, err)

    require.Len(t, bucket.Objects(), 1)
    for _, obj := range bucket.Objects() {
        require.Greater(t, len(obj), originalSize)
    }
}
@@ -0,0 +1,102 @@
package metastore

import (
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

type metastoreMetrics struct {
    metastoreProcessingTime prometheus.Histogram
    metastoreReplayTime     prometheus.Histogram
    metastoreEncodingTime   prometheus.Histogram
    metastoreWriteFailures  prometheus.Counter
}

func newMetastoreMetrics() *metastoreMetrics {
    metrics := &metastoreMetrics{
        metastoreReplayTime: prometheus.NewHistogram(prometheus.HistogramOpts{
            Name:                            "loki_dataobj_consumer_metastore_replay_seconds",
            Help:                            "Time taken to replay existing metastore data into the in-memory builder in seconds",
            Buckets:                         prometheus.DefBuckets,
            NativeHistogramBucketFactor:     1.1,
            NativeHistogramMaxBucketNumber:  100,
            NativeHistogramMinResetDuration: 0,
        }),
        metastoreEncodingTime: prometheus.NewHistogram(prometheus.HistogramOpts{
            Name:                            "loki_dataobj_consumer_metastore_encoding_seconds",
            Help:                            "Time taken to add the new metadata & encode the new metastore data object in seconds",
            Buckets:                         prometheus.DefBuckets,
            NativeHistogramBucketFactor:     1.1,
            NativeHistogramMaxBucketNumber:  100,
            NativeHistogramMinResetDuration: 0,
        }),
        metastoreProcessingTime: prometheus.NewHistogram(prometheus.HistogramOpts{
            Name:                            "loki_dataobj_consumer_metastore_processing_seconds",
            Help:                            "Total time taken to update all metastores for a flushed dataobj in seconds",
            Buckets:                         prometheus.DefBuckets,
            NativeHistogramBucketFactor:     1.1,
            NativeHistogramMaxBucketNumber:  100,
            NativeHistogramMinResetDuration: 0,
        }),
        metastoreWriteFailures: prometheus.NewCounter(prometheus.CounterOpts{
            Name: "loki_dataobj_consumer_metastore_write_failures_total",
            Help: "Total number of metastore write failures",
        }),
    }

    return metrics
}

func (p *metastoreMetrics) register(reg prometheus.Registerer) error {
    collectors := []prometheus.Collector{
        p.metastoreReplayTime,
        p.metastoreEncodingTime,
        p.metastoreProcessingTime,
        p.metastoreWriteFailures,
    }

    for _, collector := range collectors {
        if err := reg.Register(collector); err != nil {
            if _, ok := err.(prometheus.AlreadyRegisteredError); !ok {
                return err
            }
        }
    }
    return nil
}

func (p *metastoreMetrics) unregister(reg prometheus.Registerer) {
    collectors := []prometheus.Collector{
        p.metastoreReplayTime,
        p.metastoreEncodingTime,
        p.metastoreProcessingTime,
        p.metastoreWriteFailures,
    }

    for _, collector := range collectors {
        reg.Unregister(collector)
    }
}

func (p *metastoreMetrics) incMetastoreWriteFailures() {
    p.metastoreWriteFailures.Inc()
}

func (p *metastoreMetrics) observeMetastoreReplay(recordTimestamp time.Time) {
    if !recordTimestamp.IsZero() { // Only observe if timestamp is valid
        p.metastoreReplayTime.Observe(time.Since(recordTimestamp).Seconds())
    }
}

func (p *metastoreMetrics) observeMetastoreEncoding(recordTimestamp time.Time) {
    if !recordTimestamp.IsZero() { // Only observe if timestamp is valid
        p.metastoreEncodingTime.Observe(time.Since(recordTimestamp).Seconds())
    }
}

func (p *metastoreMetrics) observeMetastoreProcessing(recordTimestamp time.Time) {
    if !recordTimestamp.IsZero() { // Only observe if timestamp is valid
        p.metastoreProcessingTime.Observe(time.Since(recordTimestamp).Seconds())
    }
}