// loki/pkg/kafka/partition/reader.go

package partition

import (
	"context"
	"fmt"
	"time"

	"github.com/coder/quartz"
	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/grafana/dskit/multierror"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/twmb/franz-go/pkg/kadm"
	"github.com/twmb/franz-go/pkg/kerr"
	"github.com/twmb/franz-go/pkg/kgo"
	"github.com/twmb/franz-go/pkg/kmsg"

	"github.com/grafana/loki/v3/pkg/kafka"
	"github.com/grafana/loki/v3/pkg/kafka/client"
)
var errWaitTargetLagDeadlineExceeded = errors.New("waiting for target lag deadline exceeded")

const (
	// kafkaStartOffset and kafkaEndOffset are Kafka's special offset values:
	// -2 addresses the earliest available offset and -1 the next offset to be
	// produced, matching the semantics of kgo.NewOffset().At().
	kafkaStartOffset = -2
	kafkaEndOffset   = -1

	// Phase labels used for the receive-delay metric.
	phaseStarting = "starting"
	phaseRunning  = "running"
)
// Reader is responsible for reading data from a specific Kafka partition
// and passing it to the consumer for processing. It is a core component of the
// Loki ingester's Kafka-based ingestion pipeline.
type Reader struct {
	services.Service

	kafkaCfg            kafka.Config
	partitionID         int32
	consumerGroup       string
	consumerFactory     ConsumerFactory
	committer           *partitionCommitter
	lastProcessedOffset int64
	recordsChan         chan []Record
	client              *kgo.Client
	logger              log.Logger
	metrics             readerMetrics
	reg                 prometheus.Registerer
	clock               quartz.Clock
}
// Record is a single Kafka record, decoded into the fields the consumer needs.
type Record struct {
	// Ctx holds the tracing (and potentially other) info that the record was enriched with on fetch from Kafka.
	Ctx      context.Context
	TenantID string
	Content  []byte
	Offset   int64
}

// ConsumerFactory creates a Consumer that commits offsets through the given Committer.
type ConsumerFactory func(committer Committer) (Consumer, error)

// Consumer processes batches of records read from a partition. Start launches the
// processing goroutine and returns a function that blocks until it has stopped.
type Consumer interface {
	Start(ctx context.Context, recordsChan <-chan []Record) func()
}
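
// What follows is an illustrative sketch, not part of the original file: a
// minimal no-op Consumer and factory showing how the interfaces above fit
// together. The names noopConsumer and NewNoopConsumerFactory are hypothetical.
type noopConsumer struct {
	logger log.Logger
}

// Start drains recordsChan until it is closed or ctx is canceled, and returns
// a function that blocks until the goroutine has exited: the same contract the
// Reader relies on in start and run below.
func (c *noopConsumer) Start(ctx context.Context, recordsChan <-chan []Record) func() {
	done := make(chan struct{})
	go func() {
		defer close(done)
		for {
			select {
			case <-ctx.Done():
				return
			case records, ok := <-recordsChan:
				if !ok {
					return
				}
				level.Debug(c.logger).Log("msg", "dropping records", "count", len(records))
			}
		}
	}()
	return func() { <-done }
}

// NewNoopConsumerFactory ignores the committer, so no offsets are ever committed.
func NewNoopConsumerFactory(logger log.Logger) ConsumerFactory {
	return func(_ Committer) (Consumer, error) {
		return &noopConsumer{logger: logger}, nil
	}
}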
// NewReader creates and initializes a new Reader.
// It sets up the basic service and initializes the reader with the provided configuration.
func NewReader(
	kafkaCfg kafka.Config,
	partitionID int32,
	instanceID string,
	consumerFactory ConsumerFactory,
	logger log.Logger,
	reg prometheus.Registerer,
) (*Reader, error) {
	r := &Reader{
		kafkaCfg:            kafkaCfg,
		partitionID:         partitionID,
		consumerGroup:       kafkaCfg.GetConsumerGroup(instanceID, partitionID),
		logger:              logger,
		metrics:             newReaderMetrics(reg),
		reg:                 reg,
		lastProcessedOffset: -1,
		consumerFactory:     consumerFactory,
	}
	r.Service = services.NewBasicService(r.start, r.run, nil)
	return r, nil
}
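
// Illustrative usage sketch, not part of the original file: wiring a Reader
// into the dskit service lifecycle. startPartitionReader and its arguments are
// hypothetical names introduced for this example.
func startPartitionReader(ctx context.Context, cfg kafka.Config, factory ConsumerFactory, logger log.Logger, reg prometheus.Registerer) (*Reader, error) {
	r, err := NewReader(cfg, 0, "ingester-0", factory, logger, reg)
	if err != nil {
		return nil, err
	}
	// StartAndAwaitRunning invokes start (which replays the partition until the
	// configured lag is honored) and then leaves run consuming in the background.
	if err := services.StartAndAwaitRunning(ctx, r); err != nil {
		return nil, err
	}
	return r, nil
}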
// start initializes the Kafka client and committer for the Reader.
// This method is called when the Reader service starts.
func (p *Reader) start(ctx context.Context) error {
	var err error
	p.client, err = client.NewReaderClient(p.kafkaCfg, p.metrics.kprom, p.logger)
	if err != nil {
		return errors.Wrap(err, "creating kafka reader client")
	}
	p.metrics.reportStarting(p.partitionID)

	// We manage our commits manually, so we must fetch the last offset for our consumer group to find out where to read from.
	lastCommittedOffset := p.fetchLastCommittedOffset(ctx)
	if lastCommittedOffset == kafkaEndOffset {
		level.Warn(p.logger).Log("msg", "no committed offset found for partition, starting from the beginning", "partition", p.partitionID, "consumer_group", p.consumerGroup)
		lastCommittedOffset = kafkaStartOffset // If we haven't committed any offsets yet, we start reading from the beginning.
	}
	if lastCommittedOffset > 0 {
		lastCommittedOffset++ // We want to begin reading from the next offset, but only if we've previously committed an offset.
	}
	p.client.AddConsumePartitions(map[string]map[int32]kgo.Offset{
		p.kafkaCfg.Topic: {p.partitionID: kgo.NewOffset().At(lastCommittedOffset)},
	})
	level.Info(p.logger).Log("msg", "initialising partition reader", "last_committed_offset", lastCommittedOffset, "partition", p.partitionID, "consumer_group", p.consumerGroup)
	p.committer = newCommitter(p.kafkaCfg, kadm.NewClient(p.client), p.partitionID, p.consumerGroup, p.logger, p.reg)

	if targetLag, maxLag := p.kafkaCfg.TargetConsumerLagAtStartup, p.kafkaCfg.MaxConsumerLagAtStartup; targetLag > 0 && maxLag > 0 {
		consumer, err := p.consumerFactory(p.committer)
		if err != nil {
			return fmt.Errorf("creating consumer: %w", err)
		}
		cancelCtx, cancel := context.WithCancel(ctx)
		// Temporarily start a consumer to do the initial update.
		recordsChan := make(chan []Record)
		wait := consumer.Start(cancelCtx, recordsChan)
		// Shut down the consumer after catching up. We start a new instance in the run method to tie the lifecycle to the run context.
		defer func() {
			close(recordsChan)
			cancel()
			wait()
		}()

		err = p.processNextFetchesUntilTargetOrMaxLagHonored(ctx, targetLag, maxLag, recordsChan)
		if err != nil {
			level.Error(p.logger).Log("msg", "failed to catch up to max lag", "partition", p.partitionID, "consumer_group", p.consumerGroup, "err", err)
			return err
		}
	}
	return nil
}
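
// Illustrative sketch, not part of the original file: the resume-position rule
// used in start above, extracted as a pure function. resumeOffset is a
// hypothetical helper. A committed offset is the last record already processed,
// so consumption resumes at committed+1; when no commit exists (the fetch
// returns kafkaEndOffset, i.e. -1) we fall back to the partition start.
func resumeOffset(lastCommitted int64) kgo.Offset {
	if lastCommitted == kafkaEndOffset {
		lastCommitted = kafkaStartOffset
	}
	if lastCommitted > 0 {
		lastCommitted++ // Resume from the record after the committed one.
	}
	return kgo.NewOffset().At(lastCommitted)
}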
// run is the main loop of the Reader. It continuously fetches and processes
// data from Kafka, and sends it to the consumer.
func (p *Reader) run(ctx context.Context) error {
	level.Info(p.logger).Log("msg", "starting partition reader", "partition", p.partitionID, "consumer_group", p.consumerGroup)
	p.metrics.reportRunning(p.partitionID)

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	consumer, err := p.consumerFactory(p.committer)
	if err != nil {
		return errors.Wrap(err, "creating consumer")
	}

	recordsChan := p.startFetchLoop(ctx)
	wait := consumer.Start(ctx, recordsChan)

	wait()
	p.committer.Stop()
	return nil
}
// fetchLastCommittedOffset returns the last committed offset for our consumer
// group on this partition, or kafkaStartOffset if it cannot be determined.
func (p *Reader) fetchLastCommittedOffset(ctx context.Context) int64 {
	// We manually create a request so that we can request the offset for a single partition
	// only, which is more performant than requesting the offsets for all partitions.
	req := kmsg.NewPtrOffsetFetchRequest()
	req.Topics = []kmsg.OffsetFetchRequestTopic{{Topic: p.kafkaCfg.Topic, Partitions: []int32{p.partitionID}}}
	req.Group = p.consumerGroup

	resps := p.client.RequestSharded(ctx, req)

	// Since we issued a request for only 1 partition, we expect exactly 1 response.
	if expected, actual := 1, len(resps); actual != expected {
		level.Error(p.logger).Log("msg", fmt.Sprintf("unexpected number of responses (expected: %d, got: %d)", expected, actual))
		return kafkaStartOffset
	}
	// Ensure no error occurred.
	res := resps[0]
	if res.Err != nil {
		level.Error(p.logger).Log("msg", "error fetching group offset for partition", "err", res.Err)
		return kafkaStartOffset
	}
	// Parse the response.
	fetchRes, ok := res.Resp.(*kmsg.OffsetFetchResponse)
	if !ok {
		level.Error(p.logger).Log("msg", "unexpected response type")
		return kafkaStartOffset
	}
	if expected, actual := 1, len(fetchRes.Groups); actual != expected {
		level.Error(p.logger).Log("msg", fmt.Sprintf("unexpected number of groups in the response (expected: %d, got: %d)", expected, actual))
		return kafkaStartOffset
	}
	if expected, actual := 1, len(fetchRes.Groups[0].Topics); actual != expected {
		level.Error(p.logger).Log("msg", fmt.Sprintf("unexpected number of topics in the response (expected: %d, got: %d)", expected, actual))
		return kafkaStartOffset
	}
	if expected, actual := p.kafkaCfg.Topic, fetchRes.Groups[0].Topics[0].Topic; expected != actual {
		level.Error(p.logger).Log("msg", fmt.Sprintf("unexpected topic in the response (expected: %s, got: %s)", expected, actual))
		return kafkaStartOffset
	}
	if expected, actual := 1, len(fetchRes.Groups[0].Topics[0].Partitions); actual != expected {
		level.Error(p.logger).Log("msg", fmt.Sprintf("unexpected number of partitions in the response (expected: %d, got: %d)", expected, actual))
		return kafkaStartOffset
	}
	if expected, actual := p.partitionID, fetchRes.Groups[0].Topics[0].Partitions[0].Partition; actual != expected {
		level.Error(p.logger).Log("msg", fmt.Sprintf("unexpected partition in the response (expected: %d, got: %d)", expected, actual))
		return kafkaStartOffset
	}
	if err := kerr.ErrorForCode(fetchRes.Groups[0].Topics[0].Partitions[0].ErrorCode); err != nil {
		level.Error(p.logger).Log("msg", "unexpected error in the response", "err", err)
		return kafkaStartOffset
	}
	return fetchRes.Groups[0].Topics[0].Partitions[0].Offset
}
// fetchPartitionOffset returns the offset of this partition at the given
// position. position is either a concrete timestamp or one of the special
// values kafkaStartOffset (the earliest available offset) and kafkaEndOffset
// (the next offset to be produced).
func (p *Reader) fetchPartitionOffset(ctx context.Context, position int64) (int64, error) {
	// Create a custom request to fetch the latest offset of a specific partition.
	// We manually create a request so that we can request the offset for a single partition
	// only, which is more performant than requesting the offsets for all partitions.
	partitionReq := kmsg.NewListOffsetsRequestTopicPartition()
	partitionReq.Partition = p.partitionID
	partitionReq.Timestamp = position

	topicReq := kmsg.NewListOffsetsRequestTopic()
	topicReq.Topic = p.kafkaCfg.Topic
	topicReq.Partitions = []kmsg.ListOffsetsRequestTopicPartition{partitionReq}

	req := kmsg.NewPtrListOffsetsRequest()
	req.IsolationLevel = 0 // 0 means READ_UNCOMMITTED.
	req.Topics = []kmsg.ListOffsetsRequestTopic{topicReq}

	// Even if we share the same client, other in-flight requests are not canceled once this context is canceled
	// (or its deadline is exceeded). We've verified it with a unit test.
	resps := p.client.RequestSharded(ctx, req)

	// Since we issued a request for only 1 partition, we expect exactly 1 response.
	if expected, actual := 1, len(resps); actual != expected {
		return 0, fmt.Errorf("unexpected number of responses (expected: %d, got: %d)", expected, actual)
	}
	// Ensure no error occurred.
	res := resps[0]
	if res.Err != nil {
		return 0, res.Err
	}
	// Parse the response.
	listRes, ok := res.Resp.(*kmsg.ListOffsetsResponse)
	if !ok {
		return 0, errors.New("unexpected response type")
	}
	if expected, actual := 1, len(listRes.Topics); actual != expected {
		return 0, fmt.Errorf("unexpected number of topics in the response (expected: %d, got: %d)", expected, actual)
	}
	if expected, actual := p.kafkaCfg.Topic, listRes.Topics[0].Topic; expected != actual {
		return 0, fmt.Errorf("unexpected topic in the response (expected: %s, got: %s)", expected, actual)
	}
	if expected, actual := 1, len(listRes.Topics[0].Partitions); actual != expected {
		return 0, fmt.Errorf("unexpected number of partitions in the response (expected: %d, got: %d)", expected, actual)
	}
	if expected, actual := p.partitionID, listRes.Topics[0].Partitions[0].Partition; actual != expected {
		return 0, fmt.Errorf("unexpected partition in the response (expected: %d, got: %d)", expected, actual)
	}
	if err := kerr.ErrorForCode(listRes.Topics[0].Partitions[0].ErrorCode); err != nil {
		return 0, err
	}
	return listRes.Topics[0].Partitions[0].Offset, nil
}
// processNextFetchesUntilTargetOrMaxLagHonored processes records from Kafka until at least the maxLag is honored.
// This function does a best-effort to get lag below targetLag, but it's not guaranteed that it will be
// reached once this function successfully returns (only maxLag is guaranteed).
func (p *Reader) processNextFetchesUntilTargetOrMaxLagHonored(ctx context.Context, targetLag, maxLag time.Duration, recordsChan chan<- []Record) error {
	logger := log.With(p.logger, "target_lag", targetLag, "max_lag", maxLag)
	level.Info(logger).Log("msg", "partition reader is starting to consume partition until target and max consumer lag is honored")

	attempts := []func() (time.Duration, error){
		// First process fetches until at least the max lag is honored.
		func() (time.Duration, error) {
			return p.processNextFetchesUntilLagHonored(ctx, maxLag, logger, recordsChan, time.Since)
		},

		// If the target lag hasn't been reached with the first attempt (which stops once at least the max lag
		// is honored) then we try to reach the (lower) target lag within a fixed time (best-effort).
		// The timeout is equal to the max lag. This is done because we expect at least a 2x replay speed
		// from Kafka (which means at most it takes 1s to ingest 2s of data): assuming new data is continuously
		// written to the partition, we give the reader maxLag time to replay the backlog + ingest the new data
		// written in the meanwhile.
		func() (time.Duration, error) {
			timedCtx, cancel := context.WithTimeoutCause(ctx, maxLag, errWaitTargetLagDeadlineExceeded)
			defer cancel()
			return p.processNextFetchesUntilLagHonored(timedCtx, targetLag, logger, recordsChan, time.Since)
		},

		// If the target lag hasn't been reached with the previous attempt then we'll move on. However,
		// we still need to guarantee that in the meanwhile the lag didn't increase and max lag is still honored.
		func() (time.Duration, error) {
			return p.processNextFetchesUntilLagHonored(ctx, maxLag, logger, recordsChan, time.Since)
		},
	}

	var currLag time.Duration
	for _, attempt := range attempts {
		var err error

		currLag, err = attempt()
		if errors.Is(err, errWaitTargetLagDeadlineExceeded) {
			continue
		}
		if err != nil {
			return err
		}
		if currLag <= targetLag {
			level.Info(logger).Log(
				"msg", "partition reader consumed partition and current lag is lower or equal to configured target consumer lag",
				"last_consumed_offset", p.committer.lastCommittedOffset,
				"current_lag", currLag,
			)
			return nil
		}
	}

	level.Warn(logger).Log(
		"msg", "partition reader consumed partition and current lag is lower than configured max consumer lag but higher than target consumer lag",
		"last_consumed_offset", p.committer.lastCommittedOffset,
		"current_lag", currLag,
	)
	return nil
}
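
// Illustrative sketch, not part of the original file: the context idiom the
// second attempt above relies on. context.WithTimeoutCause makes
// context.Cause(ctx) return the supplied cause once the timeout fires, which
// is how errors.Is(err, errWaitTargetLagDeadlineExceeded) distinguishes "the
// target-lag deadline expired, move on" from a real failure. exampleTimeoutCause
// is a hypothetical name.
func exampleTimeoutCause(parent context.Context) bool {
	ctx, cancel := context.WithTimeoutCause(parent, time.Millisecond, errWaitTargetLagDeadlineExceeded)
	defer cancel()
	<-ctx.Done()
	// ctx.Err() is still context.DeadlineExceeded; the custom cause is only
	// visible through context.Cause (which the backoff's ErrCause surfaces).
	return errors.Is(context.Cause(ctx), errWaitTargetLagDeadlineExceeded)
}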
func (p *Reader) processNextFetchesUntilLagHonored(ctx context.Context, maxLag time.Duration, logger log.Logger, recordsChan chan<- []Record, timeSince func(time.Time) time.Duration) (time.Duration, error) {
	boff := backoff.New(ctx, backoff.Config{
		MinBackoff: 100 * time.Millisecond,
		MaxBackoff: time.Second,
		MaxRetries: 0, // Retry forever (unless context is canceled / deadline exceeded).
	})
	currLag := time.Duration(0)

	for boff.Ongoing() {
		// Send a direct request to the Kafka backend to fetch the partition start offset.
		partitionStartOffset, err := p.fetchPartitionOffset(ctx, kafkaStartOffset)
		if err != nil {
			level.Warn(logger).Log("msg", "partition reader failed to fetch partition start offset", "err", err)
			boff.Wait()
			continue
		}

		consumerGroupLastCommittedOffset := p.fetchLastCommittedOffset(ctx)

		// Send a direct request to the Kafka backend to fetch the last produced offset.
		// We intentionally don't use WaitNextFetchLastProducedOffset() to not introduce further
		// latency.
		lastProducedOffsetRequestedAt := time.Now()
		lastProducedOffset, err := p.fetchPartitionOffset(ctx, kafkaEndOffset)
		if err != nil {
			level.Warn(logger).Log("msg", "partition reader failed to fetch last produced offset", "err", err)
			boff.Wait()
			continue
		}
		lastProducedOffset = lastProducedOffset - 1 // Kafka returns the next empty offset, so we must subtract 1 to get the last produced offset.

		level.Debug(logger).Log("msg", "fetched latest offset information", "partition_start_offset", partitionStartOffset, "last_produced_offset", lastProducedOffset)

		// Ensure there are some records to consume. For example, if the partition has been inactive for a long
		// time and all its records have been deleted, the partition start offset may be > 0 but there are no
		// records to actually consume.
		if partitionStartOffset > lastProducedOffset {
			level.Info(logger).Log("msg", "partition reader found no records to consume because partition is empty", "partition_start_offset", partitionStartOffset, "last_produced_offset", lastProducedOffset)
			return 0, nil
		}

		if consumerGroupLastCommittedOffset == lastProducedOffset {
			level.Info(logger).Log("msg", "partition reader found no records to consume because it is already up-to-date", "last_committed_offset", consumerGroupLastCommittedOffset, "last_produced_offset", lastProducedOffset)
			return 0, nil
		}

		// This message is NOT expected to be logged with a very high rate. In this log we display the last measured
		// lag. If we don't have it (lag is zero value), then it will not be logged.
		level.Info(loggerWithCurrentLagIfSet(logger, currLag)).Log("msg", "partition reader is consuming records to honor target and max consumer lag", "partition_start_offset", partitionStartOffset, "last_produced_offset", lastProducedOffset, "last_processed_offset", p.lastProcessedOffset, "offset_lag", lastProducedOffset-p.lastProcessedOffset)

		for boff.Ongoing() {
			// Continue reading until we reach the desired offset.
			if lastProducedOffset <= p.lastProcessedOffset {
				break
			}
			if time.Since(lastProducedOffsetRequestedAt) > time.Minute {
				level.Info(loggerWithCurrentLagIfSet(logger, currLag)).Log("msg", "partition reader is still consuming records...", "last_processed_offset", p.lastProcessedOffset, "offset_lag", lastProducedOffset-p.lastProcessedOffset)
			}

			timedCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
			recordsChan <- p.poll(timedCtx)
			cancel()
		}
		if boff.Err() != nil {
			return 0, boff.ErrCause()
		}

		// The lag is the time elapsed since we asked Kafka for the last produced offset:
		// if we replayed up to that offset in less than the max desired lag, we can stop
		// here, otherwise we'll have to redo it.
		if currLag = timeSince(lastProducedOffsetRequestedAt); currLag <= maxLag {
			return currLag, nil
		}
	}

	return 0, boff.ErrCause()
}
func loggerWithCurrentLagIfSet(logger log.Logger, currLag time.Duration) log.Logger {
	if currLag <= 0 {
		return logger
	}
	return log.With(logger, "current_lag", currLag)
}
// startFetchLoop polls Kafka in a background goroutine and returns the channel
// on which fetched record batches are delivered. The channel is unbuffered, so
// polling is naturally paced by how fast the consumer drains it.
func (p *Reader) startFetchLoop(ctx context.Context) chan []Record {
	records := make(chan []Record)
	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			default:
				records <- p.poll(ctx)
			}
		}
	}()
	return records
}
// logFetchErrors logs any errors encountered during the fetch operation.
func (p *Reader) logFetchErrors(fetches kgo.Fetches) {
	mErr := multierror.New()
	fetches.EachError(func(topic string, partition int32, err error) {
		if errors.Is(err, context.Canceled) {
			return
		}

		// kgo advises to "restart" the kafka client if the returned error is a kerr.Error.
		// Recreating the client would cause duplicate metrics registration, so we don't do it for now.
		mErr.Add(fmt.Errorf("topic %q, partition %d: %w", topic, partition, err))
	})
	if len(mErr) == 0 {
		return
	}
	p.metrics.fetchesErrors.Add(float64(len(mErr)))
	level.Error(p.logger).Log("msg", "encountered error while fetching", "err", mErr.Err())
}
// poll retrieves the next batch of records from Kafka and measures the fetch duration.
func (p *Reader) poll(ctx context.Context) []Record {
	defer func(start time.Time) {
		p.metrics.fetchWaitDuration.Observe(time.Since(start).Seconds())
	}(time.Now())
	fetches := p.client.PollFetches(ctx)
	p.recordFetchesMetrics(fetches)
	p.logFetchErrors(fetches)
	fetches = filterOutErrFetches(fetches)
	if fetches.NumRecords() == 0 {
		return nil
	}
	records := make([]Record, 0, fetches.NumRecords())
	fetches.EachRecord(func(rec *kgo.Record) {
		if rec.Partition != p.partitionID {
			level.Error(p.logger).Log("msg", "wrong partition record received", "partition", rec.Partition, "expected_partition", p.partitionID)
			return
		}
		records = append(records, Record{
			// This context carries the tracing data for this individual record;
			// kotel populates this data when it fetches the messages.
			Ctx:      rec.Context,
			TenantID: string(rec.Key),
			Content:  rec.Value,
			Offset:   rec.Offset,
		})
	})
	// Guard against the (unexpected) case where every record was skipped because
	// it belonged to the wrong partition, which would otherwise panic below.
	if len(records) == 0 {
		return nil
	}
	p.lastProcessedOffset = records[len(records)-1].Offset
	return records
}
// recordFetchesMetrics updates various metrics related to the fetch operation.
func (p *Reader) recordFetchesMetrics(fetches kgo.Fetches) {
	var (
		now        = time.Now()
		numRecords = 0
	)
	fetches.EachRecord(func(record *kgo.Record) {
		numRecords++
		delay := now.Sub(record.Timestamp).Seconds()
		if p.Service.State() == services.Starting {
			p.metrics.receiveDelay.WithLabelValues(phaseStarting).Observe(delay)
		} else {
			p.metrics.receiveDelay.WithLabelValues(phaseRunning).Observe(delay)
		}
	})

	p.metrics.fetchesTotal.Add(float64(len(fetches)))
	p.metrics.recordsPerFetch.Observe(float64(numRecords))
}
// filterOutErrFetches removes any fetches that resulted in errors from the provided slice.
func filterOutErrFetches(fetches kgo.Fetches) kgo.Fetches {
	filtered := make(kgo.Fetches, 0, len(fetches))
	for i, fetch := range fetches {
		if !isErrFetch(fetch) {
			filtered = append(filtered, fetches[i])
		}
	}
	return filtered
}

// isErrFetch checks if a given fetch resulted in any errors.
func isErrFetch(fetch kgo.Fetch) bool {
	for _, t := range fetch.Topics {
		for _, p := range t.Partitions {
			if p.Err != nil {
				return true
			}
		}
	}
	return false
}
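
// Illustrative sketch, not part of the original file: hand-constructing a
// kgo.Fetch with a partition-level error to show what isErrFetch detects.
// exampleIsErrFetch is a hypothetical name.
func exampleIsErrFetch() bool {
	fetch := kgo.Fetch{
		Topics: []kgo.FetchTopic{{
			Topic: "loki",
			Partitions: []kgo.FetchPartition{{
				Partition: 0,
				Err:       errors.New("broker unavailable"),
			}},
		}},
	}
	return isErrFetch(fetch) // true: the partition carries an error.
}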