package ingester

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/kv"
	"github.com/grafana/dskit/ring"
	"github.com/grafana/dskit/services"
	"github.com/prometheus/client_golang/prometheus"
	"google.golang.org/grpc/health/grpc_health_v1"

	"github.com/grafana/loki/v3/pkg/kafka"
	"github.com/grafana/loki/v3/pkg/kafka/ingester/shutdownmarker"
	"github.com/grafana/loki/v3/pkg/kafka/partitionring"
	util_log "github.com/grafana/loki/v3/pkg/util/log"

	"github.com/grafana/loki/v3/pkg/util"
)

const (
	RingName          = "kafka-ingester"
	PartitionRingName = "kafka-partition"
)

var (
	ingesterIDRegexp           = regexp.MustCompile("-([0-9]+)$")
	defaultFlushInterval       = 15 * time.Second
	defaultFlushSize     int64 = 300 << 20 // 300 MB
)

// Config for an ingester.
type Config struct {
	Enabled             bool                  `yaml:"enabled" doc:"description=Whether the kafka ingester is enabled."`
	LifecyclerConfig    ring.LifecyclerConfig `yaml:"lifecycler,omitempty" doc:"description=Configures how the lifecycle of the ingester will operate and where it will register for discovery."`
	ShutdownMarkerPath  string                `yaml:"shutdown_marker_path"`
	FlushInterval       time.Duration         `yaml:"flush_interval" doc:"description=The interval at which the ingester will flush and commit offsets to Kafka. If not set, the default flush interval will be used."`
	FlushSize           int64                 `yaml:"flush_size" doc:"description=The size at which the ingester will flush and commit offsets to Kafka. If not set, the default flush size will be used."`
	PartitionRingConfig partitionring.Config  `yaml:"partition_ring" category:"experimental"`

	KafkaConfig kafka.Config `yaml:"-"`
}

// RegisterFlags registers the flags.
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	cfg.LifecyclerConfig.RegisterFlagsWithPrefix("kafka-ingester", f, util_log.Logger)
	cfg.PartitionRingConfig.RegisterFlags(f)
	f.StringVar(&cfg.ShutdownMarkerPath, "kafka-ingester.shutdown-marker-path", "", "Path where the shutdown marker file is stored. If not set and common.path_prefix is set then common.path_prefix will be used.")
	f.BoolVar(&cfg.Enabled, "kafka-ingester.enabled", false, "Whether the Kafka-based ingester path is enabled")
	f.DurationVar(&cfg.FlushInterval, "kafka-ingester.flush-interval", defaultFlushInterval, "The interval at which the ingester will flush and commit offsets to Kafka. If not set, the default flush interval will be used.")
	f.Int64Var(&cfg.FlushSize, "kafka-ingester.flush-size", defaultFlushSize, "The size at which the ingester will flush and commit offsets to Kafka. If not set, the default flush size will be used.")
}

func (cfg *Config) Validate() error {
	if !cfg.Enabled {
		return nil
	}
	if cfg.FlushInterval <= 0 {
		return errors.New("kafka-ingester.flush-interval must be greater than 0")
	}
	if cfg.LifecyclerConfig.RingConfig.ReplicationFactor != 1 {
		cfg.LifecyclerConfig.RingConfig.ReplicationFactor = 1
		level.Warn(util_log.Logger).Log("msg", "kafka-ingester.lifecycler.replication-factor has been set to 1. This is the only supported replication factor for the kafka-ingester.")
	}
	return nil
}

type Wrapper interface {
	Wrap(wrapped Interface) Interface
}

// Interface is an interface for the Ingester
type Interface interface {
	services.Service
	http.Handler
	CheckReady(ctx context.Context) error
	FlushHandler(w http.ResponseWriter, _ *http.Request)
}
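
// exampleLoadConfig is an illustrative sketch and not part of the upstream code:
// it shows, under the assumption that the caller owns flag parsing, how Config is
// expected to be used: register flags on a flag.FlagSet, parse, then call Validate,
// which is a no-op unless -kafka-ingester.enabled is set and which forces the
// lifecycler replication factor to 1 otherwise.
func exampleLoadConfig(args []string) (Config, error) {
	var cfg Config
	// The flag set name below is hypothetical; any name works for this sketch.
	fs := flag.NewFlagSet("kafka-ingester-example", flag.ContinueOnError)
	cfg.RegisterFlags(fs)
	if err := fs.Parse(args); err != nil {
		return Config{}, err
	}
	if err := cfg.Validate(); err != nil {
		return Config{}, err
	}
	return cfg, nil
}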
// Ingester builds chunks for incoming log streams.
type Ingester struct {
	services.Service

	cfg     Config
	logger  log.Logger
	metrics *ingesterMetrics

	lifecycler        *ring.Lifecycler
	lifecyclerWatcher *services.FailureWatcher

	ingesterPartitionID     int32
	partitionRingLifecycler *ring.PartitionInstanceLifecycler
	partitionReader         *PartitionReader
}

// New makes a new Ingester.
func New(cfg Config,
	consumerFactory ConsumerFactory,
	logger log.Logger,
	metricsNamespace string,
	registerer prometheus.Registerer,
) (*Ingester, error) {
	metrics := newIngesterMetrics(registerer)

	ingesterPartitionID, err := extractIngesterPartitionID(cfg.LifecyclerConfig.ID)
	if err != nil {
		return nil, fmt.Errorf("calculating ingester partition ID: %w", err)
	}

	partitionRingKV := cfg.PartitionRingConfig.KVStore.Mock
	if partitionRingKV == nil {
		partitionRingKV, err = kv.NewClient(cfg.PartitionRingConfig.KVStore, ring.GetPartitionRingCodec(), kv.RegistererWithKVName(registerer, PartitionRingName+"-lifecycler"), logger)
		if err != nil {
			return nil, fmt.Errorf("creating KV store for ingester partition ring: %w", err)
		}
	}

	partitionRingLifecycler := ring.NewPartitionInstanceLifecycler(
		cfg.PartitionRingConfig.ToLifecyclerConfig(ingesterPartitionID, cfg.LifecyclerConfig.ID),
		PartitionRingName,
		PartitionRingName+"-key",
		partitionRingKV,
		logger,
		prometheus.WrapRegistererWithPrefix("loki_", registerer))

	i := &Ingester{
		cfg:                     cfg,
		logger:                  logger,
		ingesterPartitionID:     ingesterPartitionID,
		partitionRingLifecycler: partitionRingLifecycler,
		metrics:                 metrics,
	}

	i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, RingName, RingName+"-ring", true, logger, prometheus.WrapRegistererWithPrefix(metricsNamespace+"_", registerer))
	if err != nil {
		return nil, err
	}

	i.partitionReader, err = NewPartitionReader(cfg.KafkaConfig, ingesterPartitionID, cfg.LifecyclerConfig.ID, consumerFactory, logger, registerer)
	if err != nil {
		return nil, err
	}

	i.lifecyclerWatcher = services.NewFailureWatcher()
	i.lifecyclerWatcher.WatchService(i.lifecycler)
	i.lifecyclerWatcher.WatchService(i.partitionRingLifecycler)
	i.lifecyclerWatcher.WatchService(i.partitionReader)

	i.Service = services.NewBasicService(i.starting, i.running, i.stopping)

	return i, nil
}

// extractIngesterPartitionID returns the partition ID owned by the given ingester,
// derived from the numeric suffix of its ID (e.g. "ingester-5" owns partition 5).
// IDs containing "local" map to partition 0.
func extractIngesterPartitionID(ingesterID string) (int32, error) {
	if strings.Contains(ingesterID, "local") {
		return 0, nil
	}

	match := ingesterIDRegexp.FindStringSubmatch(ingesterID)
	if len(match) == 0 {
		return 0, fmt.Errorf("ingester ID %s doesn't match regular expression %q", ingesterID, ingesterIDRegexp.String())
	}

	// Parse the ingester sequence number.
	ingesterSeq, err := strconv.Atoi(match[1])
	if err != nil {
		return 0, fmt.Errorf("no ingester sequence number in ingester ID %s", ingesterID)
	}

	return int32(ingesterSeq), nil
}

// ServeHTTP implements the ring status page.
func (i *Ingester) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	i.lifecycler.ServeHTTP(w, r)
}

func (i *Ingester) starting(ctx context.Context) (err error) {
	defer func() {
		if err != nil {
			// if starting() fails for any reason (e.g., context canceled),
			// the lifecycler must be stopped.
			_ = services.StopAndAwaitTerminated(context.Background(), i.lifecycler)
		}
	}()

	// First of all we have to check whether the shutdown marker is set. This needs to be
	// done first because, if found, it may change how the ingester starts up.
	if exists, err := shutdownmarker.Exists(shutdownmarker.GetPath(i.cfg.ShutdownMarkerPath)); err != nil {
		return fmt.Errorf("failed to check ingester shutdown marker: %w", err)
	} else if exists {
		level.Info(i.logger).Log("msg", "detected existing shutdown marker, setting unregister and flush on shutdown", "path", shutdownmarker.GetPath(i.cfg.ShutdownMarkerPath))
		i.setPrepareShutdown()
	}

	// pass a new context to the lifecycler, so that it doesn't stop automatically when the Ingester's service context is done
	err = i.lifecycler.StartAsync(context.Background())
	if err != nil {
		return err
	}

	err = i.lifecycler.AwaitRunning(ctx)
	if err != nil {
		return err
	}

	err = i.partitionRingLifecycler.StartAsync(context.Background())
	if err != nil {
		return err
	}

	err = i.partitionRingLifecycler.AwaitRunning(ctx)
	if err != nil {
		return err
	}

	err = i.partitionReader.StartAsync(context.Background())
	if err != nil {
		return err
	}

	err = i.partitionReader.AwaitRunning(ctx)
	if err != nil {
		return err
	}

	return nil
}

func (i *Ingester) running(ctx context.Context) error {
	var serviceError error
	select {
	// wait until service is asked to stop
	case <-ctx.Done():
	// stop
	case err := <-i.lifecyclerWatcher.Chan():
		serviceError = fmt.Errorf("lifecycler failed: %w", err)
	}

	return serviceError
}

// stopping is called when the Ingester transitions to the Stopping state.
//
// At this point the loop no longer runs, but flushers are still running.
func (i *Ingester) stopping(_ error) error {
	var errs util.MultiError
	errs.Add(services.StopAndAwaitTerminated(context.Background(), i.partitionReader))
	errs.Add(services.StopAndAwaitTerminated(context.Background(), i.lifecycler))
	errs.Add(services.StopAndAwaitTerminated(context.Background(), i.partitionRingLifecycler))

	// Remove the shutdown marker if it exists since we are shutting down
	shutdownMarkerPath := shutdownmarker.GetPath(i.cfg.ShutdownMarkerPath)
	exist, err := shutdownmarker.Exists(shutdownMarkerPath)
	if err != nil {
		level.Warn(i.logger).Log("msg", "failed to check for prepare-shutdown marker file", "path", shutdownMarkerPath, "err", err)
	} else if exist {
		if err := shutdownmarker.Remove(shutdownMarkerPath); err != nil {
			level.Warn(i.logger).Log("msg", "failed to remove shutdown marker", "path", shutdownMarkerPath, "err", err)
		}
	}

	return errs.Err()
}

// Watch implements grpc_health_v1.HealthCheck.
func (*Ingester) Watch(*grpc_health_v1.HealthCheckRequest, grpc_health_v1.Health_WatchServer) error {
	return nil
}

// PreparePartitionDownscaleHandler prepares the ingester's partition for downscaling:
// on POST it switches the partition to INACTIVE and creates the shutdown marker, and
// on DELETE it reverts the partition to ACTIVE and removes the marker.
func (i *Ingester) PreparePartitionDownscaleHandler(w http.ResponseWriter, r *http.Request) {
	logger := log.With(i.logger, "partition", i.ingesterPartitionID)

	// Don't allow callers to change the shutdown configuration while we're in the middle
	// of starting or shutting down.
	if i.State() != services.Running {
		w.WriteHeader(http.StatusServiceUnavailable)
		return
	}

	shutdownMarkerPath := shutdownmarker.GetPath(i.cfg.ShutdownMarkerPath)
	exists, err := shutdownmarker.Exists(shutdownMarkerPath)
	if err != nil {
		level.Error(i.logger).Log("msg", "unable to check for prepare-shutdown marker file", "path", shutdownMarkerPath, "err", err)
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	switch r.Method {
	case http.MethodPost:
		// It's not allowed to prepare the downscale while in PENDING state. Why? Because if the downscale
		// is later cancelled, we don't know whether it was requested in PENDING or ACTIVE state, so we
		// don't know which state to revert back to. Given a partition is expected to stay in PENDING state
		// for a short period, we simply don't allow this case.
		state, _, err := i.partitionRingLifecycler.GetPartitionState(r.Context())
		if err != nil {
			level.Error(logger).Log("msg", "failed to check partition state in the ring", "err", err)
			w.WriteHeader(http.StatusInternalServerError)
			return
		}

		if state == ring.PartitionPending {
			level.Warn(logger).Log("msg", "received a request to prepare partition for shutdown, but the request can't be satisfied because the partition is in PENDING state")
			w.WriteHeader(http.StatusConflict)
			return
		}

		if err := i.partitionRingLifecycler.ChangePartitionState(r.Context(), ring.PartitionInactive); err != nil {
			level.Error(logger).Log("msg", "failed to change partition state to inactive", "err", err)
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		if !exists {
			if err := shutdownmarker.Create(shutdownMarkerPath); err != nil {
				level.Error(i.logger).Log("msg", "unable to create prepare-shutdown marker file", "path", shutdownMarkerPath, "err", err)
				w.WriteHeader(http.StatusInternalServerError)
				return
			}
		}

		i.setPrepareShutdown()

	case http.MethodDelete:
		state, _, err := i.partitionRingLifecycler.GetPartitionState(r.Context())
		if err != nil {
			level.Error(logger).Log("msg", "failed to check partition state in the ring", "err", err)
			w.WriteHeader(http.StatusInternalServerError)
			return
		}

		// If the partition is INACTIVE, make it ACTIVE again. We ignore the other states (ACTIVE and, in particular, PENDING).
		if state == ring.PartitionInactive {
			// We don't switch it back to PENDING state if there are not enough owners because we want to guarantee consistency
			// in the read path. If the partition is within the lookback period we need to guarantee that the partition will be queried.
			// Moving back to PENDING would cause us to lose consistency, because PENDING partitions are not queried by design.
			// We could move back to PENDING if there are not enough owners and the partition moved to INACTIVE more than
			// "lookback period" ago, but since we delete inactive partitions with no owners that have been INACTIVE for longer
			// than the "lookback period", this looks like an edge case not worth addressing.
			if err := i.partitionRingLifecycler.ChangePartitionState(r.Context(), ring.PartitionActive); err != nil {
				level.Error(logger).Log("msg", "failed to change partition state to active", "err", err)
				http.Error(w, err.Error(), http.StatusInternalServerError)
				return
			}

			if exists {
				if err := shutdownmarker.Remove(shutdownMarkerPath); err != nil {
					level.Error(i.logger).Log("msg", "unable to remove prepare-shutdown marker file", "path", shutdownMarkerPath, "err", err)
					w.WriteHeader(http.StatusInternalServerError)
					return
				}
			}

			i.unsetPrepareShutdown()
		}
	}

	state, stateTimestamp, err := i.partitionRingLifecycler.GetPartitionState(r.Context())
	if err != nil {
		level.Error(logger).Log("msg", "failed to check partition state in the ring", "err", err)
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	if state == ring.PartitionInactive {
		util.WriteJSONResponse(w, map[string]any{"timestamp": stateTimestamp.Unix()})
	} else {
		util.WriteJSONResponse(w, map[string]any{"timestamp": 0})
	}
}

// setPrepareShutdown toggles the ingester lifecycler config to prepare for shutdown
func (i *Ingester) setPrepareShutdown() {
	i.lifecycler.SetUnregisterOnShutdown(true)
	i.lifecycler.SetFlushOnShutdown(true)
	i.partitionRingLifecycler.SetCreatePartitionOnStartup(false)
	i.partitionRingLifecycler.SetRemoveOwnerOnShutdown(true)
	i.metrics.shutdownMarker.Set(1)
}

// unsetPrepareShutdown reverts the lifecycler changes made by setPrepareShutdown.
func (i *Ingester) unsetPrepareShutdown() {
	i.lifecycler.SetUnregisterOnShutdown(i.cfg.LifecyclerConfig.UnregisterOnShutdown)
	i.lifecycler.SetFlushOnShutdown(true)
	i.partitionRingLifecycler.SetCreatePartitionOnStartup(true)
	i.partitionRingLifecycler.SetRemoveOwnerOnShutdown(false)
	i.metrics.shutdownMarker.Set(0)
}

// CheckReady is used to indicate to k8s when the ingesters are ready for
// the addition or removal of another ingester. Returns 204 when the ingester is
// ready, 500 otherwise.
func (i *Ingester) CheckReady(ctx context.Context) error {
	// todo.
	if s := i.State(); s != services.Running && s != services.Stopping {
		return fmt.Errorf("ingester not ready: %v", s)
	}
	return i.lifecycler.CheckReady(ctx)
}

// Flush implements ring.FlushTransferer.
// Flush triggers a flush of all the chunks and closes the flush queues.
// Called from the Lifecycler as part of the ingester shutdown.
func (i *Ingester) Flush() {
}

func (i *Ingester) TransferOut(_ context.Context) error {
	return nil
}
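
// exampleRunIngester is an illustrative sketch, not part of the upstream wiring:
// it shows, under the assumption that the caller owns the service lifecycle, how
// an Ingester built with New is expected to be started and stopped through the
// dskit services package. The "loki" metrics namespace is just an example value.
func exampleRunIngester(ctx context.Context, cfg Config, factory ConsumerFactory, logger log.Logger, reg prometheus.Registerer) error {
	ing, err := New(cfg, factory, logger, "loki", reg)
	if err != nil {
		return err
	}
	// Start the service and wait until it reaches the Running state
	// (starting() registers in the rings and starts the partition reader).
	if err := services.StartAndAwaitRunning(ctx, ing); err != nil {
		return err
	}
	// ... the ingester now consumes its Kafka partition until shutdown is requested ...
	// Stop the service and wait for stopping() to complete.
	return services.StopAndAwaitTerminated(context.Background(), ing)
}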