Do not transfer off chunks on ingester shutdown (#10709)

This PR removes the ability of an ingester in the LEAVING state to transfer its chunks to another ingester in the ring that is in the PENDING state before shutting down.

The **Write Ahead Log** (WAL) supersedes the chunk transfer feature.

See the upgrade notes for information on how to replace the removed setting.

Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
Co-authored-by: Salva Corts <salva.corts@grafana.com>
Branch: pull/10780/head
Author: Christian Haudum (committed by GitHub)
Parent: da175771c0
Commit: 54e46446fa
Changed files:

 1. CHANGELOG.md (1 line changed)
 2. docs/sources/configure/_index.md (5 lines changed)
 3. docs/sources/setup/upgrade/_index.md (9 lines changed)
 4. pkg/ingester/checkpoint_test.go (1 line changed)
 5. pkg/ingester/client/client.go (3 lines changed)
 6. pkg/ingester/flush.go (9 lines changed)
 7. pkg/ingester/ingester.go (22 lines changed)
 8. pkg/ingester/transfer.go (318 lines changed)
 9. pkg/ingester/transfer_test.go (244 lines changed)
10. pkg/logproto/logproto.pb.go (1027 lines changed)
11. pkg/logproto/logproto.proto (13 lines changed)
12. pkg/loki/loki.go (3 lines changed)
13. pkg/loki/modules.go (1 line changed)

CHANGELOG.md
@@ -20,6 +20,7 @@
* [10534](https://github.com/grafana/loki/pull/10534) **chaudum** Remove configuration `use_boltdb_shipper_as_backup`
* [10620](https://github.com/grafana/loki/pull/10620) **ashwanthgoli** Enable embedded cache if no other cache is explicitly enabled.
* [10655](https://github.com/grafana/loki/pull/10655) **chaudum** Remove legacy ingester shutdown handler `/ingester/flush_shutdown`.
* [10709](https://github.com/grafana/loki/pull/10709) **chaudum**/**salvacorts** Remove `ingester.max-transfer-retries` configuration option in favor of using the WAL.
* [10736](https://github.com/grafana/loki/pull/10736) **ashwanthgoli** Deprecate write dedupe cache as this is not required by the newer single store indexes (tsdb and boltdb-shipper).
* [10693](https://github.com/grafana/loki/pull/10693) **ashwanthgoli** Embedded cache: Updates the metric prefix from `querier_cache_` to `loki_embeddedcache_` and removes duplicate metrics.

docs/sources/configure/_index.md
@@ -1456,11 +1456,6 @@ lifecycler:
# CLI flag: -ingester.lifecycler.ID
[id: <string> | default = "<hostname>"]
# Number of times to try and transfer chunks before falling back to flushing. If
# set to 0 or negative value, transfers are disabled.
# CLI flag: -ingester.max-transfer-retries
[max_transfer_retries: <int> | default = 0]
# How many flushes can happen concurrently from each stream.
# CLI flag: -ingester.concurrent-flushes
[concurrent_flushes: <int> | default = 32]

docs/sources/setup/upgrade/_index.md
@@ -60,6 +60,15 @@ The previous default value `false` is applied.
The already deprecated handler `/ingester/flush_shutdown` is removed in favor of `/ingester/shutdown?flush=true`.
#### Ingester configuration `max_transfer_retries` is removed.
The setting `max_transfer_retries` (`-ingester.max-transfer-retries`) is removed in favor of the Write Ahead Log (WAL).
It was used to allow transferring chunks to new ingesters when the old ingester was shutting down during a rolling restart.
Alternatives to this setting are (a configuration sketch follows this list):
- **A. (Preferred)** Enable the WAL and rely on the new ingester to replay the WAL.
- Optionally, you can enable `flush_on_shutdown` (`-ingester.flush-on-shutdown`) to flush to long-term storage on shutdowns.
- **B.** Manually flush during shutdowns via [the ingester `/shutdown?flush=true` endpoint]({{< relref "../../reference/api#flush-in-memory-chunks-and-shut-down" >}}).
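A minimal sketch of option A, assuming the standard `ingester.wal` block and the `flush_on_shutdown` setting; the WAL directory path is only an example and must point to persistent storage:

```yaml
ingester:
  wal:
    enabled: true
    dir: /loki/wal          # example path; use a persistent volume
  # Optional: also flush in-memory chunks to long-term storage on shutdown.
  flush_on_shutdown: true   # equivalent to -ingester.flush-on-shutdown
```

With this in place, a restarted or replacement ingester replays the WAL instead of relying on chunk transfers.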
#### Distributor metric changes
The `loki_distributor_ingester_append_failures_total` metric has been removed in favour of `loki_distributor_ingester_append_timeouts_total`.

pkg/ingester/checkpoint_test.go
@@ -47,7 +47,6 @@ func ensureIngesterData(ctx context.Context, t *testing.T, start, end time.Time,
func defaultIngesterTestConfigWithWAL(t *testing.T, walDir string) Config {
ingesterConfig := defaultIngesterTestConfig(t)
ingesterConfig.MaxTransferRetries = 0
ingesterConfig.WAL.Enabled = true
ingesterConfig.WAL.Dir = walDir
ingesterConfig.WAL.CheckpointDuration = time.Second

pkg/ingester/client/client.go
@@ -25,7 +25,6 @@ var ingesterClientRequestDuration = promauto.NewHistogramVec(prometheus.Histogra
}, []string{"operation", "status_code"})
type HealthAndIngesterClient interface {
logproto.IngesterClient
grpc_health_v1.HealthClient
Close() error
}
@@ -33,7 +32,6 @@ type HealthAndIngesterClient interface {
type ClosableHealthAndIngesterClient struct {
logproto.PusherClient
logproto.QuerierClient
logproto.IngesterClient
logproto.StreamDataClient
grpc_health_v1.HealthClient
io.Closer
@@ -81,7 +79,6 @@ func New(cfg Config, addr string) (HealthAndIngesterClient, error) {
return ClosableHealthAndIngesterClient{
PusherClient: logproto.NewPusherClient(conn),
QuerierClient: logproto.NewQuerierClient(conn),
IngesterClient: logproto.NewIngesterClient(conn),
StreamDataClient: logproto.NewStreamDataClient(conn),
HealthClient: grpc_health_v1.NewHealthClient(conn),
Closer: conn,

pkg/ingester/flush.go
@@ -8,6 +8,7 @@ import (
"time"
"github.com/go-kit/log/level"
"github.com/grafana/dskit/ring"
"github.com/grafana/dskit/user"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
@@ -47,12 +48,20 @@ func (i *Ingester) InitFlushQueues() {
}
}
// Flush implements ring.FlushTransferer
// Flush triggers a flush of all the chunks and closes the flush queues.
// Called from the Lifecycler as part of the ingester shutdown.
func (i *Ingester) Flush() {
i.flush(true)
}
// TransferOut implements ring.FlushTransferer
// Noop implementation because ingesters have a WAL now that does not require transferring chunks any more.
// We return ErrTransferDisabled to indicate that we don't support transfers, and therefore we may flush on shutdown if configured to do so.
func (i *Ingester) TransferOut(_ context.Context) error {
return ring.ErrTransferDisabled
}
func (i *Ingester) flush(mayRemoveStreams bool) {
i.sweepUsers(true, mayRemoveStreams)

pkg/ingester/ingester.go
@@ -76,9 +76,6 @@ var (
type Config struct {
LifecyclerConfig ring.LifecyclerConfig `yaml:"lifecycler,omitempty" doc:"description=Configures how the lifecycle of the ingester will operate and where it will register for discovery."`
// Config for transferring chunks.
MaxTransferRetries int `yaml:"max_transfer_retries,omitempty"`
ConcurrentFlushes int `yaml:"concurrent_flushes"`
FlushCheckPeriod time.Duration `yaml:"flush_check_period"`
FlushOpTimeout time.Duration `yaml:"flush_op_timeout"`
@@ -121,7 +118,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
cfg.LifecyclerConfig.RegisterFlags(f, util_log.Logger)
cfg.WAL.RegisterFlags(f)
f.IntVar(&cfg.MaxTransferRetries, "ingester.max-transfer-retries", 0, "Number of times to try and transfer chunks before falling back to flushing. If set to 0 or negative value, transfers are disabled.")
f.IntVar(&cfg.ConcurrentFlushes, "ingester.concurrent-flushes", 32, "How many flushes can happen concurrently from each stream.")
f.DurationVar(&cfg.FlushCheckPeriod, "ingester.flush-check-period", 30*time.Second, "How often should the ingester see if there are any blocks to flush. The first flush check is delayed by a random time up to 0.8x the flush check period. Additionally, there is +/- 1% jitter added to the interval.")
f.DurationVar(&cfg.FlushOpTimeout, "ingester.flush-op-timeout", 10*time.Minute, "The timeout before a flush is cancelled.")
@@ -152,10 +148,6 @@ func (cfg *Config) Validate() error {
return err
}
if cfg.MaxTransferRetries > 0 && cfg.WAL.Enabled {
return errors.New("the use of the write ahead log (WAL) is incompatible with chunk transfers. It's suggested to use the WAL. Please try setting ingester.max-transfer-retries to 0 to disable transfers")
}
if cfg.IndexShards <= 0 {
return fmt.Errorf("invalid ingester index shard factor: %d", cfg.IndexShards)
}
@@ -180,7 +172,6 @@ type Store interface {
type Interface interface {
services.Service
logproto.IngesterServer
logproto.PusherServer
logproto.QuerierServer
logproto.StreamDataServer
@@ -538,8 +529,6 @@ func (i *Ingester) stopping(_ error) error {
}
errs.Add(services.StopAndAwaitTerminated(context.Background(), i.lifecycler))
// Normally, flushers are stopped via lifecycler (in transferOut), but if lifecycler fails,
// we better stop them.
for _, flushQueue := range i.flushQueues {
flushQueue.Close()
}
@@ -557,6 +546,17 @@ func (i *Ingester) stopping(_ error) error {
return errs.Err()
}
// stopIncomingRequests is called when ingester is stopping
func (i *Ingester) stopIncomingRequests() {
i.shutdownMtx.Lock()
defer i.shutdownMtx.Unlock()
i.instancesMtx.Lock()
defer i.instancesMtx.Unlock()
i.readonly = true
}
// removeShutdownMarkerFile removes the shutdown marker if it exists. Any errors are logged.
func (i *Ingester) removeShutdownMarkerFile() {
shutdownMarkerPath := path.Join(i.cfg.ShutdownMarkerPath, shutdownMarkerFilename)

pkg/ingester/transfer.go (deleted)
@@ -1,318 +0,0 @@
package ingester
import (
"fmt"
"io"
"os"
"time"
"github.com/go-kit/log/level"
"github.com/grafana/dskit/backoff"
"github.com/grafana/dskit/ring"
"github.com/grafana/dskit/user"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/prometheus/model/labels"
"golang.org/x/net/context"
"github.com/grafana/loki/pkg/logproto"
lokiutil "github.com/grafana/loki/pkg/util"
util_log "github.com/grafana/loki/pkg/util/log"
)
var (
sentChunks = promauto.NewCounter(prometheus.CounterOpts{
Namespace: "loki",
Name: "ingester_sent_chunks",
Help: "The total number of chunks sent by this ingester whilst leaving.",
})
receivedChunks = promauto.NewCounter(prometheus.CounterOpts{
Namespace: "loki",
Name: "ingester_received_chunks",
Help: "The total number of chunks received by this ingester whilst joining.",
})
)
// TransferChunks receives all chunks from another ingester. The Ingester
// must be in PENDING state or else the call will fail.
func (i *Ingester) TransferChunks(stream logproto.Ingester_TransferChunksServer) error {
logger := util_log.WithContext(stream.Context(), util_log.Logger)
// Prevent a shutdown from happening until we've completely finished a handoff
// from a leaving ingester.
i.shutdownMtx.Lock()
defer i.shutdownMtx.Unlock()
// Entry JOINING state (only valid from PENDING)
if err := i.lifecycler.ChangeState(stream.Context(), ring.JOINING); err != nil {
return err
}
// The ingesters state effectively works as a giant mutex around this
// whole method, and as such we have to ensure we unlock the mutex.
defer func() {
state := i.lifecycler.GetState()
if i.lifecycler.GetState() == ring.ACTIVE {
return
}
level.Error(logger).Log("msg", "TransferChunks failed, not in ACTIVE state.", "state", state)
// Enter PENDING state (only valid from JOINING)
if i.lifecycler.GetState() == ring.JOINING {
// Create a new context here to attempt to update the state back to pending to allow
// a failed transfer to try again. If we fail to set the state back to PENDING then
// exit Loki as we will effectively be hung anyway stuck in a JOINING state and will
// never join.
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
if err := i.lifecycler.ChangeState(ctx, ring.PENDING); err != nil {
level.Error(logger).Log("msg", "failed to update the ring state back to PENDING after "+
"a chunk transfer failure, there is nothing more Loki can do from this state "+
"so the process will exit...", "err", err)
os.Exit(1)
}
cancel()
}
}()
fromIngesterID := ""
seriesReceived := 0
for {
chunkSet, err := stream.Recv()
if err == io.EOF {
break
} else if err != nil {
return err
}
// We can't send "extra" fields with a streaming call, so we repeat
// chunkSet.FromIngesterId and assume it is the same every time around
// this loop.
if fromIngesterID == "" {
fromIngesterID = chunkSet.FromIngesterId
level.Info(logger).Log("msg", "processing TransferChunks request", "from_ingester", fromIngesterID)
// Before transfer, make sure 'from' ingester is in correct state to call ClaimTokensFor later
err := i.checkFromIngesterIsInLeavingState(stream.Context(), fromIngesterID)
if err != nil {
return errors.Wrap(err, "TransferChunks: checkFromIngesterIsInLeavingState")
}
}
userCtx := user.InjectOrgID(stream.Context(), chunkSet.UserId)
lbls := make([]labels.Label, 0, len(chunkSet.Labels))
for _, lbl := range chunkSet.Labels {
lbls = append(lbls, labels.Label{Name: lbl.Name, Value: lbl.Value})
}
instance, err := i.GetOrCreateInstance(chunkSet.UserId)
if err != nil {
return err
}
for _, chunk := range chunkSet.Chunks {
if err := instance.consumeChunk(userCtx, lbls, chunk); err != nil {
return err
}
}
seriesReceived++
receivedChunks.Add(float64(len(chunkSet.Chunks)))
}
if seriesReceived == 0 {
level.Error(logger).Log("msg", "received TransferChunks request with no series", "from_ingester", fromIngesterID)
return fmt.Errorf("no series")
} else if fromIngesterID == "" {
level.Error(logger).Log("msg", "received TransferChunks request with no ID from ingester")
return fmt.Errorf("no ingester id")
}
if err := i.lifecycler.ClaimTokensFor(stream.Context(), fromIngesterID); err != nil {
return err
}
if err := i.lifecycler.ChangeState(stream.Context(), ring.ACTIVE); err != nil {
return err
}
// Close the stream last, as this is what tells the "from" ingester that
// it's OK to shut down.
if err := stream.SendAndClose(&logproto.TransferChunksResponse{}); err != nil {
level.Error(logger).Log("msg", "Error closing TransferChunks stream", "from_ingester", fromIngesterID, "err", err)
return err
}
level.Info(logger).Log("msg", "Successfully transferred chunks", "from_ingester", fromIngesterID, "series_received", seriesReceived)
return nil
}
// Ring gossiping: check if "from" ingester is in LEAVING state. It should be, but we may not see that yet
// when using gossip ring. If we cannot see ingester is the LEAVING state yet, we don't accept this
// transfer, as claiming tokens would possibly end up with this ingester owning no tokens, due to conflict
// resolution in ring merge function. Hopefully the leaving ingester will retry transfer again.
func (i *Ingester) checkFromIngesterIsInLeavingState(ctx context.Context, fromIngesterID string) error {
v, err := i.lifecycler.KVStore.Get(ctx, RingKey)
if err != nil {
return errors.Wrap(err, "get ring")
}
if v == nil {
return fmt.Errorf("ring not found when checking state of source ingester")
}
r, ok := v.(*ring.Desc)
if !ok || r == nil {
return fmt.Errorf("ring not found, got %T", v)
}
if r.Ingesters == nil || r.Ingesters[fromIngesterID].State != ring.LEAVING {
return fmt.Errorf("source ingester is not in a LEAVING state, found state=%v", r.Ingesters[fromIngesterID].State)
}
// all fine
return nil
}
// stopIncomingRequests is called when ingester is stopping
func (i *Ingester) stopIncomingRequests() {
i.shutdownMtx.Lock()
defer i.shutdownMtx.Unlock()
i.instancesMtx.Lock()
defer i.instancesMtx.Unlock()
i.readonly = true
}
// TransferOut implements ring.Lifecycler.
func (i *Ingester) TransferOut(ctx context.Context) error {
if i.cfg.MaxTransferRetries <= 0 {
return ring.ErrTransferDisabled
}
backoff := backoff.New(ctx, backoff.Config{
MinBackoff: 100 * time.Millisecond,
MaxBackoff: 5 * time.Second,
MaxRetries: i.cfg.MaxTransferRetries,
})
for backoff.Ongoing() {
err := i.transferOut(ctx)
if err == nil {
return nil
}
level.Error(util_log.WithContext(ctx, util_log.Logger)).Log("msg", "transfer failed", "err", err)
backoff.Wait()
}
return backoff.Err()
}
func (i *Ingester) transferOut(ctx context.Context) error {
logger := util_log.WithContext(ctx, util_log.Logger)
targetIngester, err := i.findTransferTarget(ctx)
if err != nil {
return fmt.Errorf("cannot find ingester to transfer chunks to: %v", err)
}
level.Info(logger).Log("msg", "sending chunks", "to_ingester", targetIngester.Addr)
c, err := i.cfg.ingesterClientFactory(i.clientConfig, targetIngester.Addr)
if err != nil {
return err
}
if c, ok := c.(io.Closer); ok {
defer lokiutil.LogErrorWithContext(ctx, "closing client", c.Close)
}
ic := c.(logproto.IngesterClient)
ctx = user.InjectOrgID(ctx, "-1")
s, err := ic.TransferChunks(ctx)
if err != nil {
return errors.Wrap(err, "TransferChunks")
}
for instanceID, inst := range i.instances {
err := inst.streams.ForEach(func(istream *stream) (bool, error) {
err = func() error {
istream.chunkMtx.Lock()
defer istream.chunkMtx.Unlock()
lbls := []*logproto.LabelPair{}
for _, lbl := range istream.labels {
lbls = append(lbls, &logproto.LabelPair{Name: lbl.Name, Value: lbl.Value})
}
// We moved to sending one chunk at a time in a stream instead of sending all chunks for a stream
// as large chunks can create large payloads of >16MB which can hit GRPC limits,
// typically streams won't have many chunks in memory so sending one at a time
// shouldn't add too much overhead.
for _, c := range istream.chunks {
// Close the chunk first, writing any data in the headblock to a new block.
err := c.chunk.Close()
if err != nil {
return err
}
bb, err := c.chunk.Bytes()
if err != nil {
return err
}
chunks := make([]*logproto.Chunk, 1)
chunks[0] = &logproto.Chunk{
Data: bb,
}
err = s.Send(&logproto.TimeSeriesChunk{
Chunks: chunks,
UserId: instanceID,
Labels: lbls,
FromIngesterId: i.lifecycler.ID,
})
if err != nil {
level.Error(logger).Log("msg", "failed sending stream's chunks to ingester", "to_ingester", targetIngester.Addr, "err", err)
return err
}
sentChunks.Add(float64(len(chunks)))
}
return nil
}()
if err != nil {
return false, err
}
return true, nil
})
if err != nil {
return err
}
}
_, err = s.CloseAndRecv()
if err != nil {
return errors.Wrap(err, "CloseAndRecv")
}
for _, flushQueue := range i.flushQueues {
flushQueue.DiscardAndClose()
}
i.flushQueuesDone.Wait()
level.Info(logger).Log("msg", "successfully sent chunks", "to_ingester", targetIngester.Addr)
return nil
}
// findTransferTarget finds an ingester in a PENDING state to use for transferring
// chunks to.
func (i *Ingester) findTransferTarget(ctx context.Context) (*ring.InstanceDesc, error) {
ringDesc, err := i.lifecycler.KVStore.Get(ctx, RingKey)
if err != nil {
return nil, err
}
ingesters := ringDesc.(*ring.Desc).FindIngestersByState(ring.PENDING)
if len(ingesters) == 0 {
return nil, fmt.Errorf("no pending ingesters")
}
return &ingesters[0], nil
}

pkg/ingester/transfer_test.go (deleted)
@@ -1,244 +0,0 @@
package ingester
import (
"fmt"
"io"
"sort"
"testing"
"time"
gokitlog "github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/grafana/dskit/kv"
"github.com/grafana/dskit/ring"
"github.com/grafana/dskit/services"
"github.com/grafana/dskit/user"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/net/context"
"google.golang.org/grpc"
"github.com/grafana/loki/pkg/chunkenc"
"github.com/grafana/loki/pkg/ingester/client"
"github.com/grafana/loki/pkg/logproto"
"github.com/grafana/loki/pkg/logql/log"
util_log "github.com/grafana/loki/pkg/util/log"
)
func TestTransferOut(t *testing.T) {
f := newTestIngesterFactory(t)
ing := f.getIngester(time.Duration(0), t)
// Push some data into our original ingester
ctx := user.InjectOrgID(context.Background(), "test")
_, err := ing.Push(ctx, &logproto.PushRequest{
Streams: []logproto.Stream{
{
Entries: []logproto.Entry{
{Line: "line 0", Timestamp: time.Unix(0, 0)},
{Line: "line 1", Timestamp: time.Unix(1, 0)},
},
Labels: `{foo="bar",bar="baz1"}`,
},
{
Entries: []logproto.Entry{
{Line: "line 2", Timestamp: time.Unix(2, 0)},
{Line: "line 3", Timestamp: time.Unix(3, 0)},
},
Labels: `{foo="bar",bar="baz2"}`,
},
},
})
require.NoError(t, err)
assert.Len(t, ing.instances, 1)
if assert.Contains(t, ing.instances, "test") {
assert.Equal(t, ing.instances["test"].streams.Len(), 2)
}
// Create a new ingester and transfer data to it
ing2 := f.getIngester(time.Second*60, t)
defer services.StopAndAwaitTerminated(context.Background(), ing2) //nolint:errcheck
require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
assert.Len(t, ing2.instances, 1)
if assert.Contains(t, ing2.instances, "test") {
assert.Equal(t, ing2.instances["test"].streams.Len(), 2)
lines := []string{}
// Get all the lines back and make sure the blocks transferred successfully
_ = ing2.instances["test"].streams.ForEach(func(s *stream) (bool, error) {
it, err := s.Iterator(
context.TODO(),
nil,
time.Unix(0, 0),
time.Unix(10, 0),
logproto.FORWARD,
log.NewNoopPipeline().ForStream(s.labels),
)
if !assert.NoError(t, err) {
return true, nil
}
for it.Next() {
entry := it.Entry()
lines = append(lines, entry.Line)
}
return true, nil
})
sort.Strings(lines)
assert.Equal(
t,
[]string{"line 0", "line 1", "line 2", "line 3"},
lines,
)
}
}
type testIngesterFactory struct {
t *testing.T
store kv.Client
n int
ingesters map[string]*Ingester
}
func newTestIngesterFactory(t *testing.T) *testIngesterFactory {
kvClient, err := kv.NewClient(kv.Config{Store: "inmemory"}, ring.GetCodec(), nil, gokitlog.NewNopLogger())
require.NoError(t, err)
return &testIngesterFactory{
t: t,
store: kvClient,
ingesters: make(map[string]*Ingester),
}
}
func (f *testIngesterFactory) getIngester(joinAfter time.Duration, t *testing.T) *Ingester {
f.n++
cfg := defaultIngesterTestConfig(t)
cfg.MaxTransferRetries = 1
cfg.LifecyclerConfig.ID = fmt.Sprintf("localhost-%d", f.n)
cfg.LifecyclerConfig.RingConfig.KVStore.Mock = f.store
cfg.LifecyclerConfig.JoinAfter = joinAfter
cfg.LifecyclerConfig.Addr = cfg.LifecyclerConfig.ID
// Force a tiny chunk size and no encoding so we can guarantee multiple chunks
// These values are also crafted around the specific use of `line _` in the log line which is 6 bytes long
cfg.BlockSize = 3 // Block size needs to be less than chunk size so we can get more than one block per chunk
cfg.TargetChunkSize = 24
cfg.ChunkEncoding = chunkenc.EncNone.String()
cfg.ingesterClientFactory = func(_ client.Config, addr string) (client.HealthAndIngesterClient, error) {
ingester, ok := f.ingesters[addr]
if !ok {
return nil, fmt.Errorf("no ingester %s", addr)
}
return client.ClosableHealthAndIngesterClient{
PusherClient: nil,
QuerierClient: nil,
IngesterClient: &testIngesterClient{t: f.t, i: ingester},
Closer: io.NopCloser(nil),
}, nil
}
_, ing := newTestStore(f.t, cfg, nil)
f.ingesters[fmt.Sprintf("%s:0", cfg.LifecyclerConfig.ID)] = ing
// NB there's some kind of race condition with the in-memory KV client when
// we don't give the ingester a little bit of time to initialize. a 100ms
// wait time seems effective.
time.Sleep(time.Millisecond * 100)
return ing
}
type testIngesterClient struct {
t *testing.T
i *Ingester
}
func (c *testIngesterClient) TransferChunks(context.Context, ...grpc.CallOption) (logproto.Ingester_TransferChunksClient, error) {
chunkCh := make(chan *logproto.TimeSeriesChunk)
respCh := make(chan *logproto.TransferChunksResponse)
waitCh := make(chan bool)
client := &testTransferChunksClient{ch: chunkCh, resp: respCh, wait: waitCh}
go func() {
server := &testTransferChunksServer{ch: chunkCh, resp: respCh}
err := c.i.TransferChunks(server)
require.NoError(c.t, err)
}()
// After 50ms, we try killing the target ingester's lifecycler to verify
// that it obtained a lock on the shutdown process. This operation should
// block until the transfer completes.
//
// Then after another 50ms, we also allow data to start sending. This tests an issue
// where an ingester is shut down before it completes the handoff and ends up in an
// unhealthy state, permanently stuck in the handler for claiming tokens.
go func() {
time.Sleep(time.Millisecond * 50)
c.i.stopIncomingRequests() // used to be called from lifecycler, now it must be called *before* stopping lifecyler. (ingester does this on shutdown)
err := services.StopAndAwaitTerminated(context.Background(), c.i.lifecycler)
if err != nil {
level.Error(util_log.Logger).Log("msg", "lifecycler failed", "err", err)
}
}()
go func() {
time.Sleep(time.Millisecond * 100)
close(waitCh)
}()
return client, nil
}
type testTransferChunksClient struct {
wait chan bool
ch chan *logproto.TimeSeriesChunk
resp chan *logproto.TransferChunksResponse
grpc.ClientStream
}
func (c *testTransferChunksClient) Send(chunk *logproto.TimeSeriesChunk) error {
<-c.wait
c.ch <- chunk
return nil
}
func (c *testTransferChunksClient) CloseAndRecv() (*logproto.TransferChunksResponse, error) {
<-c.wait
close(c.ch)
resp := <-c.resp
close(c.resp)
return resp, nil
}
type testTransferChunksServer struct {
ch chan *logproto.TimeSeriesChunk
resp chan *logproto.TransferChunksResponse
grpc.ServerStream
}
func (s *testTransferChunksServer) Context() context.Context {
return context.Background()
}
func (s *testTransferChunksServer) SendAndClose(resp *logproto.TransferChunksResponse) error {
s.resp <- resp
return nil
}
func (s *testTransferChunksServer) Recv() (*logproto.TimeSeriesChunk, error) {
chunk, ok := <-s.ch
if !ok {
return nil, io.EOF
}
return chunk, nil
}

pkg/logproto/logproto.pb.go: file diff suppressed because it is too large.

pkg/logproto/logproto.proto
@@ -32,10 +32,6 @@ service Querier {
rpc GetVolume(VolumeRequest) returns (VolumeResponse) {}
}
service Ingester {
rpc TransferChunks(stream TimeSeriesChunk) returns (TransferChunksResponse) {}
}
service StreamData {
rpc GetStreamRates(StreamRatesRequest) returns (StreamRatesResponse) {}
}
@@ -200,13 +196,6 @@ message DroppedStream {
string labels = 3;
}
message TimeSeriesChunk {
string from_ingester_id = 1;
string user_id = 2;
repeated LabelPair labels = 3;
repeated Chunk chunks = 4;
}
message LabelPair {
string name = 1;
string value = 2;
@@ -223,8 +212,6 @@ message Chunk {
bytes data = 1;
}
message TransferChunksResponse {}
message TailersCountRequest {}
message TailersCountResponse {

pkg/loki/loki.go
@@ -335,13 +335,10 @@ func New(cfg Config) (*Loki, error) {
}
func (t *Loki) setupAuthMiddleware() {
// Don't check auth header on TransferChunks, as we weren't originally
// sending it and this could cause transfers to fail on update.
t.HTTPAuthMiddleware = fakeauth.SetupAuthMiddleware(&t.Cfg.Server, t.Cfg.AuthEnabled,
// Also don't check auth for these gRPC methods, since single call is used for multiple users (or no user like health check).
[]string{
"/grpc.health.v1.Health/Check",
"/logproto.Ingester/TransferChunks",
"/logproto.StreamData/GetStreamRates",
"/frontend.Frontend/Process",
"/frontend.Frontend/NotifyClientShutdown",

pkg/loki/modules.go
@@ -497,7 +497,6 @@ func (t *Loki) initIngester() (_ services.Service, err error) {
logproto.RegisterPusherServer(t.Server.GRPC, t.Ingester)
logproto.RegisterQuerierServer(t.Server.GRPC, t.Ingester)
logproto.RegisterIngesterServer(t.Server.GRPC, t.Ingester)
logproto.RegisterStreamDataServer(t.Server.GRPC, t.Ingester)
httpMiddleware := middleware.Merge(
