Log stream flushes to see how many chunks we flush at a time (#6819)

* Log stream flushes to see how many chunks we flush at a time

Signed-off-by: Danny Kopping <danny.kopping@grafana.com>

* Smuggling in an error wrapping which aids debugging

Signed-off-by: Danny Kopping <danny.kopping@grafana.com>

* Update docs/sources/upgrading/_index.md

Co-authored-by: Ed Welch <ed@oqqer.com>

* Removing unnecessary label

Signed-off-by: Danny Kopping <danny.kopping@grafana.com>

Co-authored-by: Ed Welch <ed@oqqer.com>
pull/6827/head
Danny Kopping 3 years ago committed by GitHub
parent 1c6a2410d6
commit 66fdb3c572
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 2
      docs/sources/upgrading/_index.md
  2. 9
      pkg/ingester/flush.go
  3. 2
      pkg/storage/chunk/client/local/boltdb_index_client.go

@@ -133,7 +133,7 @@ Meanwhile, the legacy format is a string in the following format:
* `query_ingesters_within` under the `querier` config now defaults to `3h`, previously it was `0s`. Any query (or subquery) that has an end time more than `3h` ago will not be sent to the ingesters, this saves work on the ingesters for data they normally don't contain. If you regularly write old data to Loki you may need to return this value to `0s` to always query ingesters.
* `max_concurrent` under the `querier` config now defaults to `10` instead of `20`.
* `match_max_concurrent` under the `frontend_worker` config now defaults to true, this supersedes the `parallelism` setting which can now be removed from your config. Controlling query parallelism of a single process can now be done with the `querier` `max_concurrent` setting.
* `flush_op_timeout` under the `ingester` configuration block now defaults to `10m`, increased from `10s`. This can help when replaying a large WAL on Loki startup, and avoid `msg="failed to flush user" ... context deadline exceeded` errors.
* `flush_op_timeout` under the `ingester` configuration block now defaults to `10m`, increased from `10s`. This can help when replaying a large WAL on Loki startup, and avoid `msg="failed to flush" ... context deadline exceeded` errors.
### Promtail

@@ -139,11 +139,9 @@ func (i *Ingester) flushLoop(j int) {
}
op := o.(*flushOp)
level.Debug(util_log.Logger).Log("msg", "flushing stream", "userid", op.userID, "fp", op.fp, "immediate", op.immediate)
err := i.flushUserSeries(op.userID, op.fp, op.immediate)
if err != nil {
level.Error(util_log.WithUserID(op.userID, util_log.Logger)).Log("msg", "failed to flush user", "err", err)
level.Error(util_log.WithUserID(op.userID, util_log.Logger)).Log("msg", "failed to flush", "err", err)
}
// If we're exiting & we failed to flush, put the failed operation
@@ -166,12 +164,15 @@ func (i *Ingester) flushUserSeries(userID string, fp model.Fingerprint, immediat
return nil
}
lbs := labels.String()
level.Info(util_log.Logger).Log("msg", "flushing stream", "user", userID, "fp", fp, "immediate", immediate, "num_chunks", len(chunks), "labels", lbs)
ctx := user.InjectOrgID(context.Background(), userID)
ctx, cancel := context.WithTimeout(ctx, i.cfg.FlushOpTimeout)
defer cancel()
err := i.flushChunks(ctx, fp, labels, chunks, chunkMtx)
if err != nil {
return err
return fmt.Errorf("failed to flush chunks: %w, num_chunks: %d, labels: %s", err, len(chunks), lbs)
}
return nil

@@ -168,7 +168,7 @@ func (b *BoltIndexClient) GetDB(name string, operation int) (*bbolt.DB, error) {
// Set Timeout to avoid obtaining file lock wait indefinitely.
db, err := bbolt.Open(path.Join(b.cfg.Directory, name), 0o666, &bbolt.Options{Timeout: openBoltDBFileTimeout})
if err != nil {
return nil, err
return nil, fmt.Errorf("failed to open boltdb index file: %w", err)
}
b.dbs[name] = db

Loading…
Cancel
Save