Log stream flushes to see how many chunks we flush at a time (#6819)

* Log stream flushes to see how many chunks we flush at a time

Signed-off-by: Danny Kopping <danny.kopping@grafana.com>

* Smuggling in an error wrapping which aids debugging

Signed-off-by: Danny Kopping <danny.kopping@grafana.com>

* Update docs/sources/upgrading/_index.md

Co-authored-by: Ed Welch <ed@oqqer.com>

* Removing unnecessary label

Signed-off-by: Danny Kopping <danny.kopping@grafana.com>

Co-authored-by: Ed Welch <ed@oqqer.com>
pull/6827/head
Danny Kopping 3 years ago committed by GitHub
parent 1c6a2410d6
commit 66fdb3c572
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 2
      docs/sources/upgrading/_index.md
  2. 9
      pkg/ingester/flush.go
  3. 2
      pkg/storage/chunk/client/local/boltdb_index_client.go

@@ -133,7 +133,7 @@ Meanwhile, the legacy format is a string in the following format:
* `query_ingesters_within` under the `querier` config now defaults to `3h`, previously it was `0s`. Any query (or subquery) that has an end time more than `3h` ago will not be sent to the ingesters, this saves work on the ingesters for data they normally don't contain. If you regularly write old data to Loki you may need to return this value to `0s` to always query ingesters.
* `max_concurrent` under the `querier` config now defaults to `10` instead of `20`.
* `match_max_concurrent` under the `frontend_worker` config now defaults to true, this supersedes the `parallelism` setting which can now be removed from your config. Controlling query parallelism of a single process can now be done with the `querier` `max_concurrent` setting.
* `flush_op_timeout` under the `ingester` configuration block now defaults to `10m`, increased from `10s`. This can help when replaying a large WAL on Loki startup, and avoid `msg="failed to flush user" ... context deadline exceeded` errors.
* `flush_op_timeout` under the `ingester` configuration block now defaults to `10m`, increased from `10s`. This can help when replaying a large WAL on Loki startup, and avoid `msg="failed to flush" ... context deadline exceeded` errors.
### Promtail

@@ -139,11 +139,9 @@ func (i *Ingester) flushLoop(j int) {
}
op := o.(*flushOp)
level.Debug(util_log.Logger).Log("msg", "flushing stream", "userid", op.userID, "fp", op.fp, "immediate", op.immediate)
err := i.flushUserSeries(op.userID, op.fp, op.immediate)
if err != nil {
level.Error(util_log.WithUserID(op.userID, util_log.Logger)).Log("msg", "failed to flush user", "err", err)
level.Error(util_log.WithUserID(op.userID, util_log.Logger)).Log("msg", "failed to flush", "err", err)
}
// If we're exiting & we failed to flush, put the failed operation
@@ -166,12 +164,15 @@ func (i *Ingester) flushUserSeries(userID string, fp model.Fingerprint, immediat
return nil
}
lbs := labels.String()
level.Info(util_log.Logger).Log("msg", "flushing stream", "user", userID, "fp", fp, "immediate", immediate, "num_chunks", len(chunks), "labels", lbs)
ctx := user.InjectOrgID(context.Background(), userID)
ctx, cancel := context.WithTimeout(ctx, i.cfg.FlushOpTimeout)
defer cancel()
err := i.flushChunks(ctx, fp, labels, chunks, chunkMtx)
if err != nil {
return err
return fmt.Errorf("failed to flush chunks: %w, num_chunks: %d, labels: %s", err, len(chunks), lbs)
}
return nil

@@ -168,7 +168,7 @@ func (b *BoltIndexClient) GetDB(name string, operation int) (*bbolt.DB, error) {
// Set Timeout to avoid obtaining file lock wait indefinitely.
db, err := bbolt.Open(path.Join(b.cfg.Directory, name), 0o666, &bbolt.Options{Timeout: openBoltDBFileTimeout})
if err != nil {
return nil, err
return nil, fmt.Errorf("failed to open boltdb index file: %w", err)
}
b.dbs[name] = db

Loading…
Cancel
Save