diff --git a/main.go b/main.go index 05edb4e943..c881b85bdc 100644 --- a/main.go +++ b/main.go @@ -60,7 +60,8 @@ var ( checkpointInterval = flag.Duration("storage.local.checkpoint-interval", 5*time.Minute, "The period at which the in-memory metrics and the chunks not yet persisted to series files are checkpointed.") checkpointDirtySeriesLimit = flag.Int("storage.local.checkpoint-dirty-series-limit", 5000, "If approx. that many time series are in a state that would require a recovery operation after a crash, a checkpoint is triggered, even if the checkpoint interval hasn't passed yet. A recovery operation requires a disk seek. The default limit intends to keep the recovery time below 1min even on spinning disks. With SSD, recovery is much faster, so you might want to increase this value in that case to avoid overly frequent checkpoints.") - storageDirty = flag.Bool("storage.local.dirty", false, "If set, the local storage layer will perform crash recovery even if the last shutdown appears to be clean.") + storageDirty = flag.Bool("storage.local.dirty", false, "If set, the local storage layer will perform crash recovery even if the last shutdown appears to be clean.") + storagePedanticChecks = flag.Bool("storage.local.pedantic-checks", false, "If set, a crash recovery will perform checks on each series file. This might take a very long time.") printVersion = flag.Bool("version", false, "Print version information.") ) diff --git a/storage/local/crashrecovery.go b/storage/local/crashrecovery.go index f988bcb6c7..384f798695 100644 --- a/storage/local/crashrecovery.go +++ b/storage/local/crashrecovery.go @@ -221,7 +221,10 @@ func (p *persistence) sanitizeSeries( if s == nil { panic("fingerprint mapped to nil pointer") } - if bytesToTrim == 0 && s.chunkDescsOffset != -1 && chunksInFile == s.chunkDescsOffset+s.persistWatermark { + if !p.pedanticChecks && + bytesToTrim == 0 && + s.chunkDescsOffset != -1 && + chunksInFile == s.chunkDescsOffset+s.persistWatermark { // Everything is consistent. We are good. return fp, true } diff --git a/storage/local/persistence.go b/storage/local/persistence.go index 7e9a91f97a..8870bac4b6 100644 --- a/storage/local/persistence.go +++ b/storage/local/persistence.go @@ -114,15 +114,16 @@ type persistence struct { indexingBatchLatency prometheus.Summary checkpointDuration prometheus.Gauge - dirtyMtx sync.Mutex // Protects dirty and becameDirty. - dirty bool // true if persistence was started in dirty state. - becameDirty bool // true if an inconsistency came up during runtime. - dirtyFileName string // The file used for locking and to mark dirty state. - fLock flock.Releaser // The file lock to protect against concurrent usage. + dirtyMtx sync.Mutex // Protects dirty and becameDirty. + dirty bool // true if persistence was started in dirty state. + becameDirty bool // true if an inconsistency came up during runtime. + pedanticChecks bool // true if crash recovery should check each series. + dirtyFileName string // The file used for locking and to mark dirty state. + fLock flock.Releaser // The file lock to protect against concurrent usage. } // newPersistence returns a newly allocated persistence backed by local disk storage, ready to use. -func newPersistence(basePath string, dirty bool) (*persistence, error) { +func newPersistence(basePath string, dirty, pedanticChecks bool) (*persistence, error) { dirtyPath := filepath.Join(basePath, dirtyFileName) versionPath := filepath.Join(basePath, versionFileName) @@ -225,9 +226,10 @@ func newPersistence(basePath string, dirty bool) (*persistence, error) { Name: "checkpoint_duration_milliseconds", Help: "The duration (in milliseconds) it took to checkpoint in-memory metrics and head chunks.", }), - dirty: dirty, - dirtyFileName: dirtyPath, - fLock: fLock, + dirty: dirty, + pedanticChecks: pedanticChecks, + dirtyFileName: dirtyPath, + fLock: fLock, } if p.dirty { diff --git a/storage/local/persistence_test.go b/storage/local/persistence_test.go index 83f6cc5363..b5fd6b8829 100644 --- a/storage/local/persistence_test.go +++ b/storage/local/persistence_test.go @@ -36,7 +36,7 @@ var ( func newTestPersistence(t *testing.T, encoding chunkEncoding) (*persistence, test.Closer) { *defaultChunkEncoding = int(encoding) dir := test.NewTemporaryDirectory("test_persistence", t) - p, err := newPersistence(dir.Path(), false) + p, err := newPersistence(dir.Path(), false, false) if err != nil { dir.Close() t.Fatal(err) diff --git a/storage/local/storage.go b/storage/local/storage.go index 689edbbe9f..9ffd928b07 100644 --- a/storage/local/storage.go +++ b/storage/local/storage.go @@ -71,8 +71,6 @@ type memorySeriesStorage struct { persistence *persistence - countPersistedHeadChunks chan struct{} - evictList *list.List evictRequests chan evictRequest evictStopping, evictStopped chan struct{} @@ -95,12 +93,13 @@ type MemorySeriesStorageOptions struct { CheckpointInterval time.Duration // How often to checkpoint the series map and head chunks. CheckpointDirtySeriesLimit int // How many dirty series will trigger an early checkpoint. Dirty bool // Force the storage to consider itself dirty on startup. + PedanticChecks bool // If dirty, perform crash-recovery checks on each series file. } // NewMemorySeriesStorage returns a newly allocated Storage. Storage.Serve still // has to be called to start the storage. func NewMemorySeriesStorage(o *MemorySeriesStorageOptions) (Storage, error) { - p, err := newPersistence(o.PersistenceStoragePath, o.Dirty) + p, err := newPersistence(o.PersistenceStoragePath, o.Dirty, o.PedanticChecks) if err != nil { return nil, err } @@ -133,8 +132,6 @@ func NewMemorySeriesStorage(o *MemorySeriesStorageOptions) (Storage, error) { numChunksToPersist: numChunksToPersist, persistence: p, - countPersistedHeadChunks: make(chan struct{}, 100), - evictList: list.New(), evictRequests: make(chan evictRequest, evictRequestsCap), evictStopping: make(chan struct{}),