loki/pkg/ruler/storage/cleaner/cleaner.go

// This directory was copied and adapted from https://github.com/grafana/agent/tree/main/pkg/metrics.
// We cannot vendor the agent in since the agent vendors loki in, which would cause a cyclic dependency.
// NOTE: many changes have been made to the original code for our use-case.
package cleaner

import (
	"fmt"
	"os"
	"path/filepath"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	promwal "github.com/prometheus/prometheus/tsdb/wlog"

	"github.com/grafana/loki/pkg/ruler/storage/instance"
	"github.com/grafana/loki/pkg/ruler/storage/wal"
)

// Default settings for the WAL cleaner.
const (
	DefaultCleanupAge    = 12 * time.Hour
	DefaultCleanupPeriod = 0 * time.Second // disabled by default
)

// lastModifiedFunc gets the last modified time of the most recent segment of a WAL
type lastModifiedFunc func(path string) (time.Time, error)

func lastModified(path string) (time.Time, error) {
	existing, err := promwal.Open(nil, path)
	if err != nil {
		return time.Time{}, err
	}

	// We don't care if there are errors closing the abandoned WAL
	defer func() { _ = existing.Close() }()

	_, last, err := promwal.Segments(existing.Dir())
	if err != nil {
		return time.Time{}, fmt.Errorf("unable to open WAL: %w", err)
	}

	if last == -1 {
		return time.Time{}, fmt.Errorf("unable to determine most recent segment for %s", path)
	}

	// full path to the most recent segment in this WAL
	lastSegment := promwal.SegmentName(path, last)
	segmentFile, err := os.Stat(lastSegment)
	if err != nil {
		return time.Time{}, fmt.Errorf("unable to determine mtime for %s segment: %w", lastSegment, err)
	}

	return segmentFile.ModTime(), nil
}
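
// A minimal sketch of exercising lastModified on its own (the paths here are
// hypothetical; promwal.SegmentName zero-pads segment indexes, so segment 3
// of /data/tenant-a/wal lives at /data/tenant-a/wal/00000003, and the mtime
// of that newest segment file is what gets reported):
//
//	mtime, err := lastModified("/data/tenant-a/wal")
//	if err != nil {
//		// no readable segments yet, or the directory is not a WAL
//	}
//	fmt.Println("newest segment written at", mtime)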

// WALCleaner periodically checks for Write Ahead Logs (WALs) that are not associated
// with any active instance.ManagedInstance and have not been written to in some
// configured amount of time, and deletes them.
type WALCleaner struct {
	logger          log.Logger
	instanceManager instance.Manager
	walDirectory    string
	walLastModified lastModifiedFunc
	minAge          time.Duration
	period          time.Duration
	done            chan bool
	metrics         *Metrics
}

// NewWALCleaner creates a new cleaner that looks for abandoned WALs in the given
// directory and removes them if they haven't been modified in over minAge. It starts
// a goroutine that periodically runs the cleanup method in a loop.
func NewWALCleaner(logger log.Logger, manager instance.Manager, metrics *Metrics, walDirectory string, cfg Config) *WALCleaner {
	c := &WALCleaner{
		logger:          log.With(logger, "component", "cleaner"),
		instanceManager: manager,
		walDirectory:    filepath.Clean(walDirectory),
		walLastModified: lastModified,
		minAge:          DefaultCleanupAge,
		period:          DefaultCleanupPeriod,
		done:            make(chan bool),
		metrics:         metrics,
	}

	if cfg.MinAge > 0 {
		c.minAge = cfg.MinAge
	}

	// We allow a period of 0 here because '0' means "don't run the task". This
	// is handled by not running a ticker at all in the run method.
	if cfg.Period >= 0 {
		c.period = cfg.Period
	}

	go c.run()
	return c
}
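
// A minimal sketch of wiring the cleaner up (the directory path and timings
// are hypothetical; Config is assumed to carry the MinAge and Period fields
// read above):
//
//	cfg := Config{
//		MinAge: 24 * time.Hour,   // delete WALs untouched for a day
//		Period: 30 * time.Minute, // scan every half hour; 0 disables cleanup
//	}
//	cleaner := NewWALCleaner(logger, manager, metrics, "/loki/ruler-wal", cfg)
//	defer cleaner.Stop()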

// getManagedStorage gets storage directories used for each ManagedInstance
func (c *WALCleaner) getManagedStorage(instances map[string]instance.ManagedInstance) map[string]bool {
	out := make(map[string]bool)

	for _, inst := range instances {
		out[inst.StorageDirectory()] = true
	}

	return out
}

// getAllStorage gets all storage directories under walDirectory
func (c *WALCleaner) getAllStorage() []string {
	var out []string

	_ = filepath.Walk(c.walDirectory, func(p string, info os.FileInfo, err error) error {
		if os.IsNotExist(err) {
			// The root WAL directory doesn't exist. Maybe this Agent isn't responsible for any
			// instances yet. Log at debug since this isn't a big deal. We'll just try to crawl
			// the directory again on the next periodic run.
			level.Debug(c.logger).Log("msg", "WAL storage path does not exist", "path", p, "err", err)
		} else if err != nil {
			// Just log any errors traversing the WAL directory. This will potentially result
			// in a WAL (that has incorrect permissions or some similar problem) not being cleaned
			// up. This is better than preventing *all* other WALs from being cleaned up.
			c.metrics.DiscoveryError.WithLabelValues(p).Inc()
			level.Warn(c.logger).Log("msg", "unable to traverse WAL storage path", "path", p, "err", err)
		} else if info.IsDir() && filepath.Dir(p) == c.walDirectory {
			// Directories a single level below the root are instance storage directories (including WALs)
			out = append(out, p)
		}

		return nil
	})

	return out
}

// getAbandonedStorage gets the full path of storage directories that aren't associated with
// an active instance and haven't been written to within a configured duration (usually several
// hours or more).
func (c *WALCleaner) getAbandonedStorage(all []string, managed map[string]bool, now time.Time) []string {
	var out []string

	for _, dir := range all {
		if managed[dir] {
			level.Debug(c.logger).Log("msg", "active WAL", "name", dir)
			continue
		}

		walDir := wal.SubDirectory(dir)
		mtime, err := c.walLastModified(walDir)
		if err != nil {
			c.metrics.SegmentError.WithLabelValues(dir).Inc()
			level.Warn(c.logger).Log("msg", "unable to find segment mtime of WAL", "name", dir, "err", err)
			continue
		}

		diff := now.Sub(mtime)
		if diff > c.minAge {
			// The last segment for this WAL was modified more than minAge (a positive number
			// of hours) in the past. This makes it a candidate for deletion since it's also
			// not associated with any Instances this agent knows about.
			out = append(out, dir)
		}

		level.Debug(c.logger).Log("msg", "abandoned WAL", "name", dir, "mtime", mtime, "diff", diff)
	}

	return out
}

// run cleans up abandoned WALs (if period != 0) in a loop periodically until stopped
func (c *WALCleaner) run() {
	// A period of 0 means don't run a cleanup task
	if c.period == 0 {
		return
	}

	ticker := time.NewTicker(c.period)
	defer ticker.Stop()

	for {
		select {
		case <-c.done:
			level.Debug(c.logger).Log("msg", "stopping cleaner...")
			return
		case <-ticker.C:
			c.cleanup()
		}
	}
}

// cleanup removes any abandoned and unused WAL directories. Note that it shouldn't be
// necessary to call this method explicitly in most cases since it will be run periodically
// in a goroutine (started when WALCleaner is created).
func (c *WALCleaner) cleanup() {
	if !c.instanceManager.Ready() {
		level.Warn(c.logger).Log("msg", "delaying WAL clean until all storage instances are ready")
		return
	}

	start := time.Now()

	all := c.getAllStorage()
	managed := c.getManagedStorage(c.instanceManager.ListInstances())
	abandoned := c.getAbandonedStorage(all, managed, time.Now())

	c.metrics.ManagedStorage.Set(float64(len(managed)))
	c.metrics.AbandonedStorage.Set(float64(len(abandoned)))

	// NOTE: this is a little imperfect right now; the manager cannot currently be notified when an
	// instance (rule group) is removed by the prometheus QueueManager, so the cleaner will only really
	// become aware of "abandoned" instances when the ruler is restarted.
	// TODO(dannyk): contribute a callback mechanism to prometheus to allow for this hook
	for _, a := range abandoned {
		level.Info(c.logger).Log("msg", "deleting abandoned WAL", "name", a)

		err := os.RemoveAll(a)
		if err != nil {
			level.Error(c.logger).Log("msg", "failed to delete abandoned WAL", "name", a, "err", err)
			c.metrics.CleanupRunsErrors.Inc()
		} else {
			c.metrics.CleanupRunsSuccess.Inc()
		}
	}

	c.metrics.CleanupTimes.Observe(time.Since(start).Seconds())
}

// Stop the cleaner and any background tasks running
func (c *WALCleaner) Stop() {
	close(c.done)
}