package compactor

import (
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/concurrency"
	"github.com/pkg/errors"
	"github.com/prometheus/common/model"
	"go.etcd.io/bbolt"

	"github.com/grafana/loki/pkg/storage/chunk/client/local"
	chunk_util "github.com/grafana/loki/pkg/storage/chunk/client/util"
	"github.com/grafana/loki/pkg/storage/stores/shipper/compactor/retention"
	"github.com/grafana/loki/pkg/storage/stores/shipper/storage"
	shipper_util "github.com/grafana/loki/pkg/storage/stores/shipper/util"
	util_log "github.com/grafana/loki/pkg/util/log"
)

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Below we show the various formats that we have for structuring the index in the object store.
//
// FORMAT1
//   table1
//     |
//      ----> db1.gz
//             |
//              ----> index
//
// FORMAT2
//   table1
//     |
//      ----> db1.gz
//             |
//              ----> user1
//              ----> user2
//
// FORMAT3
//   table1
//     |
//      ----> user1
//     |       |
//     |        ----> db1.gz
//     |               |
//     |                ----> index
//      ----> user2
//             |
//              ----> db1.gz
//                     |
//                      ----> index
//
// FORMAT1 - `table1` has 1 db named db1.gz with 1 boltdb bucket named `index`, which contains the index for all the users.
//           It is used when the flag to build per-user index is not enabled.
//           Ingesters write the index in FORMAT1, which the compactor then compacts down in the same format.
//
// FORMAT2 - `table1` has 1 db named db1.gz with 1 boltdb bucket each for `user1` and `user2`, containing
//           the index for just those users.
//           It is an intermediate format built by ingesters when the flag to build per-user index is enabled.
//
// FORMAT3 - `table1` has 1 folder each for `user1` and `user2`, containing index files with the index for just those users.
//           The compactor builds index in this format from FORMAT2.
//
// Note that the compactor builds FORMAT1 index from FORMAT1 files and FORMAT3 index from FORMAT2 files.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
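
// As a rough sketch of how the formats above map onto the code in this file: while compacting,
// writeBatch (defined below) routes index entries purely by boltdb bucket name, along the lines of:
//
//	if bucketName == shipper_util.GetUnsafeString(local.IndexBucketName) {
//		// common index bucket: FORMAT1 input is compacted back into FORMAT1.
//	} else {
//		// any other bucket name is a userID: FORMAT2 input is rewritten into FORMAT3.
//	}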

const (
	uploaderName               = "compactor"
	uploadIndexSetsConcurrency = 10

	readDBsConcurrency = 50
	batchSize          = 1000

	// we want to recreate the compactedDB when the chances of it changing due to compaction or deletion of data are low.
	// this is to avoid recreating the DB too often, which would be too costly in a large cluster.
	recreateCompactedDBOlderThan = 12 * time.Hour
	dropFreePagesTxMaxSize       = 100 * 1024 * 1024 // 100MB
	recreatedCompactedDBSuffix   = ".r.gz"
)
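
// indexEntry holds a copy of a single index key/value pair read from a boltdb bucket.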
type indexEntry struct {
	k, v []byte
}
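
// tableExpirationChecker tells whether a table interval may have expired chunks for a given user.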
type tableExpirationChecker interface {
	IntervalMayHaveExpiredChunks(interval model.Interval, userID string) bool
}
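
// table encapsulates the compaction and retention work for a single index table, tracking the
// common index set and the per-user index sets built from the files in object storage.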
type table struct {
	name               string
	workingDirectory   string
	indexStorageClient storage.Client
	tableMarker        retention.TableMarker
	expirationChecker  tableExpirationChecker

	baseUserIndexSet, baseCommonIndexSet storage.IndexSet

	indexSets             map[string]*indexSet
	indexSetsMtx          sync.RWMutex
	usersWithPerUserIndex []string
	uploadCompactedDB     bool
	compactedDB           *bbolt.DB
	logger                log.Logger

	ctx context.Context
}
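
// newTable creates a table for the given working directory, which is expected to be named after the
// index table it operates on.
//
// For illustration, a caller would drive it roughly like this (the storage client, table marker and
// expiration checker are assumed to be supplied by the caller):
//
//	tbl, err := newTable(ctx, filepath.Join(workingDir, tableName), indexStorageClient, tableMarker, expirationChecker)
//	if err != nil {
//		return err
//	}
//	if err := tbl.compact(applyRetention); err != nil {
//		return err
//	}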
func newTable(ctx context.Context, workingDirectory string, indexStorageClient storage.Client,
	tableMarker retention.TableMarker, expirationChecker tableExpirationChecker,
) (*table, error) {
	err := chunk_util.EnsureDirectory(workingDirectory)
	if err != nil {
		return nil, err
	}

	table := table{
		ctx:                ctx,
		name:               filepath.Base(workingDirectory),
		workingDirectory:   workingDirectory,
		indexStorageClient: indexStorageClient,
		tableMarker:        tableMarker,
		expirationChecker:  expirationChecker,
		indexSets:          map[string]*indexSet{},
		baseUserIndexSet:   storage.NewIndexSet(indexStorageClient, true),
		baseCommonIndexSet: storage.NewIndexSet(indexStorageClient, false),
	}
	table.logger = log.With(util_log.Logger, "table-name", table.name)

	return &table, nil
}
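
// compact compacts the table's common index files and, when applyRetention is true, also applies
// retention on the index sets before finishing up via done().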
func (t *table) compact(applyRetention bool) error {
	indexFiles, usersWithPerUserIndex, err := t.indexStorageClient.ListFiles(t.ctx, t.name, false)
	if err != nil {
		return err
	}

	if len(indexFiles) == 0 && len(usersWithPerUserIndex) == 0 {
		level.Info(t.logger).Log("msg", "no common index files and user index found")
		return nil
	}

	t.usersWithPerUserIndex = usersWithPerUserIndex

	level.Info(t.logger).Log("msg", "listed files", "count", len(indexFiles))

	defer func() {
		for _, is := range t.indexSets {
			is.cleanup()
		}

		if t.compactedDB != nil {
			if err := t.compactedDB.Close(); err != nil {
				level.Error(t.logger).Log("msg", "error closing compacted DB", "err", err)
			}
		}

		if err := os.RemoveAll(t.workingDirectory); err != nil {
			level.Error(t.logger).Log("msg", fmt.Sprintf("failed to remove working directory %s", t.workingDirectory), "err", err)
		}
	}()

	dbsCompacted := false

	if len(indexFiles) > 1 || (len(indexFiles) == 1 && !strings.HasPrefix(indexFiles[0].Name, uploaderName)) {
		// if we have more than 1 index file, or the only file we have is not from the compactor, then we need to compact them.
		dbsCompacted = true
		if err := t.compactFiles(indexFiles); err != nil {
			return err
		}
	} else if len(indexFiles) == 1 && (applyRetention || mustRecreateCompactedDB(indexFiles)) {
		// we have just 1 common index file which is already compacted.
		// initialize the common compacted db if we need to apply retention or recreate it.
		downloadAt := filepath.Join(t.workingDirectory, indexFiles[0].Name)
		err = shipper_util.DownloadFileFromStorage(downloadAt, shipper_util.IsCompressedFile(indexFiles[0].Name),
			false, shipper_util.LoggerWithFilename(t.logger, indexFiles[0].Name),
			func() (io.ReadCloser, error) {
				return t.baseCommonIndexSet.GetFile(t.ctx, t.name, "", indexFiles[0].Name)
			})
		if err != nil {
			return err
		}

		t.compactedDB, err = openBoltdbFileWithNoSync(downloadAt)
		if err != nil {
			return err
		}
	}

	// initialize the common index set if we have initialized the compacted db.
	if t.compactedDB != nil {
		// remove the source files if we did a compaction, which gets reflected in dbsCompacted
		t.indexSets[""], err = newCommonIndex(t.ctx, t.name, t.workingDirectory, t.compactedDB, t.uploadCompactedDB,
			indexFiles, dbsCompacted, t.baseCommonIndexSet, t.logger)
		if err != nil {
			return err
		}
	}

	if applyRetention {
		err := t.applyRetention()
		if err != nil {
			return err
		}
	}

	return t.done()
}

// done takes care of the final operations, which include:
// - initializing user index sets which require recreation of files
// - calling indexSet.done() on all the index sets.
func (t *table) done() error {
	for _, userID := range t.usersWithPerUserIndex {
		if _, ok := t.indexSets[userID]; ok {
			continue
		}

		indexFiles, err := t.baseUserIndexSet.ListFiles(t.ctx, t.name, userID, false)
		if err != nil {
			return err
		}

		// initialize the user index sets for:
		// - compaction, if we have more than 1 index file; taken care of by the index set initialization
		// - recreation, if mustRecreateCompactedDB says so; taken care of by the indexSet.done() call below
		if len(indexFiles) > 1 || mustRecreateCompactedDB(indexFiles) {
			t.indexSets[userID], err = t.getOrCreateUserIndex(userID)
			if err != nil {
				return err
			}
		}
	}

	userIDs := make([]string, 0, len(t.indexSets))
	for userID := range t.indexSets {
		// indexSet.done() uploads the compacted db and cleans up the source index files.
		// For user index sets, the files from the common index set are also a source of index.
		// If we cleaned up the common index set first and then failed to upload the newly compacted dbs of the user index sets, we would lose data.
		// To avoid any data loss, we call done() on the common index set last.
		if userID == "" {
			continue
		}

		userIDs = append(userIDs, userID)
	}

	err := concurrency.ForEachJob(t.ctx, len(userIDs), uploadIndexSetsConcurrency, func(ctx context.Context, idx int) error {
		return t.indexSets[userIDs[idx]].done()
	})
	if err != nil {
		return err
	}

	if commonIndexSet, ok := t.indexSets[""]; ok {
		if err := commonIndexSet.done(); err != nil {
			return err
		}
	}

	return nil
}

// applyRetention applies retention on the index sets
func (t *table) applyRetention() error {
	tableInterval := retention.ExtractIntervalFromTableName(t.name)
	// call runRetention on the already initialized index sets which may have expired chunks
	for userID, is := range t.indexSets {
		if !t.expirationChecker.IntervalMayHaveExpiredChunks(tableInterval, userID) {
			continue
		}
		err := is.runRetention(t.tableMarker)
		if err != nil {
			return err
		}
	}

	// find and call runRetention on the uninitialized index sets which may have expired chunks
	for _, userID := range t.usersWithPerUserIndex {
		if _, ok := t.indexSets[userID]; ok {
			continue
		}
		if !t.expirationChecker.IntervalMayHaveExpiredChunks(tableInterval, userID) {
			continue
		}

		var err error
		t.indexSets[userID], err = t.getOrCreateUserIndex(userID)
		if err != nil {
			return err
		}
		err = t.indexSets[userID].runRetention(t.tableMarker)
		if err != nil {
			return err
		}
	}

	return nil
}

// compactFiles compacts the given files into a single file.
func (t *table) compactFiles(files []storage.IndexFile) error {
	var err error
	level.Info(t.logger).Log("msg", "starting compaction of dbs")

	compactedDBName := filepath.Join(t.workingDirectory, fmt.Sprint(time.Now().Unix()))
	// if we find a previously compacted file, use it as a seed file and copy the other index into it
	seedSourceFileIdx := compactedFileIdx(files)

	if seedSourceFileIdx != -1 {
		t.uploadCompactedDB = true
		compactedDBName = filepath.Join(t.workingDirectory, files[seedSourceFileIdx].Name)

		level.Info(t.logger).Log("msg", fmt.Sprintf("using %s as seed file", files[seedSourceFileIdx].Name))
		err = shipper_util.DownloadFileFromStorage(compactedDBName, shipper_util.IsCompressedFile(files[seedSourceFileIdx].Name),
			false, shipper_util.LoggerWithFilename(t.logger, files[seedSourceFileIdx].Name), func() (io.ReadCloser, error) {
				return t.baseCommonIndexSet.GetFile(t.ctx, t.name, "", files[seedSourceFileIdx].Name)
			})
		if err != nil {
			return err
		}
	}

	t.compactedDB, err = openBoltdbFileWithNoSync(compactedDBName)
	if err != nil {
		return err
	}

	// go through each file and build index in FORMAT1 from FORMAT1 files and FORMAT3 from FORMAT2 files
	return concurrency.ForEachJob(t.ctx, len(files), readDBsConcurrency, func(ctx context.Context, idx int) error {
		workNum := idx
		// skip the seed file
		if workNum == seedSourceFileIdx {
			return nil
		}
		fileName := files[idx].Name
		downloadAt := filepath.Join(t.workingDirectory, fileName)

		err = shipper_util.DownloadFileFromStorage(downloadAt, shipper_util.IsCompressedFile(fileName),
			false, shipper_util.LoggerWithFilename(t.logger, fileName), func() (io.ReadCloser, error) {
				return t.baseCommonIndexSet.GetFile(t.ctx, t.name, "", fileName)
			})
		if err != nil {
			return err
		}

		return readFile(t.logger, downloadAt, t.writeBatch)
	})
}

// writeBatch writes a batch of index entries to either the common index or the relevant user index set,
// depending on the bucket name it came from.
func (t *table) writeBatch(bucketName string, batch []indexEntry) error {
	if bucketName == shipper_util.GetUnsafeString(local.IndexBucketName) {
		return t.writeCommonIndex(batch)
	}
	return t.writeUserIndex(bucketName, batch)
}

// writeCommonIndex writes a batch to compactedDB, which holds the FORMAT1 index
func (t *table) writeCommonIndex(batch []indexEntry) error {
	t.uploadCompactedDB = true
	return t.compactedDB.Batch(func(tx *bbolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists(local.IndexBucketName)
		if err != nil {
			return err
		}

		for _, w := range batch {
			err = b.Put(w.k, w.v)
			if err != nil {
				return err
			}
		}

		return nil
	})
}

// writeUserIndex sends a batch to be written to the user's index set, which holds the FORMAT3 index
func (t *table) writeUserIndex(userID string, batch []indexEntry) error {
	ui, err := t.getOrCreateUserIndex(userID)
	if err != nil {
		return errors.Wrapf(err, "failed to get user index for user %s", userID)
	}

	return ui.writeBatch(userID, batch)
}
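
// getOrCreateUserIndex returns the indexSet for the given user, creating and registering it first
// if it does not exist yet.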
func (t *table) getOrCreateUserIndex(userID string) (*indexSet, error) {
	// if the index set is already there, use it.
	t.indexSetsMtx.RLock()
	ui, ok := t.indexSets[userID]
	t.indexSetsMtx.RUnlock()

	if !ok {
		t.indexSetsMtx.Lock()
		// check if some other competing goroutine got the lock before us and created the index set; use it if so.
		ui, ok = t.indexSets[userID]
		if !ok {
			// index set not found, create one.
			level.Info(t.logger).Log("msg", fmt.Sprintf("initializing indexSet for user %s", userID))

			var err error
			ui, err = newUserIndex(t.ctx, t.name, userID, t.baseUserIndexSet, filepath.Join(t.workingDirectory, userID), t.logger)
			if err != nil {
				// release the lock before returning so that other callers are not blocked forever.
				t.indexSetsMtx.Unlock()
				return nil, err
			}
			t.indexSets[userID] = ui
		}
		t.indexSetsMtx.Unlock()
	}

	return ui, ui.isReady()
}

// openBoltdbFileWithNoSync opens a boltdb file and configures it to not sync the file to disk.
// The compaction process is idempotent and we do not retain the files, so there is no need to sync them to disk.
func openBoltdbFileWithNoSync(path string) (*bbolt.DB, error) {
	boltdb, err := shipper_util.SafeOpenBoltdbFile(path)
	if err != nil {
		return nil, err
	}

	// no need to enforce write to disk, we'll upload and delete the file anyway.
	boltdb.NoSync = true

	return boltdb, nil
}

// compactedFileIdx returns the index of a previously compacted file (which starts with uploaderName).
// If it can't find one, it returns -1.
func compactedFileIdx(files []storage.IndexFile) int {
	for i, file := range files {
		if strings.HasPrefix(file.Name, uploaderName) {
			return i
		}
	}

	return -1
}

// readFile reads an index file and sends batches of index entries to the writeBatch func.
func readFile(logger log.Logger, path string, writeBatch func(userID string, batch []indexEntry) error) error {
	level.Debug(logger).Log("msg", "reading file for compaction", "path", path)

	db, err := openBoltdbFileWithNoSync(path)
	if err != nil {
		return err
	}

	defer func() {
		if err := db.Close(); err != nil {
			level.Error(logger).Log("msg", "failed to close db", "path", path, "err", err)
		}

		if err = os.Remove(path); err != nil {
			level.Error(logger).Log("msg", "failed to remove file", "path", path, "err", err)
		}
	}()

	batch := make([]indexEntry, 0, batchSize)

	return db.View(func(tx *bbolt.Tx) error {
		return tx.ForEach(func(name []byte, b *bbolt.Bucket) error {
			batch = batch[:0]
			bucketNameStr := string(name)
			err := b.ForEach(func(k, v []byte) error {
				ie := indexEntry{
					k: make([]byte, len(k)),
					v: make([]byte, len(v)),
				}

				// make a copy since k, v are only valid for the life of the transaction.
				// See: https://godoc.org/github.com/boltdb/bolt#Cursor.Seek
				copy(ie.k, k)
				copy(ie.v, v)

				batch = append(batch, ie)

				if len(batch) == cap(batch) {
					// batch is full, write it out and start a new one.
					err := writeBatch(bucketNameStr, batch)
					if err != nil {
						return err
					}
					batch = batch[:0]
				}

				return nil
			})
			if err != nil {
				return err
			}

			// write the remaining batch which might have been left unwritten because it was not full yet.
			return writeBatch(bucketNameStr, batch)
		})
	})
}

// uploadFile uploads the compacted db in compressed format.
func uploadFile(compactedDBPath string, putFileFunc func(file io.ReadSeeker) error, logger log.Logger) error {
	// compress the compactedDB.
	compressedDBPath := fmt.Sprintf("%s.gz", compactedDBPath)
	err := shipper_util.CompressFile(compactedDBPath, compressedDBPath, false)
	if err != nil {
		return err
	}

	// open the compressed file for reading.
	compressedDB, err := os.Open(compressedDBPath)
	if err != nil {
		return err
	}

	defer func() {
		if err := compressedDB.Close(); err != nil {
			level.Error(logger).Log("msg", "failed to close file", "path", compressedDBPath, "err", err)
		}

		if err := os.Remove(compressedDBPath); err != nil {
			level.Error(logger).Log("msg", "failed to remove file", "path", compressedDBPath, "err", err)
		}
	}()

	err = putFileFunc(compressedDB)
	if err != nil {
		return err
	}

	return nil
}

// mustRecreateCompactedDB returns true if the compacted db should be recreated
func mustRecreateCompactedDB(sourceFiles []storage.IndexFile) bool {
	if len(sourceFiles) != 1 {
		// do not recreate if there are multiple source files
		return false
	} else if time.Since(sourceFiles[0].ModifiedAt) < recreateCompactedDBOlderThan {
		// do not recreate if the source file is younger than the threshold
		return false
	}

	// recreate the compacted db only if we have not recreated it before
	return !strings.HasSuffix(sourceFiles[0].Name, recreatedCompactedDBSuffix)
}