loki/pkg/storage/stores/shipper/compactor/table.go

package compactor

import (
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/concurrency"
	"github.com/pkg/errors"
	"github.com/prometheus/common/model"
	"go.etcd.io/bbolt"

	"github.com/grafana/loki/pkg/storage/chunk/client/local"
	chunk_util "github.com/grafana/loki/pkg/storage/chunk/client/util"
	"github.com/grafana/loki/pkg/storage/stores/shipper/compactor/retention"
	"github.com/grafana/loki/pkg/storage/stores/shipper/storage"
	shipper_util "github.com/grafana/loki/pkg/storage/stores/shipper/util"
	util_log "github.com/grafana/loki/pkg/util/log"
)

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Below we show the various formats that we have for structuring the index in the object store.
//
//	FORMAT1                          FORMAT2                          FORMAT3
//
//	table1                           table1                           table1
//	  |                                |                                |
//	  ----> db1.gz                     ----> db1.gz                     ----> user1
//	          |                                |                        |       |
//	          ----> index                      ----> user1              |       ----> db1.gz
//	                                           ----> user2              |               |
//	                                                                    |               ----> index
//	                                                                    ----> user2
//	                                                                            |
//	                                                                            ----> db1.gz
//	                                                                                    |
//	                                                                                    ----> index
//
// FORMAT1 - `table1` has 1 db named db1.gz and 1 boltdb bucket named `index`, which contains the index for all the users.
//           It is in use when the flag to build per-user index is not enabled.
//           Ingesters write the index in FORMAT1, which the compactor then compacts down in the same format.
//
// FORMAT2 - `table1` has 1 db named db1.gz and 1 boltdb bucket each for `user1` and `user2`, containing
//           the index just for those users.
//           It is an intermediate format built by ingesters when the flag to build per-user index is enabled.
//
// FORMAT3 - `table1` has 1 folder each for `user1` and `user2`, containing index files holding the index just for those users.
//           The compactor builds the index in this format from FORMAT2.
//
// THE THING TO NOTE HERE IS THAT THE COMPACTOR BUILDS INDEX IN FORMAT1 FROM FORMAT1 AND FORMAT3 FROM FORMAT2.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
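
// For illustration, object keys relative to the layouts above would look like the following
// (hypothetical names; the actual file names depend on the uploaders):
//
//	FORMAT1/FORMAT2: table1/db1.gz
//	FORMAT3:         table1/user1/db1.gz, table1/user2/db1.gz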

const (
	uploaderName = "compactor"

	uploadIndexSetsConcurrency = 10
	readDBsConcurrency         = 50
	batchSize                  = 1000

	// we want to recreate compactedDB when the chances of it changing due to compaction or deletion of data are low.
	// this is to avoid recreating the DB too often, which would be too costly in a large cluster.
	recreateCompactedDBOlderThan = 12 * time.Hour
	dropFreePagesTxMaxSize       = 100 * 1024 * 1024 // 100MB
	recreatedCompactedDBSuffix   = ".r.gz"
)
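
// indexEntry holds a single key/value pair read from a boltdb bucket.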
type indexEntry struct {
	k, v []byte
}
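
// tableExpirationChecker tells whether a table interval may contain expired chunks for a given user.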
type tableExpirationChecker interface {
	IntervalMayHaveExpiredChunks(interval model.Interval, userID string) bool
}
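
// table manages the compaction of both the common and the per-user index for a single table.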
type table struct {
	name               string
	workingDirectory   string
	indexStorageClient storage.Client
	tableMarker        retention.TableMarker
	expirationChecker  tableExpirationChecker

	baseUserIndexSet, baseCommonIndexSet storage.IndexSet

	indexSets             map[string]*indexSet
	indexSetsMtx          sync.RWMutex
	usersWithPerUserIndex []string
	uploadCompactedDB     bool
	compactedDB           *bbolt.DB

	logger log.Logger
	ctx    context.Context
}
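
// newTable creates a table for the table name encoded as the base of workingDirectory.
//
// A minimal usage sketch (hypothetical wiring; the actual caller sits elsewhere in the compactor):
//
//	t, err := newTable(ctx, filepath.Join(workingDir, tableName), indexStorageClient, tableMarker, expirationChecker)
//	if err != nil {
//		return err
//	}
//	if err := t.compact(applyRetention); err != nil {
//		return err
//	}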
func newTable(ctx context.Context, workingDirectory string, indexStorageClient storage.Client,
	tableMarker retention.TableMarker, expirationChecker tableExpirationChecker,
) (*table, error) {
	err := chunk_util.EnsureDirectory(workingDirectory)
	if err != nil {
		return nil, err
	}

	table := table{
		ctx:                ctx,
		name:               filepath.Base(workingDirectory),
		workingDirectory:   workingDirectory,
		indexStorageClient: indexStorageClient,
		tableMarker:        tableMarker,
		expirationChecker:  expirationChecker,
		indexSets:          map[string]*indexSet{},
		baseUserIndexSet:   storage.NewIndexSet(indexStorageClient, true),
		baseCommonIndexSet: storage.NewIndexSet(indexStorageClient, false),
	}
	table.logger = log.With(util_log.Logger, "table-name", table.name)

	return &table, nil
}
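
// compact lists the index files in object storage, compacts the common index down to a single db if needed,
// optionally applies retention, and then finalizes everything via done().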
func (t *table) compact(applyRetention bool) error {
	indexFiles, usersWithPerUserIndex, err := t.indexStorageClient.ListFiles(t.ctx, t.name, false)
	if err != nil {
		return err
	}

	if len(indexFiles) == 0 && len(usersWithPerUserIndex) == 0 {
		level.Info(t.logger).Log("msg", "no common index files and user index found")
		return nil
	}

	t.usersWithPerUserIndex = usersWithPerUserIndex

	level.Info(t.logger).Log("msg", "listed files", "count", len(indexFiles))

	defer func() {
		for _, is := range t.indexSets {
			is.cleanup()
		}

		if t.compactedDB != nil {
			if err := t.compactedDB.Close(); err != nil {
				level.Error(t.logger).Log("msg", "error closing compacted DB", "err", err)
			}
		}

		if err := os.RemoveAll(t.workingDirectory); err != nil {
			level.Error(t.logger).Log("msg", fmt.Sprintf("failed to remove working directory %s", t.workingDirectory), "err", err)
		}
	}()

	dbsCompacted := false
	if len(indexFiles) > 1 || (len(indexFiles) == 1 && !strings.HasPrefix(indexFiles[0].Name, uploaderName)) {
		// if we have more than 1 index file, or the only file we have is not from the compactor, then we need to compact them.
		dbsCompacted = true
		if err := t.compactFiles(indexFiles); err != nil {
			return err
		}
	} else if len(indexFiles) == 1 && (applyRetention || mustRecreateCompactedDB(indexFiles)) {
		// we have just 1 common index file, which is already compacted.
		// initialize the common compacted db if we need to apply retention or recreate it.
		downloadAt := filepath.Join(t.workingDirectory, indexFiles[0].Name)
		err = shipper_util.DownloadFileFromStorage(downloadAt, shipper_util.IsCompressedFile(indexFiles[0].Name),
			false, shipper_util.LoggerWithFilename(t.logger, indexFiles[0].Name),
			func() (io.ReadCloser, error) {
				return t.baseCommonIndexSet.GetFile(t.ctx, t.name, "", indexFiles[0].Name)
			})
		if err != nil {
			return err
		}

		t.compactedDB, err = openBoltdbFileWithNoSync(downloadAt)
		if err != nil {
			return err
		}
	}

	// initialize the common index set if we have initialized the compacted db.
	if t.compactedDB != nil {
		// remove the source files if we did a compaction, which is reflected in dbsCompacted.
		t.indexSets[""], err = newCommonIndex(t.ctx, t.name, t.workingDirectory, t.compactedDB, t.uploadCompactedDB,
			indexFiles, dbsCompacted, t.baseCommonIndexSet, t.logger)
		if err != nil {
			return err
		}
	}

	if applyRetention {
		err := t.applyRetention()
		if err != nil {
			return err
		}
	}

	return t.done()
}

// done takes care of the final operations, which include:
// - initializing user index sets which require recreation of files
// - calling indexSet.done() on all the index sets
func (t *table) done() error {
	for _, userID := range t.usersWithPerUserIndex {
		if _, ok := t.indexSets[userID]; ok {
			continue
		}

		indexFiles, err := t.baseUserIndexSet.ListFiles(t.ctx, t.name, userID, false)
		if err != nil {
			return err
		}

		// initialize the user index sets for:
		// - compaction, if we have more than 1 index file, taken care of by index set initialization
		// - recreation, if mustRecreateCompactedDB says so, taken care of by the indexSet.done call below
		if len(indexFiles) > 1 || mustRecreateCompactedDB(indexFiles) {
			t.indexSets[userID], err = t.getOrCreateUserIndex(userID)
			if err != nil {
				return err
			}
		}
	}

	userIDs := make([]string, 0, len(t.indexSets))
	for userID := range t.indexSets {
		// indexSet.done() uploads the compacted db and cleans up the source index files.
		// For user index sets, the files from the common index set are also a source of index.
		// If we cleaned up the common index set first and then failed to upload the newly compacted dbs
		// in the user index sets, we would lose data.
		// To avoid any data loss, we call done() on the common index set at the end.
		if userID == "" {
			continue
		}

		userIDs = append(userIDs, userID)
	}

	err := concurrency.ForEachJob(t.ctx, len(userIDs), uploadIndexSetsConcurrency, func(ctx context.Context, idx int) error {
		return t.indexSets[userIDs[idx]].done()
	})
	if err != nil {
		return err
	}

	if commonIndexSet, ok := t.indexSets[""]; ok {
		if err := commonIndexSet.done(); err != nil {
			return err
		}
	}

	return nil
}

// applyRetention applies retention on the index sets.
func (t *table) applyRetention() error {
	tableInterval := retention.ExtractIntervalFromTableName(t.name)

	// call runRetention on the already initialized index sets which may have expired chunks.
	for userID, is := range t.indexSets {
		if !t.expirationChecker.IntervalMayHaveExpiredChunks(tableInterval, userID) {
			continue
		}

		err := is.runRetention(t.tableMarker)
		if err != nil {
			return err
		}
	}

	// find and call runRetention on the uninitialized index sets which may have expired chunks.
	for _, userID := range t.usersWithPerUserIndex {
		if _, ok := t.indexSets[userID]; ok {
			continue
		}

		if !t.expirationChecker.IntervalMayHaveExpiredChunks(tableInterval, userID) {
			continue
		}

		var err error
		t.indexSets[userID], err = t.getOrCreateUserIndex(userID)
		if err != nil {
			return err
		}

		err = t.indexSets[userID].runRetention(t.tableMarker)
		if err != nil {
			return err
		}
	}

	return nil
}

// compactFiles compacts the given files into a single file.
func (t *table) compactFiles(files []storage.IndexFile) error {
	var err error
	level.Info(t.logger).Log("msg", "starting compaction of dbs")

	compactedDBName := filepath.Join(t.workingDirectory, fmt.Sprint(time.Now().Unix()))

	// if we find a previously compacted file, use it as a seed file and copy the other index files into it.
	seedSourceFileIdx := compactedFileIdx(files)
	if seedSourceFileIdx != -1 {
		t.uploadCompactedDB = true
		compactedDBName = filepath.Join(t.workingDirectory, files[seedSourceFileIdx].Name)

		level.Info(t.logger).Log("msg", fmt.Sprintf("using %s as seed file", files[seedSourceFileIdx].Name))
		err = shipper_util.DownloadFileFromStorage(compactedDBName, shipper_util.IsCompressedFile(files[seedSourceFileIdx].Name),
			false, shipper_util.LoggerWithFilename(t.logger, files[seedSourceFileIdx].Name), func() (io.ReadCloser, error) {
				return t.baseCommonIndexSet.GetFile(t.ctx, t.name, "", files[seedSourceFileIdx].Name)
			})
		if err != nil {
			return err
		}
	}

	t.compactedDB, err = openBoltdbFileWithNoSync(compactedDBName)
	if err != nil {
		return err
	}

	// go through each file and build index in FORMAT1 from FORMAT1 files and FORMAT3 from FORMAT2 files.
	return concurrency.ForEachJob(t.ctx, len(files), readDBsConcurrency, func(ctx context.Context, idx int) error {
		// skip the seed file.
		if idx == seedSourceFileIdx {
			return nil
		}

		fileName := files[idx].Name
		downloadAt := filepath.Join(t.workingDirectory, fileName)

		// use a locally scoped err here to avoid concurrent writes to the captured err from parallel jobs.
		err := shipper_util.DownloadFileFromStorage(downloadAt, shipper_util.IsCompressedFile(fileName),
			false, shipper_util.LoggerWithFilename(t.logger, fileName), func() (io.ReadCloser, error) {
				return t.baseCommonIndexSet.GetFile(t.ctx, t.name, "", fileName)
			})
		if err != nil {
			return err
		}

		return readFile(t.logger, downloadAt, t.writeBatch)
	})
}

// writeBatch routes a batch either to the common index in compactedDB or to the relevant user index set.
func (t *table) writeBatch(bucketName string, batch []indexEntry) error {
	if bucketName == shipper_util.GetUnsafeString(local.IndexBucketName) {
		return t.writeCommonIndex(batch)
	}

	return t.writeUserIndex(bucketName, batch)
}

// writeCommonIndex writes a batch to the compactedDB, which holds the FORMAT1 index.
func (t *table) writeCommonIndex(batch []indexEntry) error {
	t.uploadCompactedDB = true

	return t.compactedDB.Batch(func(tx *bbolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists(local.IndexBucketName)
		if err != nil {
			return err
		}

		for _, w := range batch {
			err = b.Put(w.k, w.v)
			if err != nil {
				return err
			}
		}

		return nil
	})
}

// writeUserIndex sends a batch to the user index set, which holds the FORMAT3 index.
func (t *table) writeUserIndex(userID string, batch []indexEntry) error {
	ui, err := t.getOrCreateUserIndex(userID)
	if err != nil {
		return errors.Wrapf(err, "failed to get user index for user %s", userID)
	}

	return ui.writeBatch(userID, batch)
}
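
// getOrCreateUserIndex returns the index set for the given user, lazily creating and caching it
// in a way that is safe for concurrent callers.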
func (t *table) getOrCreateUserIndex(userID string) (*indexSet, error) {
	// if the index set is already there, use it.
	t.indexSetsMtx.RLock()
	ui, ok := t.indexSets[userID]
	t.indexSetsMtx.RUnlock()

	if !ok {
		t.indexSetsMtx.Lock()
		// check if some other competing goroutine got the lock before us and created the index set; use it if so.
		ui, ok = t.indexSets[userID]
		if !ok {
			// index set not found, create one.
			level.Info(t.logger).Log("msg", fmt.Sprintf("initializing indexSet for user %s", userID))
			var err error
			ui, err = newUserIndex(t.ctx, t.name, userID, t.baseUserIndexSet, filepath.Join(t.workingDirectory, userID), t.logger)
			if err != nil {
				// release the lock before returning to avoid holding it forever on the error path.
				t.indexSetsMtx.Unlock()
				return nil, err
			}
			t.indexSets[userID] = ui
		}
		t.indexSetsMtx.Unlock()
	}

	return ui, ui.isReady()
}

// openBoltdbFileWithNoSync opens a boltdb file and configures it to not sync writes to disk.
// The compaction process is idempotent and we do not retain the files, so there is no need to sync them to disk.
func openBoltdbFileWithNoSync(path string) (*bbolt.DB, error) {
	boltdb, err := shipper_util.SafeOpenBoltdbFile(path)
	if err != nil {
		return nil, err
	}

	// no need to enforce writes to disk, we'll upload and delete the file anyway.
	boltdb.NoSync = true

	return boltdb, nil
}

// compactedFileIdx returns the index of a previously compacted file (one whose name starts with uploaderName).
// If no such file is found, it returns -1.
func compactedFileIdx(files []storage.IndexFile) int {
	for i, file := range files {
		if strings.HasPrefix(file.Name, uploaderName) {
			return i
		}
	}

	return -1
}

// readFile reads an index file and sends batches of index entries to the writeBatch func.
func readFile(logger log.Logger, path string, writeBatch func(bucketName string, batch []indexEntry) error) error {
	level.Debug(logger).Log("msg", "reading file for compaction", "path", path)

	db, err := openBoltdbFileWithNoSync(path)
	if err != nil {
		return err
	}

	defer func() {
		if err := db.Close(); err != nil {
			level.Error(logger).Log("msg", "failed to close db", "path", path, "err", err)
		}

		if err = os.Remove(path); err != nil {
			level.Error(logger).Log("msg", "failed to remove file", "path", path, "err", err)
		}
	}()

	batch := make([]indexEntry, 0, batchSize)

	return db.View(func(tx *bbolt.Tx) error {
		return tx.ForEach(func(name []byte, b *bbolt.Bucket) error {
			batch = batch[:0]
			bucketNameStr := string(name)
			err := b.ForEach(func(k, v []byte) error {
				ie := indexEntry{
					k: make([]byte, len(k)),
					v: make([]byte, len(v)),
				}

				// make a copy since k, v are only valid for the life of the transaction.
				// See: https://godoc.org/github.com/boltdb/bolt#Cursor.Seek
				copy(ie.k, k)
				copy(ie.v, v)

				batch = append(batch, ie)
				if len(batch) == cap(batch) {
					// batch is full, write it out and start a new one.
					err := writeBatch(bucketNameStr, batch)
					if err != nil {
						return err
					}
					batch = batch[:0]
				}

				return nil
			})
			if err != nil {
				return err
			}

			// write the remaining batch which might have been left unwritten because it never filled up.
			return writeBatch(bucketNameStr, batch)
		})
	})
}

// uploadFile uploads the compacted db in compressed format.
func uploadFile(compactedDBPath string, putFileFunc func(file io.ReadSeeker) error, logger log.Logger) error {
	// compress the compactedDB.
	compressedDBPath := fmt.Sprintf("%s.gz", compactedDBPath)
	err := shipper_util.CompressFile(compactedDBPath, compressedDBPath, false)
	if err != nil {
		return err
	}

	// open the file for reading.
	compressedDB, err := os.Open(compressedDBPath)
	if err != nil {
		return err
	}

	defer func() {
		if err := compressedDB.Close(); err != nil {
			level.Error(logger).Log("msg", "failed to close file", "path", compressedDBPath, "err", err)
		}

		if err := os.Remove(compressedDBPath); err != nil {
			level.Error(logger).Log("msg", "failed to remove file", "path", compressedDBPath, "err", err)
		}
	}()

	return putFileFunc(compressedDB)
}

// mustRecreateCompactedDB returns true if the compacted db should be recreated.
func mustRecreateCompactedDB(sourceFiles []storage.IndexFile) bool {
	if len(sourceFiles) != 1 {
		// do not recreate if there are multiple source files.
		return false
	} else if time.Since(sourceFiles[0].ModifiedAt) < recreateCompactedDBOlderThan {
		// do not recreate if the source file is younger than the threshold.
		return false
	}

	// recreate the compacted db only if we have not recreated it before.
	return !strings.HasSuffix(sourceFiles[0].Name, recreatedCompactedDBSuffix)
}