mirror of https://github.com/grafana/loki
bloom blocks downloading queue (#11201)
Implemented a bloom blocks downloading queue to control the concurrency of downloading blocks from storage.
Signed-off-by: Vladyslav Diachenko <vlad.diachenko@grafana.com>
parent 10fe48b815
commit 75cfe59596
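For orientation, here is a minimal caller-side sketch of the API introduced by this change, assembled from the code in the diff below. It is not part of the commit: it assumes it lives inside the bloomshipper package (newBlockDownloader and downloadBlocks are unexported) and that a concrete BlockClient, a Limits implementation, and a slice of BlockRef are provided elsewhere; the working directory, worker count, and tenant ID are placeholder values.

package bloomshipper

import (
	"context"
	"fmt"
	"time"

	"github.com/go-kit/log"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/loki/pkg/storage/stores/shipper/bloomshipper/config"
)

// downloadBlocksExample is a hypothetical helper showing the intended flow:
// build the downloader, enqueue the block references, then drain both channels.
func downloadBlocksExample(blockClient BlockClient, limits Limits, refs []BlockRef) error {
	downloader, err := newBlockDownloader(config.Config{
		WorkingDirectory: "/tmp/bloom-blocks", // placeholder path
		BlocksDownloadingQueue: config.DownloadingQueueConfig{
			WorkersCount:              4,   // number of concurrent download workers
			MaxTasksEnqueuedPerTenant: 100, // per-tenant backpressure
		},
	}, blockClient, limits, log.NewNopLogger(), prometheus.NewRegistry())
	if err != nil {
		return err
	}
	defer downloader.stop()

	blocksCh, errCh := downloader.downloadBlocks(context.Background(), "tenant-a", refs)
	// each enqueued task reports exactly once, either a blockWithQuerier or an error
	for i := 0; i < len(refs); i++ {
		select {
		case block := <-blocksCh:
			fmt.Println("downloaded and extracted", block.BlockPath)
		case err := <-errCh:
			return err
		case <-time.After(time.Minute):
			return fmt.Errorf("timed out waiting for block downloads")
		}
	}
	return nil
}

The worker goroutines started in newBlockDownloader fetch each block via GetBlock, extract the archive under WorkingDirectory, and hand back a BlockQuerier, so the caller only selects on the two returned channels.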
@@ -0,0 +1,230 @@
package bloomshipper

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/services"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/loki/pkg/queue"
	v1 "github.com/grafana/loki/pkg/storage/bloom/v1"
	"github.com/grafana/loki/pkg/storage/stores/shipper/bloomshipper/config"
	"github.com/grafana/loki/pkg/util"
	"github.com/grafana/loki/pkg/util/constants"
)

type blockDownloader struct {
	logger log.Logger

	workingDirectory   string
	queueMetrics       *queue.Metrics
	queue              *queue.RequestQueue
	blockClient        BlockClient
	limits             Limits
	activeUsersService *util.ActiveUsersCleanupService

	ctx                  context.Context
	manager              *services.Manager
	onWorkerStopCallback func()
}

func newBlockDownloader(config config.Config, blockClient BlockClient, limits Limits, logger log.Logger, reg prometheus.Registerer) (*blockDownloader, error) {
	queueMetrics := queue.NewMetrics(reg, constants.Loki, "bloom_blocks_downloader")
	// add cleanup service
	downloadingQueue := queue.NewRequestQueue(config.BlocksDownloadingQueue.MaxTasksEnqueuedPerTenant, time.Minute, queueMetrics)
	activeUsersService := util.NewActiveUsersCleanupWithDefaultValues(queueMetrics.Cleanup)

	ctx := context.Background()
	manager, err := services.NewManager(downloadingQueue, activeUsersService)
	if err != nil {
		return nil, fmt.Errorf("error creating service manager: %w", err)
	}
	err = services.StartManagerAndAwaitHealthy(ctx, manager)
	if err != nil {
		return nil, fmt.Errorf("error starting service manager: %w", err)
	}

	b := &blockDownloader{
		ctx:                  ctx,
		logger:               logger,
		workingDirectory:     config.WorkingDirectory,
		queueMetrics:         queueMetrics,
		queue:                downloadingQueue,
		blockClient:          blockClient,
		activeUsersService:   activeUsersService,
		limits:               limits,
		manager:              manager,
		onWorkerStopCallback: onWorkerStopNoopCallback,
	}

	for i := 0; i < config.BlocksDownloadingQueue.WorkersCount; i++ {
		go b.serveDownloadingTasks(fmt.Sprintf("worker-%d", i))
	}
	return b, nil
}

type BlockDownloadingTask struct {
	ctx   context.Context
	block BlockRef
	// ErrCh is a send-only channel to write an error to
	ErrCh chan<- error
	// ResultsCh is a send-only channel to return the block querier for the downloaded block
	ResultsCh chan<- blockWithQuerier
}

func NewBlockDownloadingTask(ctx context.Context, block BlockRef, resCh chan<- blockWithQuerier, errCh chan<- error) *BlockDownloadingTask {
	return &BlockDownloadingTask{
		ctx:       ctx,
		block:     block,
		ErrCh:     errCh,
		ResultsCh: resCh,
	}
}

// noop implementation
var onWorkerStopNoopCallback = func() {}

func (d *blockDownloader) serveDownloadingTasks(workerID string) {
	logger := log.With(d.logger, "worker", workerID)
	level.Debug(logger).Log("msg", "starting worker")

	d.queue.RegisterConsumerConnection(workerID)
	defer d.queue.UnregisterConsumerConnection(workerID)
	// this callback is used only in the tests to assert that the worker is stopped
	defer d.onWorkerStopCallback()

	idx := queue.StartIndexWithLocalQueue

	for {
		item, newIdx, err := d.queue.Dequeue(d.ctx, idx, workerID)
		if err != nil {
			if !errors.Is(err, queue.ErrStopped) && !errors.Is(err, context.Canceled) {
				level.Error(logger).Log("msg", "failed to dequeue task", "err", err)
				continue
			}
			level.Info(logger).Log("msg", "stopping worker")
			return
		}
		task, ok := item.(*BlockDownloadingTask)
		if !ok {
			level.Error(logger).Log("msg", "failed to cast to BlockDownloadingTask", "item", fmt.Sprintf("%+v", item), "type", fmt.Sprintf("%T", item))
			continue
		}

		idx = newIdx
		blockPath := task.block.BlockPath
		// todo: add cache before downloading
		level.Debug(logger).Log("msg", "start downloading the block", "block", blockPath)
		block, err := d.blockClient.GetBlock(task.ctx, task.block)
		if err != nil {
			level.Error(logger).Log("msg", "error downloading the block", "block", blockPath, "err", err)
			task.ErrCh <- fmt.Errorf("error downloading the block %s : %w", blockPath, err)
			continue
		}
		directory, err := d.extractBlock(&block, time.Now())
		if err != nil {
			level.Error(logger).Log("msg", "error extracting the block", "block", blockPath, "err", err)
			task.ErrCh <- fmt.Errorf("error extracting the block %s : %w", blockPath, err)
			continue
		}
		level.Debug(d.logger).Log("msg", "block has been downloaded and extracted", "block", task.block.BlockPath, "directory", directory)
		blockQuerier := d.createBlockQuerier(directory)
		task.ResultsCh <- blockWithQuerier{
			BlockRef:     task.block,
			BlockQuerier: blockQuerier,
		}
	}
}

func (d *blockDownloader) downloadBlocks(ctx context.Context, tenantID string, references []BlockRef) (chan blockWithQuerier, chan error) {
	d.activeUsersService.UpdateUserTimestamp(tenantID, time.Now())
	// errCh must be buffered with enough capacity for the maximum possible number of errors to avoid a deadlock:
	// a queue worker (or this goroutine itself) may report an error to this channel before anything reads from it,
	// and with an unbuffered channel that send would block forever.
	errCh := make(chan error, len(references))
	blocksCh := make(chan blockWithQuerier, len(references))

	downloadingParallelism := d.limits.BloomGatewayBlocksDownloadingParallelism(tenantID)
	for _, reference := range references {
		task := NewBlockDownloadingTask(ctx, reference, blocksCh, errCh)
		level.Debug(d.logger).Log("msg", "enqueuing task to download block", "block", reference.BlockPath)
		err := d.queue.Enqueue(tenantID, nil, task, downloadingParallelism, nil)
		if err != nil {
			errCh <- fmt.Errorf("error enqueuing downloading task for block %s : %w", reference.BlockPath, err)
			return blocksCh, errCh
		}
	}
	return blocksCh, errCh
}

type blockWithQuerier struct {
	BlockRef
	*v1.BlockQuerier
}

// extractBlock extracts the block files into a directory and returns the absolute path to this directory.
func (d *blockDownloader) extractBlock(block *Block, ts time.Time) (string, error) {
	workingDirectoryPath := filepath.Join(d.workingDirectory, block.BlockPath, strconv.FormatInt(ts.UnixMilli(), 10))
	err := os.MkdirAll(workingDirectoryPath, os.ModePerm)
	if err != nil {
		return "", fmt.Errorf("cannot create directory to extract the block: %w", err)
	}
	archivePath, err := writeDataToTempFile(workingDirectoryPath, block)
	if err != nil {
		return "", fmt.Errorf("error writing data to temp file: %w", err)
	}
	defer func() {
		os.Remove(archivePath)
		// todo: log err
	}()
	err = extractArchive(archivePath, workingDirectoryPath)
	if err != nil {
		return "", fmt.Errorf("error extracting archive: %w", err)
	}
	return workingDirectoryPath, nil
}

func (d *blockDownloader) createBlockQuerier(directory string) *v1.BlockQuerier {
	reader := v1.NewDirectoryBlockReader(directory)
	block := v1.NewBlock(reader)
	return v1.NewBlockQuerier(block)
}

func (d *blockDownloader) stop() {
	_ = services.StopManagerAndAwaitStopped(d.ctx, d.manager)
}

func writeDataToTempFile(workingDirectoryPath string, block *Block) (string, error) {
	defer block.Data.Close()
	archivePath := filepath.Join(workingDirectoryPath, block.BlockPath[strings.LastIndex(block.BlockPath, delimiter)+1:])

	archiveFile, err := os.Create(archivePath)
	if err != nil {
		return "", fmt.Errorf("error creating empty file to store the archive: %w", err)
	}
	defer archiveFile.Close()
	_, err = io.Copy(archiveFile, block.Data)
	if err != nil {
		return "", fmt.Errorf("error writing data to archive file: %w", err)
	}
	return archivePath, nil
}

func extractArchive(archivePath string, workingDirectoryPath string) error {
	file, err := os.Open(archivePath)
	if err != nil {
		// use archivePath in the message: file is nil when os.Open fails
		return fmt.Errorf("error opening archive file %s: %w", archivePath, err)
	}
	return v1.UnTarGz(workingDirectoryPath, file)
}

@@ -0,0 +1,168 @@
package bloomshipper

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"testing"
	"time"

	"github.com/go-kit/log"
	"github.com/google/uuid"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/stretchr/testify/require"
	"go.uber.org/atomic"

	v1 "github.com/grafana/loki/pkg/storage/bloom/v1"
	"github.com/grafana/loki/pkg/storage/stores/shipper/bloomshipper/config"
	"github.com/grafana/loki/pkg/validation"
)

func Test_blockDownloader_downloadBlocks(t *testing.T) {
	overrides, err := validation.NewOverrides(validation.Limits{BloomGatewayBlocksDownloadingParallelism: 20}, nil)
	require.NoError(t, err)
	workingDirectory := t.TempDir()

	blockReferences, blockClient := createFakeBlocks(t, 20)
	blockClient.responseDelay = 100 * time.Millisecond
	workersCount := 10
	downloader, err := newBlockDownloader(config.Config{
		WorkingDirectory: workingDirectory,
		BlocksDownloadingQueue: config.DownloadingQueueConfig{
			WorkersCount:              workersCount,
			MaxTasksEnqueuedPerTenant: 20,
		},
	}, blockClient, overrides, log.NewNopLogger(), prometheus.DefaultRegisterer)
	stoppedWorkersCount := atomic.NewInt32(0)
	downloader.onWorkerStopCallback = func() {
		stoppedWorkersCount.Inc()
	}
	require.NoError(t, err)
	blocksCh, errorsCh := downloader.downloadBlocks(context.Background(), "fake", blockReferences)
	downloadedBlocks := make(map[string]any, len(blockReferences))
	done := make(chan bool)
	go func() {
		for i := 0; i < 20; i++ {
			block := <-blocksCh
			downloadedBlocks[block.BlockPath] = nil
		}
		done <- true
	}()

	select {
	// 20 blocks, 10 workers, fixed delay of 100ms per block: the total downloading time must be ~200ms.
	case <-time.After(2 * time.Second):
		t.Fatalf("test must complete before the timeout")
	case err := <-errorsCh:
		require.NoError(t, err)
	case <-done:
	}
	require.Len(t, downloadedBlocks, 20, "all 20 blocks must be downloaded")

	downloader.stop()
	require.Eventuallyf(t, func() bool {
		return stoppedWorkersCount.Load() == int32(workersCount)
	}, 1*time.Second, 10*time.Millisecond, "expected all %d workers to be stopped", workersCount)
}

// createFakeBlocks creates `count` fake blocks and returns their refs together with a mockBlockClient that serves them
func createFakeBlocks(t *testing.T, count int) ([]BlockRef, *mockBlockClient) {
	mockData := make(map[string]Block, count)
	refs := make([]BlockRef, 0, count)
	for i := 0; i < count; i++ {
		archive, _, _ := createBlockArchive(t)
		block := Block{
			BlockRef: BlockRef{
				BlockPath: fmt.Sprintf("block-path-%d", i),
			},
			Data: archive,
		}
		mockData[block.BlockPath] = block
		refs = append(refs, block.BlockRef)
	}
	return refs, &mockBlockClient{mockData: mockData}
}

type mockBlockClient struct {
	responseDelay time.Duration
	mockData      map[string]Block
}

func (m *mockBlockClient) GetBlock(_ context.Context, reference BlockRef) (Block, error) {
	time.Sleep(m.responseDelay)
	block, exists := m.mockData[reference.BlockPath]
	if exists {
		return block, nil
	}

	return block, fmt.Errorf("block %s is not found in mockData", reference.BlockPath)
}

func (m *mockBlockClient) PutBlocks(_ context.Context, _ []Block) ([]Block, error) {
	panic("implement me")
}

func (m *mockBlockClient) DeleteBlocks(_ context.Context, _ []BlockRef) error {
	panic("implement me")
}

func Test_blockDownloader_extractBlock(t *testing.T) {
	blockFile, bloomFileContent, seriesFileContent := createBlockArchive(t)

	workingDir := t.TempDir()
	downloader := &blockDownloader{workingDirectory: workingDir}
	ts := time.Now().UTC()
	block := Block{
		BlockRef: BlockRef{BlockPath: "first-period-19621/tenantA/metas/ff-fff-1695272400-1695276000-aaa"},
		Data:     blockFile,
	}

	actualPath, err := downloader.extractBlock(&block, ts)

	require.NoError(t, err)
	expectedPath := filepath.Join(workingDir, block.BlockPath, strconv.FormatInt(ts.UnixMilli(), 10))
	require.Equal(t, expectedPath, actualPath,
		"expected archive to be extracted to the working directory under the same path as blockPath, with a timestamp suffix")
	require.FileExists(t, filepath.Join(expectedPath, v1.BloomFileName))
	require.FileExists(t, filepath.Join(expectedPath, v1.SeriesFileName))

	actualBloomFileContent, err := os.ReadFile(filepath.Join(expectedPath, v1.BloomFileName))
	require.NoError(t, err)
	require.Equal(t, bloomFileContent, string(actualBloomFileContent))

	actualSeriesFileContent, err := os.ReadFile(filepath.Join(expectedPath, v1.SeriesFileName))
	require.NoError(t, err)
	require.Equal(t, seriesFileContent, string(actualSeriesFileContent))
}

func createBlockArchive(t *testing.T) (*os.File, string, string) {
	dir := t.TempDir()
	mockBlockDir := filepath.Join(dir, "mock-block-dir")
	err := os.MkdirAll(mockBlockDir, 0777)
	require.NoError(t, err)
	bloomFile, err := os.Create(filepath.Join(mockBlockDir, v1.BloomFileName))
	require.NoError(t, err)
	bloomFileContent := uuid.NewString()
	_, err = io.Copy(bloomFile, bytes.NewReader([]byte(bloomFileContent)))
	require.NoError(t, err)

	seriesFile, err := os.Create(filepath.Join(mockBlockDir, v1.SeriesFileName))
	require.NoError(t, err)
	seriesFileContent := uuid.NewString()
	_, err = io.Copy(seriesFile, bytes.NewReader([]byte(seriesFileContent)))
	require.NoError(t, err)

	blockFilePath := filepath.Join(dir, "test-block-archive")
	file, err := os.OpenFile(blockFilePath, os.O_CREATE|os.O_RDWR, 0700)
	require.NoError(t, err)
	err = v1.TarGz(file, v1.NewDirectoryBlockReader(mockBlockDir))
	require.NoError(t, err)

	blockFile, err := os.OpenFile(blockFilePath, os.O_RDONLY, 0700)
	require.NoError(t, err)
	return blockFile, bloomFileContent, seriesFileContent
}