mirror of https://github.com/grafana/loki
chore(dataobj): Download pages in 16MB batches (#16689)
parent
bc2111e8f1
commit
8c4eb428c3
@ -0,0 +1,109 @@ |
||||
package encoding |
||||
|
||||
import ( |
||||
"cmp" |
||||
"iter" |
||||
"slices" |
||||
) |
||||
|
||||
// The windowing utilities allow for grouping subsections of a file into
|
||||
// windows of a specified size; for example, given a slice of pages to
|
||||
// download and a window size of 16MB, pages will be grouped such that the
|
||||
// first byte of the first page and the last byte of the last page are no more
|
||||
// than 16MB apart.
|
||||
|
||||
// window represents a window of file subsections.
type window[T any] []windowedElement[T]

// Start returns the first element in the window. It returns the zero value
// of T when the window is empty.
func (w window[T]) Start() T {
	if len(w) > 0 {
		return w[0].Data
	}
	var zero T
	return zero
}

// End returns the last element in the window. It returns the zero value of T
// when the window is empty.
func (w window[T]) End() T {
	if n := len(w); n > 0 {
		return w[n-1].Data
	}
	var zero T
	return zero
}

// windowedElement pairs windowed data with its index in the slice it was
// windowed from.
type windowedElement[T any] struct {
	Data     T   // Windowed data.
	Position int // Position of the element in the original slice pre-windowing.
}

// getElementInfo reports the byte offset and byte size of an element v.
type getElementInfo[T any] func(v T) (offset, size uint64)
||||
|
||||
// iterWindows groups elements into windows of a specified size, returning an
|
||||
// iterator over the windows. The input slice is not modified.
|
||||
func iterWindows[T any](elements []T, getInfo getElementInfo[T], windowSize int64) iter.Seq[window[T]] { |
||||
// Sort elements by their start position.
|
||||
sortedElements := make(window[T], len(elements)) |
||||
for i, element := range elements { |
||||
sortedElements[i] = windowedElement[T]{Data: element, Position: i} |
||||
} |
||||
slices.SortFunc(sortedElements, func(a, b windowedElement[T]) int { |
||||
aOffset, _ := getInfo(a.Data) |
||||
bOffset, _ := getInfo(b.Data) |
||||
return cmp.Compare(aOffset, bOffset) |
||||
}) |
||||
|
||||
return func(yield func(window[T]) bool) { |
||||
var start, end int |
||||
|
||||
for end < len(sortedElements) { |
||||
startElement := sortedElements[start] |
||||
currentElement := sortedElements[end] |
||||
|
||||
var ( |
||||
startOffset, _ = getInfo(startElement.Data) |
||||
endOffset, endSize = getInfo(currentElement.Data) |
||||
) |
||||
|
||||
var ( |
||||
startByte = startOffset |
||||
endByte = endOffset + endSize |
||||
) |
||||
|
||||
switch { |
||||
case endByte-startByte > uint64(windowSize) && start == end: |
||||
// We have an empty window and the element is larger than the current
|
||||
// window size. We want to immediately add the page into the window and
|
||||
// yield what we have.
|
||||
end++ |
||||
|
||||
if !yield(sortedElements[start:end]) { |
||||
return |
||||
} |
||||
start = end |
||||
|
||||
case endByte-startByte > uint64(windowSize) && start < end: |
||||
// Including end in the window would exceed the window size; we yield
|
||||
// everything up to end and start a new window from end.
|
||||
//
|
||||
// We *do not* increment end here; if we did, we would start with two
|
||||
// elements in the next window.
|
||||
if !yield(sortedElements[start:end]) { |
||||
return |
||||
} |
||||
start = end |
||||
|
||||
default: |
||||
// The element fits within the window size; move end forward so it gets
|
||||
// included.
|
||||
end++ |
||||
} |
||||
} |
||||
|
||||
// Yield all remaining elements.
|
||||
if start < len(sortedElements) { |
||||
yield(sortedElements[start:]) |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,147 @@ |
||||
package encoding |
||||
|
||||
import ( |
||||
"fmt" |
||||
"slices" |
||||
"testing" |
||||
|
||||
"github.com/stretchr/testify/require" |
||||
|
||||
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd" |
||||
) |
||||
|
||||
func Test_windowPages(t *testing.T) { |
||||
tt := []struct { |
||||
name string |
||||
pages []*fakePageDesc |
||||
windowSize int64 |
||||
expect []window[*fakePageDesc] |
||||
}{ |
||||
{ |
||||
name: "empty pages", |
||||
pages: nil, |
||||
windowSize: 1_000_000, |
||||
expect: nil, |
||||
}, |
||||
{ |
||||
name: "single page smaller than window", |
||||
pages: []*fakePageDesc{newFakePage(0, 100)}, |
||||
windowSize: 1_000_000, |
||||
expect: []window[*fakePageDesc]{ |
||||
{{Data: newFakePage(0, 100), Position: 0}}, |
||||
}, |
||||
}, |
||||
{ |
||||
name: "single page larger than window", |
||||
pages: []*fakePageDesc{newFakePage(0, 5_000_000)}, |
||||
windowSize: 5_000_000, |
||||
expect: []window[*fakePageDesc]{ |
||||
{{Data: newFakePage(0, 5_000_000), Position: 0}}, |
||||
}, |
||||
}, |
||||
{ |
||||
name: "basic grouping", |
||||
pages: []*fakePageDesc{ |
||||
newFakePage(0, 100), |
||||
newFakePage(100, 100), |
||||
newFakePage(200, 100), |
||||
|
||||
newFakePage(1500, 100), |
||||
newFakePage(1600, 100), |
||||
}, |
||||
windowSize: 1000, |
||||
expect: []window[*fakePageDesc]{ |
||||
{ |
||||
{Data: newFakePage(0, 100), Position: 0}, |
||||
{Data: newFakePage(100, 100), Position: 1}, |
||||
{Data: newFakePage(200, 100), Position: 2}, |
||||
}, |
||||
{ |
||||
{Data: newFakePage(1500, 100), Position: 3}, |
||||
{Data: newFakePage(1600, 100), Position: 4}, |
||||
}, |
||||
}, |
||||
}, |
||||
{ |
||||
name: "basic grouping (unordered)", |
||||
pages: []*fakePageDesc{ |
||||
newFakePage(1500, 100), |
||||
newFakePage(200, 100), |
||||
newFakePage(100, 100), |
||||
|
||||
newFakePage(1600, 100), |
||||
newFakePage(0, 100), |
||||
}, |
||||
windowSize: 1000, |
||||
expect: []window[*fakePageDesc]{ |
||||
{ |
||||
{Data: newFakePage(0, 100), Position: 4}, |
||||
{Data: newFakePage(100, 100), Position: 2}, |
||||
{Data: newFakePage(200, 100), Position: 1}, |
||||
}, |
||||
{ |
||||
{Data: newFakePage(1500, 100), Position: 0}, |
||||
{Data: newFakePage(1600, 100), Position: 3}, |
||||
}, |
||||
}, |
||||
}, |
||||
{ |
||||
name: "grouping with large page", |
||||
pages: []*fakePageDesc{ |
||||
newFakePage(0, 100), |
||||
newFakePage(100, 100), |
||||
newFakePage(200, 1000), |
||||
newFakePage(300, 100), |
||||
newFakePage(400, 100), |
||||
}, |
||||
windowSize: 500, |
||||
expect: []window[*fakePageDesc]{ |
||||
{ |
||||
{Data: newFakePage(0, 100), Position: 0}, |
||||
{Data: newFakePage(100, 100), Position: 1}, |
||||
}, |
||||
{ |
||||
{Data: newFakePage(200, 1000), Position: 2}, |
||||
}, |
||||
{ |
||||
{Data: newFakePage(300, 100), Position: 3}, |
||||
{Data: newFakePage(400, 100), Position: 4}, |
||||
}, |
||||
}, |
||||
}, |
||||
} |
||||
|
||||
for _, tc := range tt { |
||||
t.Run(tc.name, func(t *testing.T) { |
||||
getInfo := func(p *fakePageDesc) (uint64, uint64) { |
||||
return p.Info.DataOffset, p.Info.DataSize |
||||
} |
||||
actual := slices.Collect(iterWindows(tc.pages, getInfo, tc.windowSize)) |
||||
|
||||
for wi, w := range actual { |
||||
for pi, p := range w { |
||||
t.Logf("window %d page %d: %#v\n", wi, pi, p.Data) |
||||
} |
||||
} |
||||
|
||||
require.Equal(t, tc.expect, actual) |
||||
}) |
||||
} |
||||
} |
||||
|
||||
type fakePageDesc struct{ Info *datasetmd.PageInfo } |
||||
|
||||
func (f *fakePageDesc) GetInfo() *datasetmd.PageInfo { return f.Info } |
||||
|
||||
func (f *fakePageDesc) GoString() string { |
||||
return fmt.Sprintf("(start: %d, size: %d)", f.Info.DataOffset, f.Info.DataSize) |
||||
} |
||||
|
||||
func newFakePage(offset, size uint64) *fakePageDesc { |
||||
return &fakePageDesc{ |
||||
Info: &datasetmd.PageInfo{ |
||||
DataOffset: offset, |
||||
DataSize: size, |
||||
}, |
||||
} |
||||
} |
Loading…
Reference in new issue