mirror of https://github.com/grafana/loki
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
389 lines
12 KiB
389 lines
12 KiB
package dataset
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
|
|
"github.com/grafana/loki/v3/pkg/columnar"
|
|
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd"
|
|
"github.com/grafana/loki/v3/pkg/memory"
|
|
)
|
|
|
|
// pageReader reads values out of a single [Page], decoding the page's
// presence bitmap and value stream. A pageReader can be reused across pages
// via [pageReader.Reset].
type pageReader struct {
	page         Page                      // Page being read.
	physicalType datasetmd.PhysicalType    // Physical type the page's values are expected to hold.
	compression  datasetmd.CompressionType // Compression the page's data is expected to use.
	ready        bool                      // Whether the pageReader is initialized for page.

	// Type/encoding used by the most recent init; lets init reuse valuesDec
	// when the new page matches the previous one.
	lastPhysicalType datasetmd.PhysicalType
	lastEncoding     datasetmd.EncodingType

	closer      io.Closer      // Closer for the currently opened page data, if any.
	presenceDec *bitmapDecoder // Decoder for the page's presence (null/non-null) bitmap.
	valuesDec   valueDecoder   // Decoder for the page's non-null values.

	pageRow int64 // Row offset the underlying decoders are positioned at.
	nextRow int64 // Row offset the next Read should start from (set by Seek).
}
|
|
|
|
// newPageReader returns a new pageReader that reads from the provided page.
|
|
// The page must hold values of the provided value type, and be compressed with
|
|
// the provided compression type.
|
|
func newPageReader(p Page, physicalType datasetmd.PhysicalType, compression datasetmd.CompressionType) *pageReader {
|
|
var pr pageReader
|
|
pr.Reset(p, physicalType, compression)
|
|
return &pr
|
|
}
|
|
|
|
// Read returns an array of up to the next count values from the page.
|
|
// At the end of the page, Read returns nil, io.EOF.
|
|
//
|
|
// If there was an error reading the page, Read returns the error with
|
|
// no array.
|
|
func (pr *pageReader) Read(ctx context.Context, alloc *memory.Allocator, count int) (columnar.Array, error) {
|
|
// We need to initialize our readers before we can read from the page.
|
|
//
|
|
// If we've seeked backwards and our page row is now ahead of the row we want
|
|
// to read, we need to reinitialize the page reader to read from the start of
|
|
// the page.
|
|
if !pr.ready || pr.pageRow > pr.nextRow {
|
|
err := pr.init(ctx)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// "Skip" rows until we reach the starting row we want to read.
|
|
if err := pr.skipUnwantedRows(alloc); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Do the real read now.
|
|
arr, err := pr.readColumnar(alloc, count, false)
|
|
if arr != nil {
|
|
pr.nextRow += int64(arr.Len())
|
|
}
|
|
return arr, err
|
|
}
|
|
|
|
func (pr *pageReader) skipUnwantedRows(alloc *memory.Allocator) error {
|
|
if pr.pageRow >= pr.nextRow {
|
|
// Nothing to skip.
|
|
return nil
|
|
}
|
|
|
|
// Since we don't need the values to live beyond this read call, we can
|
|
// create a short-lived allocator. This will also allow the "real" read to
|
|
// reuse any memory that was created during this step.
|
|
tempAlloc := memory.NewAllocator(alloc)
|
|
defer tempAlloc.Free()
|
|
|
|
readCount := int(pr.nextRow - pr.pageRow)
|
|
_, err := pr.readColumnar(alloc, readCount, true)
|
|
return err
|
|
}
|
|
|
|
// readColumnar implements the actual Read operation for count rows. If skip is
// true, no array values are returned, permitting for skipping expensive work.
//
// readColumnar advances pr.pageRow by the number of rows consumed. On reaching
// the end of the page with no rows read, it returns nil, io.EOF and eagerly
// closes the underlying page reader.
func (pr *pageReader) readColumnar(alloc *memory.Allocator, count int, skip bool) (columnar.Array, error) {
	// First read presence values for the next count rows.
	bm := memory.NewBitmap(alloc, count)
	err := pr.presenceDec.DecodeTo(&bm, count)

	// gotCount is the number of rows actually decoded; it may be less than
	// count near the end of the page.
	gotCount := bm.Len()
	if err != nil && !errors.Is(err, io.EOF) {
		return nil, err
	} else if gotCount == 0 && errors.Is(err, io.EOF) {
		// If we've hit EOF, we can immediately close the inner reader to release
		// any resources back, rather than waiting for the next call to
		// [pageReader.init] to do it.
		_ = pr.Close()

		return nil, io.EOF
	} else if gotCount == 0 {
		return nil, nil
	}

	// The number of bits set to 1 in presenceBuf determines how many values we
	// need to read from the inner page.
	presentCount := bm.SetCount()

	var values columnar.Array

	// Now fill up to presentCount values of concrete values.
	if presentCount > 0 {
		// TODO(rfratto): Add a "skip" mode to decoders to allow them to bypass
		// building an array if it's not going to be used.
		values, err = pr.valuesDec.Decode(alloc, presentCount)
		if err != nil && !errors.Is(err, io.EOF) {
			return nil, err
		} else if values == nil {
			return nil, fmt.Errorf("unexpected nil values")
		} else if values.Len() != presentCount {
			return nil, fmt.Errorf("unexpected number of values: %d, expected: %d", values.Len(), presentCount)
		}
	}

	// Track how far the decoders have advanced into the page.
	pr.pageRow += int64(gotCount)

	if skip {
		// Caller only wanted to advance the decoders; don't build an array.
		return nil, nil
	}
	// Expand the dense values into a sparse array using the presence bitmap.
	return materializeSparseArray(alloc, pr.lastPhysicalType, bm, values)
}
|
|
|
|
// init (re)initializes the pageReader to read from the start of pr.page:
// it fetches the page data, opens it with the configured compression, and
// resets or recreates the presence and value decoders. On success, pr is
// marked ready and positioned at row 0.
func (pr *pageReader) init(ctx context.Context) error {
	// Close any existing reader from a previous pageReader init. Even though
	// this also happens in [pageReader.Close], we want to do it here as well in
	// case we seeked backwards in a file.
	if err := pr.Close(); err != nil {
		return fmt.Errorf("closing previous page: %w", err)
	}

	// Fetch the raw (still compressed) page data.
	data, err := pr.page.ReadPage(ctx)
	if err != nil {
		return err
	}

	// Wrap the raw data in a MemPage so it can be opened for decoding.
	memPage := &MemPage{
		Desc: *pr.page.PageDesc(),
		Data: data,
	}

	openedPage, pageCloser, err := memPage.open(pr.compression)
	if err != nil {
		return fmt.Errorf("opening page for reading: %w", err)
	}

	// Reuse (or lazily create) the presence decoder and point it at the new
	// page's presence stream.
	pr.presenceDec = pr.getPresenceDecoder()
	pr.presenceDec.Reset(openedPage.PresenceData)

	// The value decoder can only be reused when the new page has the same
	// physical type and encoding as the previous one; otherwise build a new
	// decoder for this type/encoding pair.
	if pr.valuesDec == nil || pr.lastPhysicalType != pr.physicalType || pr.lastEncoding != memPage.Desc.Encoding {
		var ok bool
		pr.valuesDec, ok = newValueDecoder(pr.physicalType, memPage.Desc.Encoding, openedPage.ValueData)
		if !ok {
			return fmt.Errorf("unsupported value encoding %s/%s", pr.physicalType, memPage.Desc.Encoding)
		}
	} else {
		pr.valuesDec.Reset(openedPage.ValueData)
	}

	// Record state for subsequent reads and future init calls.
	pr.ready = true
	pr.closer = pageCloser
	pr.lastPhysicalType = pr.physicalType
	pr.lastEncoding = memPage.Desc.Encoding
	pr.pageRow = 0
	return nil
}
|
|
|
|
// materializeSparseArray materializes a dense array into a sparse [Value] slice
|
|
// based on a presence bitmap. If denseValues is nil, a [columnar.Null] is
|
|
// returned with the length of validity.
|
|
//
|
|
// If denseValues is non-nil, denseValues.Len() must be equal to
|
|
// validity.ClearCount().
|
|
//
|
|
// # Safety
|
|
//
|
|
// Memory from validity and denseValues may be moved to the returned array.
|
|
// These values must have been allocated with alloc to prevent use-after-free.
|
|
func materializeSparseArray(alloc *memory.Allocator, typ datasetmd.PhysicalType, validity memory.Bitmap, denseValues columnar.Array) (columnar.Array, error) {
|
|
if denseValues != nil && validity.SetCount() != denseValues.Len() {
|
|
panic(fmt.Sprintf("invariant broken: validity set count (%d) is not array length (%d)", validity.SetCount(), denseValues.Len()))
|
|
}
|
|
|
|
switch arr := denseValues.(type) {
|
|
case *columnar.UTF8:
|
|
return materializeSparseUTF8(alloc, validity, arr)
|
|
case *columnar.Number[int64]:
|
|
return materializeSparseNumber[int64](alloc, validity, arr)
|
|
case *columnar.Number[uint64]:
|
|
return materializeSparseNumber[uint64](alloc, validity, arr)
|
|
case nil:
|
|
return materializeNulls(alloc, typ, validity)
|
|
default:
|
|
panic(fmt.Sprintf("found unexpected type %T", arr))
|
|
}
|
|
}
|
|
|
|
// materializeSparseUTF8 expands a dense UTF8 array into a sparse one by
// rebuilding the offsets buffer: null rows get a zero-length entry (repeating
// the previous offset) while present rows consume the next dense value. The
// underlying string data buffer is reused as-is.
func materializeSparseUTF8(alloc *memory.Allocator, validity memory.Bitmap, denseValues *columnar.UTF8) (columnar.Array, error) {
	// The data buffer can remain the same, but we need to make a new offsets
	// buffer to account for all the nulls.
	offsetsBuf := memory.NewBuffer[int32](alloc, validity.Len()+1)
	offsetsBuf.Resize(validity.Len() + 1)
	offsets := offsetsBuf.Data()

	// Since we're moving the data directly from the dense values array, our
	// offsets need to start whenever the source offsets starts. Based on our
	// decoders, this will always be 0, but we keep this logic here to be
	// defensive.
	srcOffsets := denseValues.Offsets()
	offsets[0] = srcOffsets[0]

	var (
		denseIndex = 0          // Index of the next dense value to consume.
		lastOffset = offsets[0] // End offset of the last present value seen.
	)

	for i := range validity.Len() {
		if !validity.Get(i) {
			// Null row: zero-length entry; end offset equals the previous one.
			offsets[i+1] = lastOffset
			continue
		}

		// Find the end offset to push from the src.
		srcEnd := srcOffsets[denseIndex+1]
		denseIndex++

		offsets[i+1] = srcEnd
		lastOffset = srcEnd
	}

	return columnar.NewUTF8(denseValues.Data(), offsets, validity), nil
}
|
|
|
|
func materializeSparseNumber[T columnar.Numeric](alloc *memory.Allocator, validity memory.Bitmap, denseValues *columnar.Number[T]) (columnar.Array, error) {
|
|
valuesBuf := memory.NewBuffer[T](alloc, validity.Len())
|
|
valuesBuf.Resize(validity.Len())
|
|
values := valuesBuf.Data()
|
|
|
|
srcValues := denseValues.Values()
|
|
|
|
var srcIndex int
|
|
for i := range validity.Len() {
|
|
if !validity.Get(i) {
|
|
continue
|
|
}
|
|
values[i] = srcValues[srcIndex]
|
|
srcIndex++
|
|
}
|
|
return columnar.NewNumber[T](values, validity), nil
|
|
}
|
|
|
|
func materializeNulls(alloc *memory.Allocator, typ datasetmd.PhysicalType, validity memory.Bitmap) (columnar.Array, error) {
|
|
if validity.SetCount() > 0 {
|
|
panic(fmt.Sprintf("unexpected non-null values: %d", validity.SetCount()))
|
|
}
|
|
|
|
// NOTE(rfratto): we need to return an array of the expected type here,
|
|
// since other operations will require all arrays to be of the same type.
|
|
//
|
|
// TODO(rfratto): Should we update functions like [columnar.Concat] to
|
|
// accept some of the arrays being Null? Would that slow things down too
|
|
// much?
|
|
switch typ {
|
|
case datasetmd.PHYSICAL_TYPE_INT64:
|
|
valuesBuffer := memory.NewBuffer[int64](alloc, validity.Len())
|
|
valuesBuffer.Resize(validity.Len())
|
|
valuesBuffer.Clear()
|
|
|
|
return columnar.NewNumber[int64](valuesBuffer.Data(), validity), nil
|
|
|
|
case datasetmd.PHYSICAL_TYPE_UINT64:
|
|
valuesBuffer := memory.NewBuffer[uint64](alloc, validity.Len())
|
|
valuesBuffer.Resize(validity.Len())
|
|
valuesBuffer.Clear()
|
|
|
|
return columnar.NewNumber[uint64](valuesBuffer.Data(), validity), nil
|
|
|
|
case datasetmd.PHYSICAL_TYPE_BINARY:
|
|
offsetsBuffer := memory.NewBuffer[int32](alloc, validity.Len()+1)
|
|
offsetsBuffer.Resize(validity.Len() + 1)
|
|
offsetsBuffer.Clear()
|
|
|
|
return columnar.NewUTF8(nil, offsetsBuffer.Data(), validity), nil
|
|
|
|
default:
|
|
return columnar.NewNull(validity), nil
|
|
}
|
|
}
|
|
|
|
// Seek sets the row offset for the next Read call, interpreted according to
|
|
// whence:
|
|
//
|
|
// - [io.SeekStart] seeks relative to the start of the page,
|
|
// - [io.SeekCurrent] seeks relative to the current offset, and
|
|
// - [io.SeekEnd] seeks relative to the end (for example, offset = -2
|
|
// specifies the penultimate row of the page).
|
|
//
|
|
// Row offsets are relative to pages and not a column that the page may belong
|
|
// to.
|
|
//
|
|
// Seek returns the new offset relative to the start of the page or an error,
|
|
// if any.
|
|
//
|
|
// To retrieve the current offset without modification, call Seek with 0 and
|
|
// [io.SeekCurrent].
|
|
//
|
|
// Seeking to an offset before the start of the page is an error. Seeking to
|
|
// beyond the end of the page will cause the next Read to return [io.EOF].
|
|
func (pr *pageReader) Seek(offset int64, whence int) (int64, error) {
|
|
switch whence {
|
|
case io.SeekStart:
|
|
if offset < 0 {
|
|
return 0, errors.New("invalid offset")
|
|
}
|
|
pr.nextRow = offset
|
|
|
|
case io.SeekCurrent:
|
|
if pr.nextRow+offset < 0 {
|
|
return 0, errors.New("invalid offset")
|
|
}
|
|
pr.nextRow += offset
|
|
|
|
case io.SeekEnd:
|
|
lastRow := int64(pr.page.PageDesc().RowCount)
|
|
if lastRow+offset < 0 {
|
|
return 0, errors.New("invalid offset")
|
|
}
|
|
pr.nextRow = lastRow + offset
|
|
|
|
default:
|
|
return 0, fmt.Errorf("invalid whence value %d", whence)
|
|
}
|
|
|
|
return pr.nextRow, nil
|
|
}
|
|
|
|
// Reset resets the page reader to read from the start of the provided page.
|
|
// This permits reusing a page reader rather than allocating a new one.
|
|
func (pr *pageReader) Reset(page Page, physicalType datasetmd.PhysicalType, compression datasetmd.CompressionType) {
|
|
pr.page = page
|
|
pr.physicalType = physicalType
|
|
pr.compression = compression
|
|
pr.ready = false
|
|
|
|
if pr.presenceDec != nil {
|
|
pr.presenceDec.Reset(nil)
|
|
}
|
|
if pr.valuesDec != nil {
|
|
pr.valuesDec.Reset(nil)
|
|
}
|
|
|
|
pr.pageRow = 0
|
|
pr.nextRow = 0
|
|
|
|
// Close the underlying reader if one is open so resources get released
|
|
// sooner.
|
|
_ = pr.Close()
|
|
}
|
|
|
|
// Close closes the pageReader. Closed pageReaders can be reused by calling
|
|
// [pageReader.Reset].
|
|
func (pr *pageReader) Close() error {
|
|
if pr.closer != nil {
|
|
err := pr.closer.Close()
|
|
pr.closer = nil
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (pr *pageReader) getPresenceDecoder() *bitmapDecoder {
|
|
if pr.presenceDec == nil {
|
|
return newBitmapDecoder(nil)
|
|
}
|
|
return pr.presenceDec
|
|
}
|
|
|