Like Prometheus, but for logs.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
loki/pkg/dataobj/internal/encoding/decoder_range.go

472 lines
14 KiB

package encoding
import (
"bytes"
"context"
"fmt"
"io"
"github.com/grafana/loki/v3/pkg/dataobj/internal/dataset"
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/filemd"
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/logsmd"
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/streamsmd"
"github.com/grafana/loki/v3/pkg/dataobj/internal/result"
)
// windowSize specifies the maximum amount of data to download at once from
// object storage. 16MB is chosen based on S3's [recommendations] for
// Byte-Range fetches, which suggest either 8MB or 16MB.
//
// As windowing is designed to reduce the number of requests made to object
// storage, 16MB is chosen over 8MB, as it will lead to fewer requests.
//
// The value is expressed in decimal megabytes (16,000,000 bytes). It is passed
// to iterWindows when grouping column metadata reads and page data reads.
//
// [recommendations]: https://docs.aws.amazon.com/whitepapers/latest/s3-optimizing-performance-best-practices/use-byte-range-fetches.html
const windowSize = 16_000_000
// rangeReader is an interface that can read a range of bytes from an object.
type rangeReader interface {
// Size returns the full size of the object, in bytes.
Size(ctx context.Context) (int64, error)
// ReadRange returns a reader over length bytes of the object, starting at
// offset. Callers may create multiple concurrent instances of ReadRange.
// The returned reader must be closed by the caller once it is no longer
// needed.
ReadRange(ctx context.Context, offset int64, length int64) (io.ReadCloser, error)
}
// rangeDecoder decodes a data object by issuing ranged reads against a
// rangeReader. It exposes the file-level metadata and hands out per-section
// decoders that share the same reader.
type rangeDecoder struct {
// r is the source of all ranged reads performed by the decoder.
r rangeReader
}
// Metadata reads and decodes the file-level metadata of the data object.
//
// It first reads the tailer to learn the size of the metadata, and then reads
// the metadata region, which sits immediately before the 8-byte tailer at the
// end of the file.
//
// Metadata returns an error if the tailer cannot be read, if the tailer
// describes a metadata region that does not fit inside the file, or if the
// metadata cannot be read or decoded.
func (rd *rangeDecoder) Metadata(ctx context.Context) (*filemd.Metadata, error) {
	t, err := rd.tailer(ctx)
	if err != nil {
		return nil, fmt.Errorf("reading tailer: %w", err)
	}

	// Validate before subtracting: both sizes are unsigned, so a corrupt
	// tailer with an oversized MetadataSize would otherwise wrap around and
	// produce a bogus read offset.
	if t.FileSize < 8 || t.MetadataSize > t.FileSize-8 {
		return nil, fmt.Errorf("getting metadata: metadata size %d does not fit in file of size %d", t.MetadataSize, t.FileSize)
	}
	metadataOffset := t.FileSize - 8 - t.MetadataSize

	rc, err := rd.r.ReadRange(ctx, int64(metadataOffset), int64(t.MetadataSize))
	if err != nil {
		return nil, fmt.Errorf("getting metadata: %w", err)
	}
	defer rc.Close()

	br, release := getBufioReader(rc)
	defer release()

	return decodeFileMetadata(br)
}
// tailer holds the sizes needed to locate the file metadata: the metadata
// size decoded from the end of the object, and the total size of the object.
type tailer struct {
// MetadataSize is the size of the file metadata, as decoded from the tailer.
MetadataSize uint64
// FileSize is the full size of the object, as reported by the rangeReader.
FileSize uint64
}
// tailer reads and decodes the 8-byte tailer stored at the very end of the
// object. The returned tailer carries the decoded metadata size together with
// the full object size.
//
// tailer returns an error if the object size cannot be determined, if the
// object is too small to contain a tailer, or if the tailer cannot be read or
// decoded.
func (rd *rangeDecoder) tailer(ctx context.Context) (tailer, error) {
	size, err := rd.r.Size(ctx)
	if err != nil {
		return tailer{}, fmt.Errorf("reading attributes: %w", err)
	}

	// An object shorter than the tailer would produce a negative read offset
	// below; reject it up front with a clear error.
	if size < 8 {
		return tailer{}, fmt.Errorf("getting file tailer: object of %d bytes is too small to contain a tailer", size)
	}

	// Read the last 8 bytes of the object to get the metadata size and magic.
	rc, err := rd.r.ReadRange(ctx, size-8, 8)
	if err != nil {
		return tailer{}, fmt.Errorf("getting file tailer: %w", err)
	}
	defer rc.Close()

	br, release := getBufioReader(rc)
	defer release()

	metadataSize, err := decodeTailer(br)
	if err != nil {
		return tailer{}, fmt.Errorf("scanning tailer: %w", err)
	}

	return tailer{
		MetadataSize: uint64(metadataSize),
		FileSize:     uint64(size),
	}, nil
}
// StreamsDecoder returns a StreamsDecoder for the given section. The returned
// decoder shares rd's rangeReader and keeps references to metadata and
// section.
func (rd *rangeDecoder) StreamsDecoder(metadata *filemd.Metadata, section *filemd.SectionInfo) StreamsDecoder {
	dec := rangeStreamsDecoder{
		rr:  rd.r,
		md:  metadata,
		sec: section,
	}
	return &dec
}
// LogsDecoder returns a LogsDecoder for the given section. The returned
// decoder shares rd's rangeReader and keeps references to metadata and
// section.
func (rd *rangeDecoder) LogsDecoder(metadata *filemd.Metadata, section *filemd.SectionInfo) LogsDecoder {
	dec := rangeLogsDecoder{
		rr:  rd.r,
		md:  metadata,
		sec: section,
	}
	return &dec
}
// rangeStreamsDecoder is the StreamsDecoder returned by
// rangeDecoder.StreamsDecoder. It decodes a single streams section using
// ranged reads.
type rangeStreamsDecoder struct {
// TODO(rfratto): restrict sections from reading outside of their regions.
rr rangeReader // Reader for absolute ranges within the file.
// md is the file-level metadata, used to resolve the section's type.
md *filemd.Metadata
// sec describes the section being decoded.
sec *filemd.SectionInfo
}
// Columns reads the metadata of the streams section and returns its column
// descriptors.
//
// Columns returns an error if the section is not a streams section, if the
// section has no metadata region, or if the metadata cannot be read or
// decoded.
func (rd *rangeStreamsDecoder) Columns(ctx context.Context) ([]*streamsmd.ColumnDesc, error) {
	sectionType, err := GetSectionType(rd.md, rd.sec)
	if err != nil {
		return nil, fmt.Errorf("failed to read section type: %w", err)
	}
	if sectionType != SectionTypeStreams {
		return nil, fmt.Errorf("unexpected section type: got=%s want=%s", sectionType, SectionTypeStreams)
	}

	region, err := findMetadataRegion(rd.sec)
	if err != nil {
		return nil, err
	}
	if region == nil {
		return nil, fmt.Errorf("section is missing metadata")
	}

	// The metadata region is read using its offset as-is; no base offset is
	// applied here.
	rc, err := rd.rr.ReadRange(ctx, int64(region.Offset), int64(region.Length))
	if err != nil {
		return nil, fmt.Errorf("reading streams section metadata: %w", err)
	}
	defer rc.Close()

	br, release := getBufioReader(rc)
	defer release()

	sectionMD, err := decodeStreamsMetadata(br)
	if err != nil {
		return nil, err
	}
	return sectionMD.Columns, nil
}
// findMetadataRegion returns the region where a section's metadata is stored.
//
// If section specifies the new [filemd.SectionLayout] field, then the metadata
// region from that layout is returned; this is nil when the layout has no
// metadata region. Otherwise, a region built from the deprecated
// MetadataOffset and MetadataSize fields is returned.
//
// findMetadataRegion returns an error if the layout is set and either of the
// deprecated metadata fields is non-zero.
func findMetadataRegion(section *filemd.SectionInfo) (*filemd.Region, error) {
	legacyOffset := section.MetadataOffset //nolint:staticcheck // Ignore deprecation warning
	legacySize := section.MetadataSize     //nolint:staticcheck // Ignore deprecation warning

	// No layout: fall back to the deprecated fields.
	if section.Layout == nil {
		return &filemd.Region{Offset: legacyOffset, Length: legacySize}, nil
	}

	// A layout combined with populated deprecated fields is ambiguous.
	if legacyOffset != 0 || legacySize != 0 {
		return nil, fmt.Errorf("invalid section: both layout and deprecated metadata fields are set")
	}
	return section.Layout.Metadata, nil
}
// Pages returns a sequence holding the page descriptors of each column in
// columns. Results are yielded in the same order as columns.
//
// Column metadata is fetched in windows: columns are grouped by iterWindows
// using windowSize, and each group is downloaded with a single ranged read
// before the individual column metadata is decoded out of the buffer. All
// windows are read before the first result is yielded.
func (rd *rangeStreamsDecoder) Pages(ctx context.Context, columns []*streamsmd.ColumnDesc) result.Seq[[]*streamsmd.PageDesc] {
	return result.Iter(func(yield func([]*streamsmd.PageDesc) bool) error {
		base, err := findDataOffset(rd.sec)
		if err != nil {
			return err
		}

		// span reports the offset and size of a column's metadata.
		span := func(col *streamsmd.ColumnDesc) (uint64, uint64) {
			info := col.GetInfo()
			return info.MetadataOffset, info.MetadataSize
		}

		pagesByColumn := make([][]*streamsmd.PageDesc, len(columns))

		for group := range iterWindows(columns, span, windowSize) {
			if len(group) == 0 {
				continue
			}

			// The read covers everything from the start of the first column's
			// metadata to the end of the last column's metadata.
			first, last := group.Start().GetInfo(), group.End().GetInfo()
			start := first.MetadataOffset
			length := last.MetadataOffset + last.MetadataSize - start

			rc, err := rd.rr.ReadRange(ctx, int64(base+start), int64(length))
			if err != nil {
				return fmt.Errorf("reading column data: %w", err)
			}

			buf, err := readAndClose(rc, length)
			if err != nil {
				return fmt.Errorf("read column data: %w", err)
			}

			for _, entry := range group {
				// Locate this column's metadata inside the window buffer.
				info := entry.Data.GetInfo()
				lo := info.MetadataOffset - start
				hi := lo + info.MetadataSize

				columnMD, err := decodeStreamsColumnMetadata(bytes.NewReader(buf[lo:hi]))
				if err != nil {
					return err
				}

				// entry.Position is the index of the column in the columns
				// slice, which keeps the results in the caller's order.
				pagesByColumn[entry.Position] = columnMD.Pages
			}
		}

		for _, pages := range pagesByColumn {
			if !yield(pages) {
				break
			}
		}
		return nil
	})
}
// findDataOffset returns the base byte offset that is added to every data
// read of a section.
//
// Sections without a [filemd.SectionLayout] use a base offset of zero.
// Sections with a layout use the offset of the layout's data region.
//
// If a section specifies a layout but has no data region, then the section has
// no data for reading, and findDataOffset returns an error.
func findDataOffset(section *filemd.SectionInfo) (uint64, error) {
	layout := section.Layout

	switch {
	case layout == nil:
		return 0, nil
	case layout.Data == nil:
		return 0, fmt.Errorf("section has no data")
	default:
		return layout.Data.Offset, nil
	}
}
// readAndClose reads exactly size bytes from rc and then closes it.
//
// rc is closed whether or not the read succeeds. If fewer than size bytes can
// be read, readAndClose returns a nil slice and an error wrapping the
// underlying read error.
//
// The error deliberately does not say what kind of data was being read:
// readAndClose is shared by column metadata reads and page data reads, and
// each caller adds that context itself.
func readAndClose(rc io.ReadCloser, size uint64) ([]byte, error) {
	defer rc.Close()

	data := make([]byte, size)
	if _, err := io.ReadFull(rc, data); err != nil {
		return nil, fmt.Errorf("reading %d bytes: %w", size, err)
	}
	return data, nil
}
// ReadPages returns a sequence holding the raw data of each page in pages.
// Results are yielded in the same order as pages.
//
// Page data is fetched in windows: pages are grouped by iterWindows using
// windowSize, and each group is downloaded with a single ranged read. Each
// yielded PageData is a sub-slice of the buffer read for its window. All
// windows are read before the first result is yielded.
func (rd *rangeStreamsDecoder) ReadPages(ctx context.Context, pages []*streamsmd.PageDesc) result.Seq[dataset.PageData] {
	return result.Iter(func(yield func(dataset.PageData) bool) error {
		base, err := findDataOffset(rd.sec)
		if err != nil {
			return err
		}

		// span reports the offset and size of a page's data.
		span := func(page *streamsmd.PageDesc) (uint64, uint64) {
			info := page.GetInfo()
			return info.DataOffset, info.DataSize
		}

		pageData := make([]dataset.PageData, len(pages))

		// TODO(rfratto): If there are many windows, it may make sense to read them
		// in parallel.
		for group := range iterWindows(pages, span, windowSize) {
			if len(group) == 0 {
				continue
			}

			// The read covers everything from the start of the first page to
			// the end of the last page in the window.
			first, last := group.Start().GetInfo(), group.End().GetInfo()
			start := first.DataOffset
			length := last.DataOffset + last.DataSize - start

			rc, err := rd.rr.ReadRange(ctx, int64(base+start), int64(length))
			if err != nil {
				return fmt.Errorf("reading page data: %w", err)
			}

			buf, err := readAndClose(rc, length)
			if err != nil {
				return fmt.Errorf("read page data: %w", err)
			}

			for _, entry := range group {
				// Locate this page inside the window buffer.
				info := entry.Data.GetInfo()
				lo := info.DataOffset - start
				hi := lo + info.DataSize

				// entry.Position is the index of the page in the pages slice,
				// which keeps the results in the caller's order.
				pageData[entry.Position] = dataset.PageData(buf[lo:hi])
			}
		}

		for _, data := range pageData {
			if !yield(data) {
				break
			}
		}
		return nil
	})
}
// rangeLogsDecoder is the LogsDecoder returned by rangeDecoder.LogsDecoder.
// It decodes a single logs section using ranged reads.
type rangeLogsDecoder struct {
// TODO(rfratto): restrict sections from reading outside of their regions.
rr rangeReader // Reader for absolute ranges within the file.
// md is the file-level metadata, used to resolve the section's type.
md *filemd.Metadata
// sec describes the section being decoded.
sec *filemd.SectionInfo
}
// Columns reads the metadata of the logs section and returns its column
// descriptors.
//
// Columns returns an error if the section is not a logs section, if the
// section has no metadata region, or if the metadata cannot be read or
// decoded.
func (rd *rangeLogsDecoder) Columns(ctx context.Context) ([]*logsmd.ColumnDesc, error) {
	sectionType, err := GetSectionType(rd.md, rd.sec)
	if err != nil {
		return nil, fmt.Errorf("failed to read section type: %w", err)
	}
	if sectionType != SectionTypeLogs {
		return nil, fmt.Errorf("unexpected section type: got=%s want=%s", sectionType, SectionTypeLogs)
	}

	region, err := findMetadataRegion(rd.sec)
	if err != nil {
		return nil, err
	}
	if region == nil {
		return nil, fmt.Errorf("section is missing metadata")
	}

	// The metadata region is read using its offset as-is; no base offset is
	// applied here.
	rc, err := rd.rr.ReadRange(ctx, int64(region.Offset), int64(region.Length))
	if err != nil {
		return nil, fmt.Errorf("reading logs section metadata: %w", err)
	}
	defer rc.Close()

	br, release := getBufioReader(rc)
	defer release()

	sectionMD, err := decodeLogsMetadata(br)
	if err != nil {
		return nil, err
	}
	return sectionMD.Columns, nil
}
// Pages returns a sequence holding the page descriptors of each column in
// columns. Results are yielded in the same order as columns.
//
// Column metadata is fetched in windows: columns are grouped by iterWindows
// using windowSize, and each group is downloaded with a single ranged read
// before the individual column metadata is decoded out of the buffer. All
// windows are read before the first result is yielded.
func (rd *rangeLogsDecoder) Pages(ctx context.Context, columns []*logsmd.ColumnDesc) result.Seq[[]*logsmd.PageDesc] {
	return result.Iter(func(yield func([]*logsmd.PageDesc) bool) error {
		baseOffset, err := findDataOffset(rd.sec)
		if err != nil {
			return err
		}

		results := make([][]*logsmd.PageDesc, len(columns))

		// columnInfo reports the offset and size of a column's metadata.
		columnInfo := func(c *logsmd.ColumnDesc) (uint64, uint64) {
			return c.GetInfo().MetadataOffset, c.GetInfo().MetadataSize
		}

		for window := range iterWindows(columns, columnInfo, windowSize) {
			if len(window) == 0 {
				continue
			}

			// The read covers everything from the start of the first column's
			// metadata to the end of the last column's metadata. The length is
			// named readSize so it does not shadow the windowSize constant.
			var (
				windowOffset = window.Start().GetInfo().MetadataOffset
				readSize     = (window.End().GetInfo().MetadataOffset + window.End().GetInfo().MetadataSize) - windowOffset
			)

			rc, err := rd.rr.ReadRange(ctx, int64(baseOffset+windowOffset), int64(readSize))
			if err != nil {
				return fmt.Errorf("reading column data: %w", err)
			}

			data, err := readAndClose(rc, readSize)
			if err != nil {
				// This read fetches column metadata, not page data; the
				// message matches the streams decoder's equivalent path.
				return fmt.Errorf("read column data: %w", err)
			}

			for _, wp := range window {
				// Find the slice in the data for this column.
				var (
					columnOffset = wp.Data.GetInfo().MetadataOffset
					dataOffset   = columnOffset - windowOffset
				)

				r := bytes.NewReader(data[dataOffset : dataOffset+wp.Data.GetInfo().MetadataSize])

				md, err := decodeLogsColumnMetadata(r)
				if err != nil {
					return err
				}

				// wp.Position is the position of the column in the original
				// columns slice; this retains the proper order of data in
				// results.
				results[wp.Position] = md.Pages
			}
		}

		for _, data := range results {
			if !yield(data) {
				return nil
			}
		}

		return nil
	})
}
// ReadPages returns a sequence holding the raw data of each page in pages.
// Results are yielded in the same order as pages.
//
// Page data is fetched in windows: pages are grouped by iterWindows using
// windowSize, and each group is downloaded with a single ranged read. Each
// yielded PageData is a sub-slice of the buffer read for its window. All
// windows are read before the first result is yielded.
func (rd *rangeLogsDecoder) ReadPages(ctx context.Context, pages []*logsmd.PageDesc) result.Seq[dataset.PageData] {
	return result.Iter(func(yield func(dataset.PageData) bool) error {
		base, err := findDataOffset(rd.sec)
		if err != nil {
			return err
		}

		// span reports the offset and size of a page's data.
		span := func(page *logsmd.PageDesc) (uint64, uint64) {
			info := page.GetInfo()
			return info.DataOffset, info.DataSize
		}

		pageData := make([]dataset.PageData, len(pages))

		// TODO(rfratto): If there are many windows, it may make sense to read them
		// in parallel.
		for group := range iterWindows(pages, span, windowSize) {
			if len(group) == 0 {
				continue
			}

			// The read covers everything from the start of the first page to
			// the end of the last page in the window.
			first, last := group.Start().GetInfo(), group.End().GetInfo()
			start := first.DataOffset
			length := last.DataOffset + last.DataSize - start

			rc, err := rd.rr.ReadRange(ctx, int64(base+start), int64(length))
			if err != nil {
				return fmt.Errorf("reading page data: %w", err)
			}

			buf, err := readAndClose(rc, length)
			if err != nil {
				return fmt.Errorf("read page data: %w", err)
			}

			for _, entry := range group {
				// Locate this page inside the window buffer.
				info := entry.Data.GetInfo()
				lo := info.DataOffset - start
				hi := lo + info.DataSize

				// entry.Position is the index of the page in the pages slice,
				// which keeps the results in the caller's order.
				pageData[entry.Position] = dataset.PageData(buf[lo:hi])
			}
		}

		for _, data := range pageData {
			if !yield(data) {
				break
			}
		}
		return nil
	})
}