mirror of https://github.com/grafana/loki
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
471 lines
14 KiB
471 lines
14 KiB
package encoding
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
|
|
"github.com/grafana/loki/v3/pkg/dataobj/internal/dataset"
|
|
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/filemd"
|
|
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/logsmd"
|
|
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/streamsmd"
|
|
"github.com/grafana/loki/v3/pkg/dataobj/internal/result"
|
|
)
|
|
|
|
// windowSize is the upper bound on how much data is downloaded from object
// storage in a single request.
//
// S3's [recommendations] for Byte-Range fetches suggest either 8MB or 16MB.
// Since windowing exists to cut down on the number of requests made to object
// storage, the larger 16MB value is used so that fewer requests are needed.
//
// [recommendations]: https://docs.aws.amazon.com/whitepapers/latest/s3-optimizing-performance-best-practices/use-byte-range-fetches.html
const windowSize = 16 * 1000 * 1000
|
|
|
|
// rangeReader provides access to arbitrary byte ranges of a single object.
type rangeReader interface {
	// Size reports the full size of the object.
	Size(ctx context.Context) (int64, error)

	// ReadRange opens a reader over length bytes of the object starting at
	// offset. Callers may hold multiple readers returned by ReadRange at the
	// same time.
	ReadRange(ctx context.Context, offset, length int64) (io.ReadCloser, error)
}
|
|
|
|
type rangeDecoder struct {
|
|
r rangeReader
|
|
}
|
|
|
|
func (rd *rangeDecoder) Metadata(ctx context.Context) (*filemd.Metadata, error) {
|
|
tailer, err := rd.tailer(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("reading tailer: %w", err)
|
|
}
|
|
|
|
rc, err := rd.r.ReadRange(ctx, int64(tailer.FileSize-tailer.MetadataSize-8), int64(tailer.MetadataSize))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("getting metadata: %w", err)
|
|
}
|
|
defer rc.Close()
|
|
|
|
br, release := getBufioReader(rc)
|
|
defer release()
|
|
|
|
return decodeFileMetadata(br)
|
|
}
|
|
|
|
// tailer holds the sizes recovered while reading the end of a data object.
type tailer struct {
	MetadataSize uint64 // Size of the file metadata, as decoded from the tailer.
	FileSize     uint64 // Full size of the object, as reported by the reader.
}
|
|
|
|
func (rd *rangeDecoder) tailer(ctx context.Context) (tailer, error) {
|
|
size, err := rd.r.Size(ctx)
|
|
if err != nil {
|
|
return tailer{}, fmt.Errorf("reading attributes: %w", err)
|
|
}
|
|
|
|
// Read the last 8 bytes of the object to get the metadata size and magic.
|
|
rc, err := rd.r.ReadRange(ctx, size-8, 8)
|
|
if err != nil {
|
|
return tailer{}, fmt.Errorf("getting file tailer: %w", err)
|
|
}
|
|
defer rc.Close()
|
|
|
|
br, release := getBufioReader(rc)
|
|
defer release()
|
|
|
|
metadataSize, err := decodeTailer(br)
|
|
if err != nil {
|
|
return tailer{}, fmt.Errorf("scanning tailer: %w", err)
|
|
}
|
|
|
|
return tailer{
|
|
MetadataSize: uint64(metadataSize),
|
|
FileSize: uint64(size),
|
|
}, nil
|
|
}
|
|
|
|
func (rd *rangeDecoder) StreamsDecoder(metadata *filemd.Metadata, section *filemd.SectionInfo) StreamsDecoder {
|
|
return &rangeStreamsDecoder{rr: rd.r, md: metadata, sec: section}
|
|
}
|
|
|
|
func (rd *rangeDecoder) LogsDecoder(metadata *filemd.Metadata, section *filemd.SectionInfo) LogsDecoder {
|
|
return &rangeLogsDecoder{rr: rd.r, md: metadata, sec: section}
|
|
}
|
|
|
|
type rangeStreamsDecoder struct {
|
|
// TODO(rfratto): restrict sections from reading outside of their regions.
|
|
|
|
rr rangeReader // Reader for absolute ranges within the file.
|
|
md *filemd.Metadata
|
|
sec *filemd.SectionInfo
|
|
}
|
|
|
|
func (rd *rangeStreamsDecoder) Columns(ctx context.Context) ([]*streamsmd.ColumnDesc, error) {
|
|
typ, err := GetSectionType(rd.md, rd.sec)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read section type: %w", err)
|
|
} else if got, want := typ, SectionTypeStreams; got != want {
|
|
return nil, fmt.Errorf("unexpected section type: got=%s want=%s", got, want)
|
|
}
|
|
|
|
metadataRegion, err := findMetadataRegion(rd.sec)
|
|
if err != nil {
|
|
return nil, err
|
|
} else if metadataRegion == nil {
|
|
return nil, fmt.Errorf("section is missing metadata")
|
|
}
|
|
|
|
rc, err := rd.rr.ReadRange(ctx, int64(metadataRegion.Offset), int64(metadataRegion.Length))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("reading streams section metadata: %w", err)
|
|
}
|
|
defer rc.Close()
|
|
|
|
br, release := getBufioReader(rc)
|
|
defer release()
|
|
|
|
md, err := decodeStreamsMetadata(br)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return md.Columns, nil
|
|
}
|
|
|
|
// findMetadataRegion returns the region where a section's metadata is stored.
|
|
// If section specifies the new [filemd.SectionLayout] field, then the region
|
|
// from tha layout is returned. Otherwise, it returns the deprecated
|
|
// MetadataOffset and MetadataSize fields.
|
|
//
|
|
// findMetadataRegion returns an error if both the layout and metadata fields
|
|
// are set.
|
|
//
|
|
// findMetadtaRegion returns nil for sections without metadata.
|
|
func findMetadataRegion(section *filemd.SectionInfo) (*filemd.Region, error) {
|
|
// Fallbacks to deprecated fields if the layout is not set.
|
|
var (
|
|
deprecatedOffset = section.MetadataOffset //nolint:staticcheck // Ignore deprecation warning
|
|
deprecatedSize = section.MetadataSize //nolint:staticcheck // Ignore deprecation warning
|
|
)
|
|
|
|
if section.Layout != nil {
|
|
if deprecatedOffset != 0 || deprecatedSize != 0 {
|
|
return nil, fmt.Errorf("invalid section: both layout and deprecated metadata fields are set")
|
|
}
|
|
return section.Layout.Metadata, nil
|
|
}
|
|
|
|
return &filemd.Region{
|
|
Offset: deprecatedOffset,
|
|
Length: deprecatedSize,
|
|
}, nil
|
|
}
|
|
|
|
func (rd *rangeStreamsDecoder) Pages(ctx context.Context, columns []*streamsmd.ColumnDesc) result.Seq[[]*streamsmd.PageDesc] {
|
|
return result.Iter(func(yield func([]*streamsmd.PageDesc) bool) error {
|
|
baseOffset, err := findDataOffset(rd.sec)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
results := make([][]*streamsmd.PageDesc, len(columns))
|
|
|
|
columnInfo := func(c *streamsmd.ColumnDesc) (uint64, uint64) {
|
|
return c.GetInfo().MetadataOffset, c.GetInfo().MetadataSize
|
|
}
|
|
|
|
for window := range iterWindows(columns, columnInfo, windowSize) {
|
|
if len(window) == 0 {
|
|
continue
|
|
}
|
|
|
|
var (
|
|
windowOffset = window.Start().GetInfo().MetadataOffset
|
|
windowSize = (window.End().GetInfo().MetadataOffset + window.End().GetInfo().MetadataSize) - windowOffset
|
|
)
|
|
|
|
rc, err := rd.rr.ReadRange(ctx, int64(baseOffset+windowOffset), int64(windowSize))
|
|
if err != nil {
|
|
return fmt.Errorf("reading column data: %w", err)
|
|
}
|
|
data, err := readAndClose(rc, windowSize)
|
|
if err != nil {
|
|
return fmt.Errorf("read column data: %w", err)
|
|
}
|
|
|
|
for _, wp := range window {
|
|
// Find the slice in the data for this column.
|
|
var (
|
|
columnOffset = wp.Data.GetInfo().MetadataOffset
|
|
dataOffset = columnOffset - windowOffset
|
|
)
|
|
|
|
r := bytes.NewReader(data[dataOffset : dataOffset+wp.Data.GetInfo().MetadataSize])
|
|
|
|
md, err := decodeStreamsColumnMetadata(r)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// wp.Position is the position of the column in the original pages
|
|
// slice; this retains the proper order of data in results.
|
|
results[wp.Position] = md.Pages
|
|
}
|
|
}
|
|
|
|
for _, data := range results {
|
|
if !yield(data) {
|
|
return nil
|
|
}
|
|
}
|
|
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// findDataOffset returns the base byte offset from where all reads of a
|
|
// section start.
|
|
//
|
|
// Older versions of data objects use absolute offsets for page data. Newer
|
|
// versions (where [filemd.SectionLayout] is provided) use offsets relative to
|
|
// the start of a section's data region.
|
|
//
|
|
// If a section specifies a layout but has no data region, then the section has
|
|
// no data for reading, and findDataOffset returns an error.
|
|
func findDataOffset(section *filemd.SectionInfo) (uint64, error) {
|
|
if section.Layout != nil {
|
|
if section.Layout.Data == nil {
|
|
return 0, fmt.Errorf("section has no data")
|
|
}
|
|
return section.Layout.Data.Offset, nil
|
|
}
|
|
return 0, nil
|
|
}
|
|
|
|
// readAndClose reads exactly size bytes from rc and then closes it.
//
// rc is closed whether or not the read succeeds; the error from Close is
// discarded. If fewer than size bytes are available, readAndClose returns a
// nil slice and an error wrapping the [io.ReadFull] error (io.EOF when nothing
// was read, io.ErrUnexpectedEOF for a partial read).
//
// The error deliberately describes only the read itself. Callers add context
// about what was being read (column metadata, page data, ...).
func readAndClose(rc io.ReadCloser, size uint64) ([]byte, error) {
	defer rc.Close()

	data := make([]byte, size)
	if n, err := io.ReadFull(rc, data); err != nil {
		return nil, fmt.Errorf("read %d of %d bytes: %w", n, size, err)
	}
	return data, nil
}
|
|
|
|
func (rd *rangeStreamsDecoder) ReadPages(ctx context.Context, pages []*streamsmd.PageDesc) result.Seq[dataset.PageData] {
|
|
return result.Iter(func(yield func(dataset.PageData) bool) error {
|
|
baseOffset, err := findDataOffset(rd.sec)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
results := make([]dataset.PageData, len(pages))
|
|
|
|
pageInfo := func(p *streamsmd.PageDesc) (uint64, uint64) {
|
|
return p.GetInfo().DataOffset, p.GetInfo().DataSize
|
|
}
|
|
|
|
// TODO(rfratto): If there are many windows, it may make sense to read them
|
|
// in parallel.
|
|
for window := range iterWindows(pages, pageInfo, windowSize) {
|
|
if len(window) == 0 {
|
|
continue
|
|
}
|
|
|
|
var (
|
|
windowOffset = window.Start().GetInfo().DataOffset
|
|
windowSize = (window.End().GetInfo().DataOffset + window.End().GetInfo().DataSize) - windowOffset
|
|
)
|
|
|
|
rc, err := rd.rr.ReadRange(ctx, int64(baseOffset+windowOffset), int64(windowSize))
|
|
if err != nil {
|
|
return fmt.Errorf("reading page data: %w", err)
|
|
}
|
|
data, err := readAndClose(rc, windowSize)
|
|
if err != nil {
|
|
return fmt.Errorf("read page data: %w", err)
|
|
}
|
|
|
|
for _, wp := range window {
|
|
// Find the slice in the data for this page.
|
|
var (
|
|
pageOffset = wp.Data.GetInfo().DataOffset
|
|
dataOffset = pageOffset - windowOffset
|
|
)
|
|
|
|
// wp.Position is the position of the page in the original pages slice;
|
|
// this retains the proper order of data in results.
|
|
results[wp.Position] = dataset.PageData(data[dataOffset : dataOffset+wp.Data.GetInfo().DataSize])
|
|
}
|
|
}
|
|
|
|
for _, data := range results {
|
|
if !yield(data) {
|
|
return nil
|
|
}
|
|
}
|
|
|
|
return nil
|
|
})
|
|
}
|
|
|
|
type rangeLogsDecoder struct {
|
|
// TODO(rfratto): restrict sections from reading outside of their regions.
|
|
|
|
rr rangeReader // Reader for absolute ranges within the file.
|
|
md *filemd.Metadata
|
|
sec *filemd.SectionInfo
|
|
}
|
|
|
|
func (rd *rangeLogsDecoder) Columns(ctx context.Context) ([]*logsmd.ColumnDesc, error) {
|
|
typ, err := GetSectionType(rd.md, rd.sec)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read section type: %w", err)
|
|
} else if got, want := typ, SectionTypeLogs; got != want {
|
|
return nil, fmt.Errorf("unexpected section type: got=%s want=%s", got, want)
|
|
}
|
|
|
|
metadataRegion, err := findMetadataRegion(rd.sec)
|
|
if err != nil {
|
|
return nil, err
|
|
} else if metadataRegion == nil {
|
|
return nil, fmt.Errorf("section is missing metadata")
|
|
}
|
|
|
|
rc, err := rd.rr.ReadRange(ctx, int64(metadataRegion.Offset), int64(metadataRegion.Length))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("reading logs section metadata: %w", err)
|
|
}
|
|
defer rc.Close()
|
|
|
|
br, release := getBufioReader(rc)
|
|
defer release()
|
|
|
|
md, err := decodeLogsMetadata(br)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return md.Columns, nil
|
|
}
|
|
|
|
func (rd *rangeLogsDecoder) Pages(ctx context.Context, columns []*logsmd.ColumnDesc) result.Seq[[]*logsmd.PageDesc] {
|
|
return result.Iter(func(yield func([]*logsmd.PageDesc) bool) error {
|
|
baseOffset, err := findDataOffset(rd.sec)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
results := make([][]*logsmd.PageDesc, len(columns))
|
|
|
|
columnInfo := func(c *logsmd.ColumnDesc) (uint64, uint64) {
|
|
return c.GetInfo().MetadataOffset, c.GetInfo().MetadataSize
|
|
}
|
|
|
|
for window := range iterWindows(columns, columnInfo, windowSize) {
|
|
if len(window) == 0 {
|
|
continue
|
|
}
|
|
|
|
var (
|
|
windowOffset = window.Start().GetInfo().MetadataOffset
|
|
windowSize = (window.End().GetInfo().MetadataOffset + window.End().GetInfo().MetadataSize) - windowOffset
|
|
)
|
|
|
|
rc, err := rd.rr.ReadRange(ctx, int64(baseOffset+windowOffset), int64(windowSize))
|
|
if err != nil {
|
|
return fmt.Errorf("reading column data: %w", err)
|
|
}
|
|
data, err := readAndClose(rc, windowSize)
|
|
if err != nil {
|
|
return fmt.Errorf("read page data: %w", err)
|
|
}
|
|
|
|
for _, wp := range window {
|
|
// Find the slice in the data for this column.
|
|
var (
|
|
columnOffset = wp.Data.GetInfo().MetadataOffset
|
|
dataOffset = columnOffset - windowOffset
|
|
)
|
|
|
|
r := bytes.NewReader(data[dataOffset : dataOffset+wp.Data.GetInfo().MetadataSize])
|
|
|
|
md, err := decodeLogsColumnMetadata(r)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// wp.Position is the position of the column in the original pages
|
|
// slice; this retains the proper order of data in results.
|
|
results[wp.Position] = md.Pages
|
|
}
|
|
}
|
|
|
|
for _, data := range results {
|
|
if !yield(data) {
|
|
return nil
|
|
}
|
|
}
|
|
|
|
return nil
|
|
})
|
|
}
|
|
|
|
func (rd *rangeLogsDecoder) ReadPages(ctx context.Context, pages []*logsmd.PageDesc) result.Seq[dataset.PageData] {
|
|
return result.Iter(func(yield func(dataset.PageData) bool) error {
|
|
baseOffset, err := findDataOffset(rd.sec)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
results := make([]dataset.PageData, len(pages))
|
|
|
|
pageInfo := func(p *logsmd.PageDesc) (uint64, uint64) {
|
|
return p.GetInfo().DataOffset, p.GetInfo().DataSize
|
|
}
|
|
|
|
// TODO(rfratto): If there are many windows, it may make sense to read them
|
|
// in parallel.
|
|
for window := range iterWindows(pages, pageInfo, windowSize) {
|
|
if len(window) == 0 {
|
|
continue
|
|
}
|
|
|
|
var (
|
|
windowOffset = window.Start().GetInfo().DataOffset
|
|
windowSize = (window.End().GetInfo().DataOffset + window.End().GetInfo().DataSize) - windowOffset
|
|
)
|
|
|
|
rc, err := rd.rr.ReadRange(ctx, int64(baseOffset+windowOffset), int64(windowSize))
|
|
if err != nil {
|
|
return fmt.Errorf("reading page data: %w", err)
|
|
}
|
|
data, err := readAndClose(rc, windowSize)
|
|
if err != nil {
|
|
return fmt.Errorf("read page data: %w", err)
|
|
}
|
|
|
|
for _, wp := range window {
|
|
// Find the slice in the data for this page.
|
|
var (
|
|
pageOffset = wp.Data.GetInfo().DataOffset
|
|
dataOffset = pageOffset - windowOffset
|
|
)
|
|
|
|
// wp.Position is the position of the page in the original pages slice;
|
|
// this retains the proper order of data in results.
|
|
results[wp.Position] = dataset.PageData(data[dataOffset : dataOffset+wp.Data.GetInfo().DataSize])
|
|
}
|
|
}
|
|
|
|
for _, data := range results {
|
|
if !yield(data) {
|
|
return nil
|
|
}
|
|
}
|
|
|
|
return nil
|
|
})
|
|
}
|
|
|