loki/pkg/util/rangeio/rangeio.go

// Package rangeio provides basic interfaces and utilities for reading ranges of
// data.
package rangeio

import (
	"bytes"
	"cmp"
	"context"
	"errors"
	"flag"
	"io"
	"runtime"
	"slices"
	"sort"
	"sync"
	"time"

	"github.com/dustin/go-humanize"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	"go.opentelemetry.io/otel/trace"
	"go.uber.org/atomic"
	"golang.org/x/sync/errgroup"
)

var tracer = otel.Tracer("pkg/util/rangeio")

// Range represents a range of data to be read.
type Range struct {
	// Data to read into; exactly len(Data) bytes will be read, or an error will
	// be returned.
	Data []byte

	// Offset to start reading from.
	Offset int64
}

// Len returns the length of the range.
func (r Range) Len() int64 { return int64(len(r.Data)) }

// Reader is the interface that wraps the basic ReadRange method. Reader is
// similar to [io.ReaderAt], but allows providing a [context.Context] for
// canceling the operation.
type Reader interface {
	// ReadRange reads len(r.Data) bytes into r.Data starting at r.Offset in the
	// underlying input source.
	//
	// It returns the number of bytes read (0 <= n <= len(r.Data)) and any error
	// encountered.
	//
	// When ReadRange returns n < len(r.Data), it returns a non-nil error
	// explaining why more bytes were not returned. The error must be [io.EOF]
	// when reading beyond the end of the input source.
	//
	// ReadRange may use all of r.Data as scratch space during the call, even if
	// less than len(r.Data) bytes are read. If some data is available but not
	// len(r.Data) bytes, ReadRange blocks until either all the data is
	// available or an error occurs.
	//
	// Implementations are recommended but not required to immediately respond
	// to the cancellation of ctx; for example, cancellation may not occur
	// immediately when using disk-based I/O.
	//
	// If the len(r.Data) bytes returned by ReadRange are at the end of the
	// input source, ReadRange may return either err == [io.EOF] or err == nil.
	//
	// If ReadRange is reading from an input source with a seek offset,
	// ReadRange should not affect nor be affected by the underlying seek
	// offset.
	//
	// It is safe to call ReadRange concurrently from multiple goroutines.
	//
	// Implementations must not retain r.Data after the call returns.
	ReadRange(ctx context.Context, r Range) (int, error)
}
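
// The sketch below (hypothetical, not part of the upstream package) shows how
// a Reader can wrap any [io.ReaderAt], such as an *os.File. [io.ReaderAt]
// already satisfies ReadRange's length, blocking, and seek-offset
// requirements, so only the context needs handling; it is checked once up
// front, since a plain ReadAt call cannot be interrupted mid-read.
type readerAtRanger struct {
	ra io.ReaderAt
}

func (rr readerAtRanger) ReadRange(ctx context.Context, r Range) (int, error) {
	// Honor a context that was canceled before the read started.
	if err := ctx.Err(); err != nil {
		return 0, err
	}
	return rr.ra.ReadAt(r.Data, r.Offset)
}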

// Config configures the behavior of [ReadRanges].
type Config struct {
	// MaxParallelism is the maximum number of goroutines that may be used to
	// read ranges in parallel. If MaxParallelism <= 0, [runtime.NumCPU] is
	// used.
	//
	// Ranges are split into smaller ranges (no smaller than MinRangeSize) to
	// get as close as possible to MaxParallelism.
	//
	// If MaxParallelism is 1, ReadRanges will read each range sequentially.
	MaxParallelism int `yaml:"max_parallelism" category:"experimental"`

	// CoalesceSize determines the maximum size (in bytes) of a gap between each
	// pair of ranges that causes them to be coalesced into a single range.
	CoalesceSize int `yaml:"coalesce_size" category:"experimental"`

	// MaxRangeSize determines the maximum size (in bytes) of a range. Ranges
	// won't be coalesced if they exceed this size, and existing ranges will be
	// split if they exceed this size (down to MinRangeSize).
	MaxRangeSize int `yaml:"max_range_size" category:"experimental"`

	// MinRangeSize determines the minimum size (in bytes) of a range. When a
	// range is split, it won't be split into units smaller than MinRangeSize.
	MinRangeSize int `yaml:"min_range_size" category:"experimental"`
}

// RegisterFlags registers flags for configuring rangeio behavior, prepending
// prefix to each flag name.
func (cfg *Config) RegisterFlags(prefix string, fs *flag.FlagSet) {
	fs.IntVar(&cfg.MaxParallelism, prefix+"max-parallelism", DefaultConfig.MaxParallelism, "Experimental: maximum number of parallel reads")
	fs.IntVar(&cfg.CoalesceSize, prefix+"coalesce-size", DefaultConfig.CoalesceSize, "Experimental: maximum distance (in bytes) between ranges that causes them to be coalesced into a single range")
	fs.IntVar(&cfg.MaxRangeSize, prefix+"max-range-size", DefaultConfig.MaxRangeSize, "Experimental: maximum size of a byte range")
	fs.IntVar(&cfg.MinRangeSize, prefix+"min-range-size", DefaultConfig.MinRangeSize, "Experimental: minimum size of a byte range")
}

// IsZero reports whether cfg is nil or holds the zero value.
func (cfg *Config) IsZero() bool {
	var zero Config
	return cfg == nil || *cfg == zero
}

// effectiveParallelism returns the effective parallelism limit.
func (cfg *Config) effectiveParallelism() int {
	if cfg.MaxParallelism <= 0 {
		return runtime.NumCPU()
	}
	return cfg.MaxParallelism
}

// DefaultConfig holds the default values for [Config].
var DefaultConfig = Config{
	// Benchmarks of GCS and S3 revealed that more parallelism is always better.
	// However, too much parallelism (especially parallel [ReadRanges] calls)
	// can eventually saturate the network. MaxParallelism of 10 appears to
	// provide a good balance of throughput without saturating the network.
	MaxParallelism: 10,

	// Coalesce ranges no further than 1 MiB apart. 1 MiB is a good balance
	// between combining ranges without introducing too many "wasted" bytes.
	CoalesceSize: 1 << 20, // 1 MiB

	// Constrain ranges to be no longer than 8 MiB. Benchmarks of GCS and S3
	// revealed that 8 MiB typically offers the best throughput when there are
	// enough ranges to fill MaxParallelism.
	MaxRangeSize: 8 * (1 << 20), // 8 MiB

	// Constrain split ranges to be no smaller than 1 MiB.
	MinRangeSize: 1 << 20, // 1 MiB
}
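
// A minimal tuning sketch. It assumes that [WithConfig], which [ReadRanges]
// references but which is defined outside this file, has the signature
// WithConfig(ctx context.Context, cfg *Config) context.Context; the values
// below are illustrative, not tuned recommendations:
//
//	cfg := DefaultConfig
//	cfg.MaxParallelism = 4 // e.g. cap fan-out on a small machine
//	ctx := WithConfig(context.Background(), &cfg)
//	err := ReadRanges(ctx, reader, ranges)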

var bytesBufferPool = &sync.Pool{
	New: func() any {
		return &bytes.Buffer{}
	},
}

// ReadRanges reads the set of ranges from the provided Reader, populating Data
// for each element in ranges.
//
// ReadRanges makes a copy of ranges and optimizes them for performance:
// coalescing ranges that are close together and splitting ranges that are too
// large. The optimized set of ranges is then read in parallel.
//
// The optimization behavior of ReadRanges can be controlled by providing a
// context injected with custom configuration by [WithConfig]. If there is no
// custom configuration in the context, [DefaultConfig] is used.
//
// ReadRanges returns an error if any call to r.ReadRange returns an error.
// ReadRanges only returns [io.EOF] if one of the ranges extends beyond the end
// of the input source.
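//
// A minimal usage sketch (ctx and fileReader are stand-ins; fileReader is any
// [Reader] implementation):
//
//	ranges := []Range{
//		{Data: make([]byte, 4096), Offset: 0},
//		{Data: make([]byte, 4096), Offset: 1 << 20},
//	}
//	if err := ReadRanges(ctx, fileReader, ranges); err != nil && err != io.EOF {
//		return err
//	}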
func ReadRanges(ctx context.Context, r Reader, ranges []Range) error {
	// We store our own start time so we can calculate read throughput at the
	// end.
	startTime := time.Now()

	ctx, span := tracer.Start(ctx, "ReadRanges", trace.WithTimestamp(startTime))
	defer span.End()

	cfg := configFromContext(ctx)
	if cfg == nil {
		cfg = &DefaultConfig
		span.SetAttributes(attribute.Bool("config.default", true))
	}

	optimized, releaseBuffers := optimizeRanges(cfg, ranges)
	defer releaseBuffers()
	span.AddEvent("optimized ranges")

	// Once we've optimized the ranges, we can set up the rest of our
	// attributes.
	span.SetAttributes(readRangesAttributes(cfg, ranges, optimized)...)
	defer injectThroughputAttribute(span, startTime, optimized)

	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(cfg.effectiveParallelism())

	var gotEOF atomic.Bool

	for _, targetRange := range optimized {
		// Ignore ranges that happened to be empty.
		if len(targetRange.Data) == 0 {
			continue
		}

		g.Go(func() error {
			tr := tracedReader{inner: r}
			n, err := tr.ReadRange(ctx, targetRange)

			// ReadRange must return a non-nil error if it read fewer than the
			// requested number of bytes.
			//
			// In the case of io.EOF (because we tried reading beyond the end of
			// the input source), we don't want to cancel the other goroutines,
			// so we store the EOF marker for later.
			if n < len(targetRange.Data) {
				if errors.Is(err, io.EOF) {
					gotEOF.Store(true)
					return nil
				}
				return err
			}
			return nil
		})
	}

	if err := g.Wait(); err != nil {
		span.SetStatus(codes.Error, err.Error())
		return err
	}
	span.AddEvent("finished reading ranges")

	// Now that we read the ranges, we can copy the data back into the original
	// slice.
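	//
	// A single requested range may span several optimized ranges: for example
	// (illustrative numbers), a request for [2 MiB, 4 MiB) could have been
	// coalesced and split into reads covering [0, 3 MiB) and [3 MiB, 5 MiB),
	// in which case we copy the tail of the first and the head of the second.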
	for _, r := range ranges {
		// Ignore ranges that happened to be empty.
		if len(r.Data) == 0 {
			continue
		}

		// Our range may have been split across smaller ranges, so we need to
		// gradually copy the data back into the original slice by finding each
		// subslice in optimized.
		offset := r.Offset
		output := r.Data

		for len(output) > 0 {
			// Find the first optimized range that ends after the offset we're
			// looking for.
			i := sort.Search(len(optimized), func(i int) bool {
				endByte := optimized[i].Offset + int64(len(optimized[i].Data))
				return endByte > offset
			})
			if i == len(optimized) {
				// This can't ever happen; our ranges are always inside the
				// optimized slice.
				return errors.New("requested offset missing from coalesced ranges")
			}

			copied := copy(output, optimized[i].Data[offset-optimized[i].Offset:])

			// Move our offset and output forward by the amount of copied data.
			offset += int64(copied)
			output = output[copied:]
		}
	}
	span.AddEvent("copied data to inputs")

	// Even if we got [io.EOF], we treat the operation as successful here.
	span.SetStatus(codes.Ok, "")
	if gotEOF.Load() {
		return io.EOF
	}
	return nil
}

// optimizeRanges optimizes the set of ranges based on cfg. The returned slice
// of ranges is sorted, and the returned function releases the buffers backing
// the optimized ranges; it must be called once they are no longer needed.
//
// Ranges will also be split to try to reach at least cfg.MaxParallelism
// ranges.
//
// If cfg is nil, [DefaultConfig] is used.
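//
// As a worked example under [DefaultConfig] (illustrative numbers): two
// 512 KiB ranges at offsets 0 and 1 MiB leave a 512 KiB gap, within
// CoalesceSize, so they coalesce into one 1.5 MiB range; a lone 20 MiB range
// exceeds MaxRangeSize and is halved into two 10 MiB ranges before the final
// parallelism pass.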
func optimizeRanges(cfg *Config, in []Range) ([]Range, func()) {
	if cfg == nil {
		cfg = &DefaultConfig
	}

	// chunk is a [Range] but without the allocated data slice.
	type chunk struct {
		Offset int64
		Length int
	}

	sorted := make([]chunk, len(in))
	for i := range in {
		sorted[i].Offset = in[i].Offset
		sorted[i].Length = len(in[i].Data)
	}
	slices.SortFunc(sorted, func(a, b chunk) int { return cmp.Compare(a.Offset, b.Offset) })

	var coalescedChunks []chunk

	for i := 0; i < len(sorted); {
		coalescedOffset := sorted[i].Offset
		coalescedEnd := coalescedOffset + int64(sorted[i].Length)

		// Look at ranges after i to see if we can coalesce them.
		peekIndex := i + 1
		for peekIndex < len(sorted) {
			peekOffset := sorted[peekIndex].Offset
			peekEnd := sorted[peekIndex].Offset + int64(sorted[peekIndex].Length)

			if coalescedEnd > peekOffset {
				// Coalesce overlapping ranges, regardless of the size. They can
				// be split in the follow-up logic after this loop.
				goto Coalesce
			}

			if peekOffset-coalescedEnd > int64(cfg.CoalesceSize) {
				// Gap between the current coalesced range and the peek range is
				// too big; stop.
				break
			} else if peekEnd-coalescedOffset > int64(cfg.MaxRangeSize) {
				// Coalescing the peeked range would cause the range to exceed
				// the max size; stop.
				break
			}

		Coalesce:
			coalescedEnd = max(coalescedEnd, peekEnd)
			peekIndex++
		}

		// Our coalesced range is now [coalescedOffset, coalescedEnd). This
		// may exceed our max range size in two cases:
		//
		//  1. We merged overlapping ranges.
		//  2. We received a range which was already larger than the max range
		//     size.
		//
		// Ranges which are too big should be split into halves, unless those
		// halves would be smaller than our minimum size; it's preferable to
		// have something too big than too small.
		targetLength := coalescedEnd - coalescedOffset
		if targetLength > int64(cfg.MaxRangeSize) && targetLength >= int64(cfg.MinRangeSize*2) {
			targetLength = max(targetLength/2, int64(cfg.MinRangeSize))
		}

		// NOTE(rfratto): This loop will only run once if targetLength was left
		// unchanged.
		for off := coalescedOffset; off < coalescedEnd; off += targetLength {
			splitEndOffset := min(off+targetLength, coalescedEnd)
			splitLength := int(splitEndOffset - off)

			coalescedChunks = append(coalescedChunks, chunk{
				Length: splitLength,
				Offset: off,
			})
		}

		// The next iteration should start where the previous loop stopped.
		i = peekIndex
	}

	// Final pass: after the ranges have been coalesced, we may have ended up
	// with fewer ranges than the amount of parallelism. In this case, it's
	// better to have smaller ranges that spread out the work, so we'll split
	// ranges until we have enough.
	//
	// This is a no-op if we already have enough ranges.
	for i := 0; i < len(coalescedChunks) && len(coalescedChunks) < cfg.effectiveParallelism(); {
		// Ignore ranges which are too small or where splitting would cause them
		// to become too small.
		if coalescedChunks[i].Length < (cfg.MinRangeSize * 2) {
			i++
			continue
		}

		orig := coalescedChunks[i]
		targetLength := max(orig.Length/2, cfg.MinRangeSize)

		// Split the range in-order so we don't have to re-sort at the end.
		coalescedChunks = slices.Replace(coalescedChunks, i, i+1, chunk{
			Offset: orig.Offset,
			Length: targetLength,
		}, chunk{
			Offset: orig.Offset + int64(targetLength),
			Length: orig.Length - targetLength,
		})
		i += 2 // Skip over the ranges we just inserted.
	}

	// Convert our chunks into target ranges.
	out := make([]Range, len(coalescedChunks))
	usedBuffers := make([]*bytes.Buffer, 0, len(out))

	for i := range coalescedChunks {
		size := coalescedChunks[i].Length

		buf := bytesBufferPool.Get().(*bytes.Buffer)
		buf.Reset()
		buf.Grow(size)
		usedBuffers = append(usedBuffers, buf)

		out[i] = Range{
			Data:   buf.Bytes()[:size],
			Offset: coalescedChunks[i].Offset,
		}
	}

	return out, func() {
		for _, buf := range usedBuffers {
			bytesBufferPool.Put(buf)
		}
	}
}

// rangesSize returns the total size in bytes of the given ranges.
func rangesSize(ranges []Range) uint64 {
	var total uint64
	for _, r := range ranges {
		total += uint64(r.Len())
	}
	return total
}

// readRangesAttributes retrieves attributes about [ReadRanges] to be injected
// into spans.
func readRangesAttributes(cfg *Config, ranges, optimizedRanges []Range) []attribute.KeyValue {
	origSize := rangesSize(ranges)
	optimizedSize := rangesSize(optimizedRanges)

	return []attribute.KeyValue{
		attribute.Int("config.max_parallelism", cfg.MaxParallelism),
		attribute.Stringer("config.coalesce_size", bytesStringer(uint64(cfg.CoalesceSize))),
		attribute.Stringer("config.max_range_size", bytesStringer(uint64(cfg.MaxRangeSize))),
		attribute.Stringer("config.min_range_size", bytesStringer(uint64(cfg.MinRangeSize))),
		attribute.Int("config.effective_parallelism", cfg.effectiveParallelism()),

		attribute.Int("input.ranges.count", len(ranges)),
		attribute.Stringer("input.ranges.size", bytesStringer(origSize)),

		attribute.Int("optimized.ranges.count", len(optimizedRanges)),
		attribute.Stringer("optimized.ranges.size", bytesStringer(optimizedSize)),
	}
}

// injectThroughputAttribute records the overall read throughput of the
// optimized ranges on the span.
func injectThroughputAttribute(span trace.Span, startTime time.Time, optimizedRanges []Range) {
	size := rangesSize(optimizedRanges)
	bytesPerSec := float64(size) / time.Since(startTime).Seconds()
	span.SetAttributes(attribute.Stringer("optimized.ranges.throughput", bytesStringer(uint64(bytesPerSec))))
}

// bytesStringer implements [fmt.Stringer], rendering a byte count in a
// human-readable form.
type bytesStringer uint64

func (s bytesStringer) String() string {
	return humanize.Bytes(uint64(s))
}

// tracedReader injects span events after reading a range.
type tracedReader struct {
	inner Reader
}

func (tr tracedReader) ReadRange(ctx context.Context, r Range) (int, error) {
	start := time.Now()
	span := trace.SpanFromContext(ctx)

	n, err := tr.inner.ReadRange(ctx, r)

	if span.IsRecording() {
		bytesPerSec := float64(r.Len()) / time.Since(start).Seconds()

		span.AddEvent("read optimized range", trace.WithAttributes(
			attribute.Int64("offset", r.Offset),
			attribute.Int64("len", r.Len()),
			attribute.Int("read.size", n),
			attribute.Stringer("read.duration", time.Since(start)),
			attribute.Stringer("read.throughput", bytesStringer(uint64(bytesPerSec))),
		))
	}

	return n, err
}