|
|
|
|
@ -3,10 +3,15 @@ package dataset |
|
|
|
|
import ( |
|
|
|
|
"context" |
|
|
|
|
"errors" |
|
|
|
|
"fmt" |
|
|
|
|
"io" |
|
|
|
|
"iter" |
|
|
|
|
"math/rand" |
|
|
|
|
"slices" |
|
|
|
|
"strconv" |
|
|
|
|
"testing" |
|
|
|
|
|
|
|
|
|
"github.com/dustin/go-humanize" |
|
|
|
|
"github.com/stretchr/testify/require" |
|
|
|
|
|
|
|
|
|
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd" |
|
|
|
|
@ -31,9 +36,11 @@ func Test_Reader_ReadWithPredicate(t *testing.T) { |
|
|
|
|
r := NewReader(ReaderOptions{ |
|
|
|
|
Dataset: dset, |
|
|
|
|
Columns: columns, |
|
|
|
|
Predicate: GreaterThanPredicate{ |
|
|
|
|
Column: columns[3], // birth_year column
|
|
|
|
|
Value: Int64Value(1985), |
|
|
|
|
Predicates: []Predicate{ |
|
|
|
|
GreaterThanPredicate{ |
|
|
|
|
Column: columns[3], // birth_year column
|
|
|
|
|
Value: Int64Value(1985), |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
}) |
|
|
|
|
defer r.Close() |
|
|
|
|
@ -65,9 +72,11 @@ func Test_Reader_ReadWithPageFiltering(t *testing.T) { |
|
|
|
|
//
|
|
|
|
|
// TODO(rfratto): make it easier to prove that a predicate includes a value
|
|
|
|
|
// which is out of range of at least one page.
|
|
|
|
|
Predicate: EqualPredicate{ |
|
|
|
|
Column: columns[0], // first_name column
|
|
|
|
|
Value: ByteArrayValue([]byte("Henry")), |
|
|
|
|
Predicates: []Predicate{ |
|
|
|
|
EqualPredicate{ |
|
|
|
|
Column: columns[0], // first_name column
|
|
|
|
|
Value: ByteArrayValue([]byte("Henry")), |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
}) |
|
|
|
|
defer r.Close() |
|
|
|
|
@ -92,9 +101,11 @@ func Test_Reader_ReadWithPredicate_NoSecondary(t *testing.T) { |
|
|
|
|
r := NewReader(ReaderOptions{ |
|
|
|
|
Dataset: dset, |
|
|
|
|
Columns: []Column{columns[3]}, |
|
|
|
|
Predicate: GreaterThanPredicate{ |
|
|
|
|
Column: columns[3], // birth_year column
|
|
|
|
|
Value: Int64Value(1985), |
|
|
|
|
Predicates: []Predicate{ |
|
|
|
|
GreaterThanPredicate{ |
|
|
|
|
Column: columns[3], // birth_year column
|
|
|
|
|
Value: Int64Value(1985), |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
}) |
|
|
|
|
defer r.Close() |
|
|
|
|
@ -141,10 +152,10 @@ func Test_Reader_Stats(t *testing.T) { |
|
|
|
|
r := NewReader(ReaderOptions{ |
|
|
|
|
Dataset: dset, |
|
|
|
|
Columns: columns, |
|
|
|
|
Predicate: GreaterThanPredicate{ |
|
|
|
|
Predicates: []Predicate{GreaterThanPredicate{ |
|
|
|
|
Column: columns[3], // birth_year column
|
|
|
|
|
Value: Int64Value(1985), |
|
|
|
|
}, |
|
|
|
|
}}, |
|
|
|
|
}) |
|
|
|
|
defer r.Close() |
|
|
|
|
|
|
|
|
|
@ -361,9 +372,9 @@ func Test_BuildPredicateRanges(t *testing.T) { |
|
|
|
|
for _, tc := range tt { |
|
|
|
|
t.Run(tc.name, func(t *testing.T) { |
|
|
|
|
r := NewReader(ReaderOptions{ |
|
|
|
|
Dataset: ds, |
|
|
|
|
Columns: cols, |
|
|
|
|
Predicate: tc.predicate, |
|
|
|
|
Dataset: ds, |
|
|
|
|
Columns: cols, |
|
|
|
|
Predicates: []Predicate{tc.predicate}, |
|
|
|
|
}) |
|
|
|
|
defer r.Close() |
|
|
|
|
|
|
|
|
|
@ -462,3 +473,394 @@ func encodeInt64Value(t *testing.T, v int64) []byte { |
|
|
|
|
require.NoError(t, err) |
|
|
|
|
return data |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func BenchmarkReader(b *testing.B) { |
|
|
|
|
generator := DatasetGenerator{ |
|
|
|
|
RowCount: 1_000_000, |
|
|
|
|
PageSizeHint: 2 * 1024 * 1024, // 2MB
|
|
|
|
|
Columns: []generatorColumnConfig{ |
|
|
|
|
{ |
|
|
|
|
Name: "stream", |
|
|
|
|
ValueType: datasetmd.VALUE_TYPE_INT64, |
|
|
|
|
Encoding: datasetmd.ENCODING_TYPE_DELTA, |
|
|
|
|
Compression: datasetmd.COMPRESSION_TYPE_NONE, |
|
|
|
|
CardinalityTarget: 1000, |
|
|
|
|
}, |
|
|
|
|
{ |
|
|
|
|
Name: "timestamp", |
|
|
|
|
ValueType: datasetmd.VALUE_TYPE_INT64, |
|
|
|
|
Encoding: datasetmd.ENCODING_TYPE_DELTA, |
|
|
|
|
Compression: datasetmd.COMPRESSION_TYPE_NONE, |
|
|
|
|
CardinalityTarget: 100_000, |
|
|
|
|
}, |
|
|
|
|
{ |
|
|
|
|
Name: "log", |
|
|
|
|
ValueType: datasetmd.VALUE_TYPE_BYTE_ARRAY, |
|
|
|
|
Encoding: datasetmd.ENCODING_TYPE_PLAIN, |
|
|
|
|
Compression: datasetmd.COMPRESSION_TYPE_NONE, |
|
|
|
|
AvgSize: 1024, |
|
|
|
|
CardinalityTarget: 100_000, |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
readPatterns := []struct { |
|
|
|
|
name string |
|
|
|
|
batchSize int |
|
|
|
|
}{ |
|
|
|
|
{ |
|
|
|
|
name: "batch=100", |
|
|
|
|
batchSize: 100, |
|
|
|
|
}, |
|
|
|
|
{ |
|
|
|
|
name: "batch=10k", |
|
|
|
|
batchSize: 10_000, |
|
|
|
|
}, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Generate dataset once per case
|
|
|
|
|
ds, cols := generator.Build(b, rand.Int63()) |
|
|
|
|
opts := ReaderOptions{ |
|
|
|
|
Dataset: ds, |
|
|
|
|
Columns: cols, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for _, rp := range readPatterns { |
|
|
|
|
b.Run(rp.name, func(b *testing.B) { |
|
|
|
|
b.ResetTimer() |
|
|
|
|
b.ReportAllocs() |
|
|
|
|
|
|
|
|
|
batch := make([]Row, rp.batchSize) |
|
|
|
|
for b.Loop() { |
|
|
|
|
reader := NewReader(opts) |
|
|
|
|
var rowsRead int |
|
|
|
|
for { |
|
|
|
|
n, err := reader.Read(context.Background(), batch) |
|
|
|
|
if err == io.EOF { |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
if err != nil { |
|
|
|
|
b.Fatal(err) |
|
|
|
|
} |
|
|
|
|
rowsRead += n |
|
|
|
|
} |
|
|
|
|
reader.Close() |
|
|
|
|
|
|
|
|
|
b.ReportMetric(float64(rowsRead)/float64(b.N), "rows/op") |
|
|
|
|
} |
|
|
|
|
}) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func BenchmarkPredicateExecution(b *testing.B) { |
|
|
|
|
// Generate dataset with two columns, one with high cardinality and one with low cardinality
|
|
|
|
|
// higher the cardinality, more selective the predicate
|
|
|
|
|
generator := DatasetGenerator{ |
|
|
|
|
RowCount: 1_000_000, |
|
|
|
|
// set large page size to not realise benefits from page pruning since the goal
|
|
|
|
|
// of this benchmark is to measure the gains from sequential predicate evaluation alone.
|
|
|
|
|
PageSizeHint: 100 * 1024 * 1024, |
|
|
|
|
Columns: []generatorColumnConfig{ |
|
|
|
|
{ |
|
|
|
|
Name: "more_selective", |
|
|
|
|
ValueType: datasetmd.VALUE_TYPE_INT64, |
|
|
|
|
Encoding: datasetmd.ENCODING_TYPE_DELTA, |
|
|
|
|
Compression: datasetmd.COMPRESSION_TYPE_NONE, |
|
|
|
|
CardinalityTarget: 500_000, |
|
|
|
|
}, |
|
|
|
|
{ |
|
|
|
|
Name: "less_selective", |
|
|
|
|
ValueType: datasetmd.VALUE_TYPE_INT64, |
|
|
|
|
Encoding: datasetmd.ENCODING_TYPE_DELTA, |
|
|
|
|
Compression: datasetmd.COMPRESSION_TYPE_NONE, |
|
|
|
|
CardinalityTarget: 100, |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
ds, cols := generator.Build(b, rand.Int63()) |
|
|
|
|
|
|
|
|
|
var col1Value, col2Value int64 |
|
|
|
|
idx := rand.Intn(generator.RowCount) // Randomly select a row index to use for the predicate values
|
|
|
|
|
|
|
|
|
|
currentPos := 0 |
|
|
|
|
batch := make([]Row, 1000) |
|
|
|
|
// read the dataset once to pick a random row for predicate generation
|
|
|
|
|
reader := NewReader(ReaderOptions{ |
|
|
|
|
Dataset: ds, |
|
|
|
|
Columns: cols, |
|
|
|
|
}) |
|
|
|
|
|
|
|
|
|
for { |
|
|
|
|
n, err := reader.Read(context.Background(), batch) |
|
|
|
|
if err == io.EOF { |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
if err != nil { |
|
|
|
|
b.Fatal(err) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Check if our target index is in this batch
|
|
|
|
|
if idx >= currentPos && idx < currentPos+n { |
|
|
|
|
selectedRow := batch[idx-currentPos] |
|
|
|
|
col1Value = selectedRow.Values[0].Int64() |
|
|
|
|
col2Value = selectedRow.Values[1].Int64() |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
currentPos += n |
|
|
|
|
} |
|
|
|
|
reader.Close() |
|
|
|
|
|
|
|
|
|
predicatePatterns := []struct { |
|
|
|
|
name string |
|
|
|
|
predicates []Predicate |
|
|
|
|
}{ |
|
|
|
|
{ |
|
|
|
|
name: "combined", |
|
|
|
|
predicates: []Predicate{ |
|
|
|
|
AndPredicate{ |
|
|
|
|
Left: EqualPredicate{ |
|
|
|
|
Column: cols[0], |
|
|
|
|
Value: Int64Value(col1Value), |
|
|
|
|
}, |
|
|
|
|
Right: EqualPredicate{ |
|
|
|
|
Column: cols[1], |
|
|
|
|
Value: Int64Value(col2Value), |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
{ |
|
|
|
|
name: "high", |
|
|
|
|
predicates: []Predicate{ |
|
|
|
|
EqualPredicate{ |
|
|
|
|
Column: cols[0], |
|
|
|
|
Value: Int64Value(col1Value), |
|
|
|
|
}, |
|
|
|
|
EqualPredicate{ |
|
|
|
|
Column: cols[1], |
|
|
|
|
Value: Int64Value(col2Value), |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
{ |
|
|
|
|
name: "low", |
|
|
|
|
predicates: []Predicate{ |
|
|
|
|
EqualPredicate{ |
|
|
|
|
Column: cols[1], |
|
|
|
|
Value: Int64Value(col2Value), |
|
|
|
|
}, |
|
|
|
|
EqualPredicate{ |
|
|
|
|
Column: cols[0], |
|
|
|
|
Value: Int64Value(col1Value), |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for _, pp := range predicatePatterns { |
|
|
|
|
b.Run("selectivity="+pp.name, func(b *testing.B) { |
|
|
|
|
b.ResetTimer() |
|
|
|
|
b.ReportAllocs() |
|
|
|
|
|
|
|
|
|
for b.Loop() { |
|
|
|
|
reader := NewReader(ReaderOptions{ |
|
|
|
|
Dataset: ds, |
|
|
|
|
Columns: cols, |
|
|
|
|
Predicates: pp.predicates, |
|
|
|
|
}) |
|
|
|
|
|
|
|
|
|
batch := make([]Row, 10000) |
|
|
|
|
|
|
|
|
|
for { |
|
|
|
|
_, err := reader.Read(context.Background(), batch) |
|
|
|
|
if err == io.EOF { |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
if err != nil { |
|
|
|
|
b.Fatal(err) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
reader.Close() |
|
|
|
|
} |
|
|
|
|
}) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
type generatorColumnConfig struct { |
|
|
|
|
Name string |
|
|
|
|
ValueType datasetmd.ValueType |
|
|
|
|
Encoding datasetmd.EncodingType |
|
|
|
|
Compression datasetmd.CompressionType |
|
|
|
|
|
|
|
|
|
AvgSize int64 // Average size in bytes for variable-length types
|
|
|
|
|
CardinalityTarget int64 // Target number of unique values
|
|
|
|
|
SparsityRate float64 // 0.0-1.0, where 1.0 means all values are null
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func columnValues(rng *rand.Rand, cfg generatorColumnConfig) iter.Seq[Value] { |
|
|
|
|
switch cfg.ValueType { |
|
|
|
|
case datasetmd.VALUE_TYPE_INT64, datasetmd.VALUE_TYPE_UINT64: |
|
|
|
|
return numberValues(rng, cfg) |
|
|
|
|
case datasetmd.VALUE_TYPE_BYTE_ARRAY: |
|
|
|
|
return stringValues(rng, cfg) |
|
|
|
|
default: |
|
|
|
|
panic(fmt.Sprintf("unsupported type for generation: %v", cfg.ValueType)) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func stringValues(rng *rand.Rand, cfg generatorColumnConfig) iter.Seq[Value] { |
|
|
|
|
// Pre-generate the set of unique values we'll cycle through
|
|
|
|
|
uniqueValues := make([]Value, cfg.CardinalityTarget) |
|
|
|
|
for i := range int(cfg.CardinalityTarget) { |
|
|
|
|
// Generate size between 0.5x and 1.5x of average size
|
|
|
|
|
size := int(float64(cfg.AvgSize) * (0.5 + rng.Float64())) |
|
|
|
|
|
|
|
|
|
// Convert number to string and create padded result
|
|
|
|
|
str := make([]byte, size) |
|
|
|
|
num := []byte(strconv.Itoa(i)) |
|
|
|
|
copy(str, num) |
|
|
|
|
for j := len(num); j < size; j++ { |
|
|
|
|
str[j] = 'x' |
|
|
|
|
} |
|
|
|
|
uniqueValues[i] = ByteArrayValue(str) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return func(yield func(Value) bool) { |
|
|
|
|
for { |
|
|
|
|
if !yield(uniqueValues[rng.Intn(len(uniqueValues))]) { |
|
|
|
|
return |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func numberValues(rng *rand.Rand, cfg generatorColumnConfig) iter.Seq[Value] { |
|
|
|
|
return func(yield func(Value) bool) { |
|
|
|
|
for { |
|
|
|
|
v := rng.Int63n(cfg.CardinalityTarget) |
|
|
|
|
switch cfg.ValueType { |
|
|
|
|
case datasetmd.VALUE_TYPE_INT64: |
|
|
|
|
if !yield(Int64Value(v)) { |
|
|
|
|
return |
|
|
|
|
} |
|
|
|
|
case datasetmd.VALUE_TYPE_UINT64: |
|
|
|
|
if !yield(Uint64Value(uint64(v))) { |
|
|
|
|
return |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
type DatasetGenerator struct { |
|
|
|
|
RowCount int |
|
|
|
|
PageSizeHint int |
|
|
|
|
Columns []generatorColumnConfig |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (g *DatasetGenerator) Build(t testing.TB, seed int64) (Dataset, []Column) { |
|
|
|
|
t.Helper() |
|
|
|
|
|
|
|
|
|
memColumns := make([]*MemColumn, 0, len(g.Columns)) |
|
|
|
|
rng := rand.New(rand.NewSource(seed)) |
|
|
|
|
|
|
|
|
|
for _, colCfg := range g.Columns { |
|
|
|
|
next, stop := iter.Pull(columnValues(rng, colCfg)) |
|
|
|
|
defer stop() |
|
|
|
|
|
|
|
|
|
opts := BuilderOptions{ |
|
|
|
|
PageSizeHint: g.PageSizeHint, |
|
|
|
|
Value: colCfg.ValueType, |
|
|
|
|
Encoding: colCfg.Encoding, |
|
|
|
|
Compression: colCfg.Compression, |
|
|
|
|
Statistics: StatisticsOptions{ |
|
|
|
|
StoreCardinalityStats: true, |
|
|
|
|
}, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if colCfg.ValueType == datasetmd.VALUE_TYPE_INT64 || colCfg.ValueType == datasetmd.VALUE_TYPE_UINT64 { |
|
|
|
|
opts.Statistics.StoreRangeStats = true |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Create a builder for this column
|
|
|
|
|
builder, err := NewColumnBuilder(colCfg.Name, opts) |
|
|
|
|
require.NoError(t, err) |
|
|
|
|
|
|
|
|
|
// Add values to the builder
|
|
|
|
|
for i := range g.RowCount { |
|
|
|
|
if rng.Float64() < colCfg.SparsityRate { |
|
|
|
|
continue |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
val, ok := next() |
|
|
|
|
require.True(t, ok, "generator should yield values") |
|
|
|
|
require.NoError(t, builder.Append(i, val)) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
col, err := builder.Flush() |
|
|
|
|
require.NoError(t, err) |
|
|
|
|
|
|
|
|
|
memColumns = append(memColumns, col) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
ds := FromMemory(memColumns) |
|
|
|
|
cols, err := result.Collect(ds.ListColumns(context.Background())) |
|
|
|
|
require.NoError(t, err) |
|
|
|
|
|
|
|
|
|
return ds, cols |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Test_DatasetGenerator is a helper to debug the dataset generation
|
|
|
|
|
func Test_DatasetGenerator(t *testing.T) { |
|
|
|
|
g := DatasetGenerator{ |
|
|
|
|
RowCount: 1_000_000, |
|
|
|
|
PageSizeHint: 2 * 1024 * 1024, // 2MB
|
|
|
|
|
Columns: []generatorColumnConfig{ |
|
|
|
|
{ |
|
|
|
|
Name: "timestamp", |
|
|
|
|
ValueType: datasetmd.VALUE_TYPE_INT64, |
|
|
|
|
Encoding: datasetmd.ENCODING_TYPE_DELTA, |
|
|
|
|
Compression: datasetmd.COMPRESSION_TYPE_NONE, |
|
|
|
|
CardinalityTarget: 100_000, |
|
|
|
|
SparsityRate: 0.0, |
|
|
|
|
}, |
|
|
|
|
{ |
|
|
|
|
Name: "label", |
|
|
|
|
ValueType: datasetmd.VALUE_TYPE_BYTE_ARRAY, |
|
|
|
|
Encoding: datasetmd.ENCODING_TYPE_PLAIN, |
|
|
|
|
Compression: datasetmd.COMPRESSION_TYPE_NONE, |
|
|
|
|
AvgSize: 32, |
|
|
|
|
CardinalityTarget: 100, |
|
|
|
|
SparsityRate: 0.3, |
|
|
|
|
}, |
|
|
|
|
}, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
_, cols := g.Build(t, rand.Int63()) |
|
|
|
|
require.Equal(t, 2, len(cols)) |
|
|
|
|
require.Equal(t, g.RowCount, cols[0].ColumnInfo().RowsCount) |
|
|
|
|
// TODO: Row count is < expected. Must be a result of null values at the end.
|
|
|
|
|
// Remove this comment once the issue is fixed.
|
|
|
|
|
// require.Equal(t, g.RowCount, cols[1].ColumnInfo().RowsCount)
|
|
|
|
|
|
|
|
|
|
require.NotNil(t, cols[0].ColumnInfo().Statistics.CardinalityCount) |
|
|
|
|
require.NotNil(t, cols[1].ColumnInfo().Statistics.CardinalityCount) |
|
|
|
|
|
|
|
|
|
t.Logf("timestamp column cardinality: %d", cols[0].ColumnInfo().Statistics.CardinalityCount) |
|
|
|
|
t.Logf("label column cardinality: %d", cols[1].ColumnInfo().Statistics.CardinalityCount) |
|
|
|
|
|
|
|
|
|
require.NotNil(t, cols[0].ColumnInfo().Statistics.MinValue) |
|
|
|
|
require.NotNil(t, cols[0].ColumnInfo().Statistics.MaxValue) |
|
|
|
|
|
|
|
|
|
var minValue, maxValue Value |
|
|
|
|
require.NoError(t, minValue.UnmarshalBinary(cols[0].ColumnInfo().Statistics.MinValue)) |
|
|
|
|
require.NoError(t, maxValue.UnmarshalBinary(cols[0].ColumnInfo().Statistics.MaxValue)) |
|
|
|
|
|
|
|
|
|
t.Logf("timestamp column min: %d", minValue.Int64()) |
|
|
|
|
t.Logf("timestamp column max: %d", maxValue.Int64()) |
|
|
|
|
|
|
|
|
|
t.Logf("timestamp column size: %s", humanize.Bytes(uint64(cols[0].ColumnInfo().UncompressedSize))) |
|
|
|
|
t.Logf("label column size: %s", humanize.Bytes(uint64(cols[1].ColumnInfo().UncompressedSize))) |
|
|
|
|
} |
|
|
|
|
|