chore(dataobj): column building (#15634)

This commit adds the ability to accumulate sequences of dataset.Value
into a column, which is split up across multiple pages.

Each page is broken down into two parts:

* A bitmap-encoded sequence of booleans, where 1 indicates a row has a
  value and 0 indicates the row is NULL, and

* the encoded sequence of non-NULL values, whose encoding is determined
  by the column options.

The sequence of non-NULL values is then optionally compressed.

This commit also includes initial support for reading these columns,
starting with internal-only helper utilities for unit tests.
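
As a rough sketch, the intended flow looks like the following (mirroring the
unit tests added below; this code lives inside the internal dataset package,
and error handling is elided):

opts := BuilderOptions{
    PageSizeHint: 1024,                            // soft per-page size limit
    Value:        datasetmd.VALUE_TYPE_STRING,     // type of values in the column
    Encoding:     datasetmd.ENCODING_TYPE_PLAIN,   // how values are encoded
    Compression:  datasetmd.COMPRESSION_TYPE_ZSTD, // how encoded values are compressed
}

b, _ := NewColumnBuilder("example", opts)
_ = b.Append(0, StringValue("hello"))
_ = b.Append(5, StringValue("world")) // rows 1-4 are backfilled as NULLs

col, _ := b.Flush() // cuts the in-progress page and returns a *MemColumn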
Branch: pull/15652/head
Author: Robert Fratto (committed via GitHub)
Commit: f2fc0c26c2 (parent: 672f91c3f9)
13 changed files:

   25  pkg/dataobj/internal/dataset/column.go
  173  pkg/dataobj/internal/dataset/column_builder.go
   20  pkg/dataobj/internal/dataset/column_iter.go
   61  pkg/dataobj/internal/dataset/column_test.go
   99  pkg/dataobj/internal/dataset/page.go
  240  pkg/dataobj/internal/dataset/page_builder.go
  123  pkg/dataobj/internal/dataset/page_compress_writer.go
   53  pkg/dataobj/internal/dataset/page_iter.go
   83  pkg/dataobj/internal/dataset/page_test.go
  518  pkg/dataobj/internal/metadata/datasetmd/datasetmd.pb.go
   26  pkg/dataobj/internal/metadata/datasetmd/datasetmd.proto
  101  pkg/dataobj/internal/result/result.go
   18  pkg/dataobj/internal/streamio/varint.go

pkg/dataobj/internal/dataset/column.go:
@@ -0,0 +1,25 @@
package dataset
import "github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd"
// Helper types.
type (
// ColumnInfo describes a column.
ColumnInfo struct {
Name string // Name of the column, if any.
Type datasetmd.ValueType // Type of values in the column.
Compression datasetmd.CompressionType // Compression used for the column.
RowsCount int // Total number of rows in the column.
CompressedSize int // Total size of all pages in the column after compression.
UncompressedSize int // Total size of all pages in the column before compression.
Statistics *datasetmd.Statistics // Optional statistics for the column.
}
)
// MemColumn holds a set of pages of a common type.
type MemColumn struct {
Info ColumnInfo // Information about the column.
Pages []*MemPage // The set of pages in the column.
}

pkg/dataobj/internal/dataset/column_builder.go:
@@ -0,0 +1,173 @@
package dataset
import (
"fmt"
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd"
)
// BuilderOptions configures common settings for building pages.
type BuilderOptions struct {
// PageSizeHint is the soft limit for the size of the page. Builders try to
// fill pages as close to this size as possible, but the actual size may be
// slightly larger or smaller.
PageSizeHint int
// Value is the value type of data to write.
Value datasetmd.ValueType
// Encoding is the encoding algorithm to use for values.
Encoding datasetmd.EncodingType
// Compression is the compression algorithm to use for values.
Compression datasetmd.CompressionType
}
// A ColumnBuilder builds a sequence of [Value] entries of a common type into a
// column. Values are accumulated into a buffer and then flushed into
// [MemPage]s once the size of data exceeds a configurable limit.
type ColumnBuilder struct {
name string
opts BuilderOptions
rows int // Total number of rows in the column.
pages []*MemPage
builder *pageBuilder
}
// NewColumnBuilder creates a new ColumnBuilder from the optional name and
// provided options. NewColumnBuilder returns an error if the options are
// invalid.
func NewColumnBuilder(name string, opts BuilderOptions) (*ColumnBuilder, error) {
builder, err := newPageBuilder(opts)
if err != nil {
return nil, fmt.Errorf("creating page builder: %w", err)
}
return &ColumnBuilder{
name: name,
opts: opts,
builder: builder,
}, nil
}
// Append adds a new value into cb with the given zero-indexed row number. If
// the row number is higher than the current number of rows in cb, null values
// are added up to the new row.
//
// Append returns an error if the row number is out-of-order.
func (cb *ColumnBuilder) Append(row int, value Value) error {
if row < cb.rows {
return fmt.Errorf("row %d is older than current row %d", row, cb.rows)
}
// We give two attempts to append the data to the buffer; if the buffer is
// full, we cut a page and then append to the newly reset buffer.
//
// The second iteration should never fail, as the buffer will always be empty
// then.
for range 2 {
if cb.append(row, value) {
cb.rows = row + 1
return nil
}
cb.flushPage()
}
panic("ColumnBuilder.Append: failed to append value to fresh buffer")
}
// Backfill adds NULLs into cb up to (but not including) the provided row
// number. If values exist up to the provided row number, Backfill does
// nothing.
func (cb *ColumnBuilder) Backfill(row int) {
// We give two attempts to append the data to the buffer; if the buffer is
// full, we cut a page and then append again to the newly reset buffer.
//
// The second iteration should never fail, as the buffer will always be
// empty.
for range 2 {
if cb.backfill(row) {
return
}
cb.flushPage()
}
panic("ColumnBuilder.Backfill: failed to backfill buffer")
}
func (cb *ColumnBuilder) backfill(row int) bool {
for row > cb.rows {
if !cb.builder.AppendNull() {
return false
}
cb.rows++
}
return true
}
func (cb *ColumnBuilder) append(row int, value Value) bool {
// Backfill up to row.
if !cb.backfill(row) {
return false
}
return cb.builder.Append(value)
}
// Flush converts data in cb into a [MemColumn]. Afterwards, cb is reset to a
// fresh state and can be reused.
func (cb *ColumnBuilder) Flush() (*MemColumn, error) {
cb.flushPage()
info := ColumnInfo{
Name: cb.name,
Type: cb.opts.Value,
Compression: cb.opts.Compression,
}
// TODO(rfratto): Should we compute column-wide statistics if they're
// available in pages?
//
// That would potentially work for min/max values, but not for count
// distinct, unless we had a way to pass sketches around.
for _, page := range cb.pages {
info.RowsCount += page.Info.RowCount
info.CompressedSize += page.Info.CompressedSize
info.UncompressedSize += page.Info.UncompressedSize
}
column := &MemColumn{
Info: info,
Pages: cb.pages,
}
cb.Reset()
return column, nil
}
func (cb *ColumnBuilder) flushPage() {
if cb.builder.Rows() == 0 {
return
}
page, err := cb.builder.Flush()
if err != nil {
// Flush only returns an error when the page builder is empty, which the
// check above already rules out.
panic(fmt.Sprintf("failed to flush page: %s", err))
}
cb.pages = append(cb.pages, page)
}
// Reset clears all data in cb and resets it to a fresh state.
func (cb *ColumnBuilder) Reset() {
cb.rows = 0
cb.pages = nil
cb.builder.Reset()
}
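
Backfill is useful when a column must be padded with NULLs to match the row
count of other columns without appending a value; a minimal sketch, reusing
the opts from the sketch above:

b, _ := NewColumnBuilder("sparse", opts)
_ = b.Append(0, StringValue("first"))
b.Backfill(10) // rows 1 through 9 become NULL; row 10 itself is not created

col, _ := b.Flush() // col.Info.RowsCount == 10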

pkg/dataobj/internal/dataset/column_iter.go:
@@ -0,0 +1,20 @@
package dataset
import "github.com/grafana/loki/v3/pkg/dataobj/internal/result"
func iterMemColumn(col *MemColumn) result.Seq[Value] {
return result.Iter(func(yield func(Value) bool) error {
for _, page := range col.Pages {
for result := range iterMemPage(page, col.Info.Type, col.Info.Compression) {
val, err := result.Value()
if err != nil {
return err
} else if !yield(val) {
return nil
}
}
}
return nil
})
}
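
Because [result.Seq] is push-style, callers that need pull semantics can wrap
the iterator with [result.Pull]; a small sketch (internal to the dataset
package, with col a *MemColumn built as above):

next, stop := result.Pull(iterMemColumn(col))
defer stop()

if res, ok := next(); ok {
    val, err := res.Value()
    // use val, handle err
    _, _ = val, err
}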

pkg/dataobj/internal/dataset/column_test.go:
@@ -0,0 +1,61 @@
package dataset
import (
"testing"
"github.com/stretchr/testify/require"
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd"
)
func TestColumnBuilder_ReadWrite(t *testing.T) {
in := []string{
"hello, world!",
"",
"this is a test of the emergency broadcast system",
"this is only a test",
"if this were a real emergency, you would be instructed to panic",
"but it's not, so don't",
"",
"this concludes the test",
"thank you for your cooperation",
"goodbye",
}
opts := BuilderOptions{
// Set the page size hint to 0 so every page gets exactly one value.
PageSizeHint: 0,
Value: datasetmd.VALUE_TYPE_STRING,
Compression: datasetmd.COMPRESSION_TYPE_ZSTD,
Encoding: datasetmd.ENCODING_TYPE_PLAIN,
}
b, err := NewColumnBuilder("", opts)
require.NoError(t, err)
for i, s := range in {
require.NoError(t, b.Append(i, StringValue(s)))
}
col, err := b.Flush()
require.NoError(t, err)
require.Equal(t, datasetmd.VALUE_TYPE_STRING, col.Info.Type)
require.Greater(t, len(col.Pages), 1)
t.Log("Uncompressed size: ", col.Info.UncompressedSize)
t.Log("Compressed size: ", col.Info.CompressedSize)
t.Log("Pages: ", len(col.Pages))
var actual []string
for result := range iterMemColumn(col) {
val, err := result.Value()
require.NoError(t, err)
if val.IsNil() || val.IsZero() {
actual = append(actual, "")
} else {
require.Equal(t, datasetmd.VALUE_TYPE_STRING, val.Type())
actual = append(actual, val.String())
}
}
require.Equal(t, in, actual)
}

pkg/dataobj/internal/dataset/page.go:
@@ -0,0 +1,99 @@
package dataset
import (
"bytes"
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"github.com/golang/snappy"
"github.com/klauspost/compress/zstd"
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd"
)
// Helper types.
type (
// PageData holds the raw data for a page. Data is formatted as:
//
// <uvarint(presence-bitmap-size)> <presence-bitmap> <values-data>
//
// The presence-bitmap is a bitmap-encoded sequence of booleans, where values
// describe which rows are present (1) or nil (0). The presence bitmap is
// always stored uncompressed.
//
// values-data is then the encoded and optionally compressed sequence of
// non-NULL values.
PageData []byte
// PageInfo describes a page.
PageInfo struct {
UncompressedSize int // UncompressedSize is the size of a page before compression.
CompressedSize int // CompressedSize is the size of a page after compression.
CRC32 uint32 // CRC32 checksum of the page after encoding and compression.
RowCount int // RowCount is the number of rows in the page, including NULLs.
Encoding datasetmd.EncodingType // Encoding used for values in the page.
Stats *datasetmd.Statistics // Optional statistics for the page.
}
)
// MemPage holds an encoded (and optionally compressed) sequence of [Value]
// entries of a common type. Use [ColumnBuilder] to construct sets of pages.
type MemPage struct {
Info PageInfo // Information about the page.
Data PageData // Data for the page.
}
var checksumTable = crc32.MakeTable(crc32.Castagnoli)
// reader returns a reader for decompressed page data. reader returns an
// error if the CRC32 checksum of the page data fails to validate.
func (p *MemPage) reader(compression datasetmd.CompressionType) (presence io.Reader, values io.ReadCloser, err error) {
if actual := crc32.Checksum(p.Data, checksumTable); p.Info.CRC32 != actual {
return nil, nil, fmt.Errorf("invalid CRC32 checksum %x, expected %x", actual, p.Info.CRC32)
}
bitmapSize, n := binary.Uvarint(p.Data)
if n <= 0 {
return nil, nil, fmt.Errorf("reading presence bitmap size: %w", err)
}
var (
bitmapReader = bytes.NewReader(p.Data[n : n+int(bitmapSize)])
compressedDataReader = bytes.NewReader(p.Data[n+int(bitmapSize):])
)
switch compression {
case datasetmd.COMPRESSION_TYPE_UNSPECIFIED, datasetmd.COMPRESSION_TYPE_NONE:
return bitmapReader, io.NopCloser(compressedDataReader), nil
case datasetmd.COMPRESSION_TYPE_SNAPPY:
sr := snappy.NewReader(compressedDataReader)
return bitmapReader, io.NopCloser(sr), nil
case datasetmd.COMPRESSION_TYPE_ZSTD:
zr, err := zstd.NewReader(compressedDataReader)
if err != nil {
return nil, nil, fmt.Errorf("opening zstd reader: %w", err)
}
return bitmapReader, newZstdReader(zr), nil
}
panic(fmt.Sprintf("dataset.MemPage.reader: unknown compression type %q", compression.String()))
}
// zstdReader implements [io.ReadCloser] for a [zstd.Decoder].
type zstdReader struct{ *zstd.Decoder }
// newZstdReader returns a new [io.ReadCloser] for a [zstd.Decoder].
func newZstdReader(dec *zstd.Decoder) io.ReadCloser {
return &zstdReader{Decoder: dec}
}
// Close implements [io.Closer].
func (r *zstdReader) Close() error {
r.Decoder.Close()
return nil
}
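
For reference, the framing documented on [PageData] can be reproduced with
just the standard library; a minimal sketch, assuming bitmap and values are
already-encoded byte slices:

var data PageData
data = binary.AppendUvarint(data, uint64(len(bitmap))) // <uvarint(presence-bitmap-size)>
data = append(data, bitmap...)                         // <presence-bitmap>, always uncompressed
data = append(data, values...)                         // <values-data>, optionally compressed
checksum := crc32.Checksum(data, checksumTable)        // CRC32 (Castagnoli) over the whole page
_ = checksum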

pkg/dataobj/internal/dataset/page_builder.go:
@@ -0,0 +1,240 @@
package dataset
import (
"bytes"
"encoding/binary"
"fmt"
"hash/crc32"
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd"
"github.com/grafana/loki/v3/pkg/dataobj/internal/streamio"
)
// pageBuilder accumulates sequences of [Value] in memory until reaching a
// configurable size limit. A [MemPage] can then be created from a pageBuilder
// by calling [pageBuilder.Flush].
type pageBuilder struct {
// Each pageBuilder writes two sets of data.
//
// The first set of data is a presence bitmap which tells readers which rows
// are present. We use 1 to indicate presence and 0 to indicate absence
// (NULL). This bitmap always uses bitmap encoding regardless of the encoding
// type used for values.
//
// The second set of data is the encoded set of non-NULL values. As an
// optimization, the zero value is treated as NULL.
//
// The two sets of data are accumulated into separate buffers, with the
// presence bitmap being written uncompressed and the values being written
// with the configured compression type, if any.
//
// To orchestrate building two sets of data, we have a few components:
//
// * The final buffers which hold encoded and potentially compressed data.
// * The writer performing compression for values.
// * The encoders that write values.
opts BuilderOptions
presenceBuffer *bytes.Buffer // presenceBuffer holds the encoded presence bitmap.
valuesBuffer *bytes.Buffer // valuesBuffer holds encoded and optionally compressed values.
valuesWriter *compressWriter // Compresses data and writes to valuesBuffer.
presenceEnc *bitmapEncoder
valuesEnc valueEncoder
rows int // Number of rows appended to the builder.
}
// newPageBuilder creates a new pageBuilder that stores a sequence of [Value]s.
// newPageBuilder returns an error if there is no encoder available for the
// combination of opts.Value and opts.Encoding.
func newPageBuilder(opts BuilderOptions) (*pageBuilder, error) {
var (
presenceBuffer = bytes.NewBuffer(nil)
valuesBuffer = bytes.NewBuffer(make([]byte, 0, opts.PageSizeHint))
valuesWriter = newCompressWriter(valuesBuffer, opts.Compression)
)
presenceEnc := newBitmapEncoder(presenceBuffer)
valuesEnc, ok := newValueEncoder(opts.Value, opts.Encoding, valuesWriter)
if !ok {
return nil, fmt.Errorf("no encoder available for %s/%s", opts.Value, opts.Encoding)
}
return &pageBuilder{
opts: opts,
presenceBuffer: presenceBuffer,
valuesBuffer: valuesBuffer,
valuesWriter: valuesWriter,
presenceEnc: presenceEnc,
valuesEnc: valuesEnc,
}, nil
}
// Append appends value into the pageBuilder. Append returns true if the data
// was appended; false if the pageBuilder is full.
func (b *pageBuilder) Append(value Value) bool {
if value.IsNil() || value.IsZero() {
return b.AppendNull()
}
// We can't accurately know whether adding value would tip us over the page
// size: we don't know the current state of the encoders and we don't know
// for sure how much space value will fill.
//
// We use a rough estimate which will tend to overshoot the page size, making
// sure we rarely go over.
if sz := b.estimatedSize(); sz > 0 && sz+valueSize(value) > b.opts.PageSizeHint {
return false
}
// The following calls won't fail; they only return errors when the
// underlying writers fail, which ours cannot.
if err := b.presenceEnc.Encode(Uint64Value(1)); err != nil {
panic(fmt.Sprintf("pageBuilder.Append: encoding presence bitmap entry: %v", err))
}
if err := b.valuesEnc.Encode(value); err != nil {
panic(fmt.Sprintf("pageBuilder.Append: encoding value: %v", err))
}
b.rows++
return true
}
// AppendNull appends a NULL value to the Builder. AppendNull returns true if
// the NULL was appended, or false if the Builder is full.
func (b *pageBuilder) AppendNull() bool {
// See comment in Append for why we can only estimate the cost of appending a
// value.
//
// Here we assume appending a NULL costs one byte, but in reality most NULLs
// have no cost depending on the state of our bitmap encoder.
if sz := b.estimatedSize(); sz > 0 && sz+1 > b.opts.PageSizeHint {
return false
}
// The following call won't fail; it only returns an error when the
// underlying writer fails, which ours cannot.
if err := b.presenceEnc.Encode(Uint64Value(0)); err != nil {
panic(fmt.Sprintf("Builder.AppendNull: encoding presence bitmap entry: %v", err))
}
b.rows++
return true
}
func valueSize(v Value) int {
switch v.Type() {
case datasetmd.VALUE_TYPE_INT64:
// Assuming that int64s are written as varints.
return streamio.VarintSize(v.Int64())
case datasetmd.VALUE_TYPE_UINT64:
// Assuming that uint64s are written as uvarints.
return streamio.UvarintSize(v.Uint64())
case datasetmd.VALUE_TYPE_STRING:
// Assuming that strings are PLAIN encoded as a uvarint length followed by
// the raw string bytes.
str := v.String()
return streamio.UvarintSize(uint64(len(str))) + len(str)
}
return 0
}
// estimatedSize returns the estimated uncompressed size of the builder in
// bytes.
func (b *pageBuilder) estimatedSize() int {
// This estimate doesn't account for any values in encoders which haven't
// been flushed yet. However, encoder buffers are usually small enough that
// we wouldn't massively overshoot our estimate.
return b.presenceBuffer.Len() + b.valuesWriter.BytesWritten()
}
// Rows returns the number of rows appended to the pageBuilder.
func (b *pageBuilder) Rows() int { return b.rows }
// Flush converts data in pageBuilder into a [MemPage], and returns it.
// Afterwards, pageBuilder is reset to a fresh state and can be reused. Flush
// returns an error if the pageBuilder is empty.
//
// To avoid computing useless stats, the Stats field of the returned Page is
// unset. If stats are needed for a page, callers should compute them by
// iterating over the returned Page.
func (b *pageBuilder) Flush() (*MemPage, error) {
if b.rows == 0 {
return nil, fmt.Errorf("no data to flush")
}
// Before we can build the page we need to finish flushing our encoders and writers.
if err := b.presenceEnc.Flush(); err != nil {
return nil, fmt.Errorf("flushing presence encoder: %w", err)
} else if err := b.valuesEnc.Flush(); err != nil {
return nil, fmt.Errorf("flushing values encoder: %w", err)
} else if err := b.valuesWriter.Flush(); err != nil {
return nil, fmt.Errorf("flushing values writer: %w", err)
}
// The final data of our page is the combination of the presence bitmap and
// the values. To denote when one ends and the other begins, we prepend the
// data with the size of the presence bitmap as a uvarint. See the doc
// comment of [PageData] for more information.
var (
headerSize = streamio.UvarintSize(uint64(b.presenceBuffer.Len()))
presenceSize = b.presenceBuffer.Len()
valuesSize = b.valuesBuffer.Len()
finalData = bytes.NewBuffer(make([]byte, 0, headerSize+presenceSize+valuesSize))
)
if err := streamio.WriteUvarint(finalData, uint64(b.presenceBuffer.Len())); err != nil {
return nil, fmt.Errorf("writing presence buffer size: %w", err)
} else if _, err := b.presenceBuffer.WriteTo(finalData); err != nil {
return nil, fmt.Errorf("writing presence buffer: %w", err)
} else if _, err := b.valuesBuffer.WriteTo(finalData); err != nil {
return nil, fmt.Errorf("writing values buffer: %w", err)
}
checksum := crc32.Checksum(finalData.Bytes(), checksumTable)
page := MemPage{
Info: PageInfo{
UncompressedSize: headerSize + presenceSize + b.valuesWriter.BytesWritten(),
CompressedSize: finalData.Len(),
CRC32: checksum,
RowCount: b.rows,
Encoding: b.opts.Encoding,
// TODO(rfratto): At the moment we don't compute stats because they're
// not going to be valuable in every scenario: the min/max values for log
// lines is less useful compared to the min/max values for timestamps.
//
// In the future, we may wish to add more options to pageBuilder to tell
// it to compute a subset of stats to avoid needing a second iteration
// over the page to compute them.
Stats: nil,
},
Data: finalData.Bytes(),
}
b.Reset() // Reset state before returning.
return &page, nil
}
// Reset resets the pageBuilder to a fresh state, allowing it to be reused.
func (b *pageBuilder) Reset() {
b.presenceBuffer.Reset()
b.valuesBuffer.Reset()
b.valuesWriter.Reset(b.valuesBuffer)
b.presenceEnc.Reset(b.presenceBuffer)
b.valuesEnc.Reset(b.valuesWriter)
b.rows = 0
}

pkg/dataobj/internal/dataset/page_compress_writer.go:
@@ -0,0 +1,123 @@
package dataset
import (
"bufio"
"fmt"
"io"
"github.com/golang/snappy"
"github.com/klauspost/compress/zstd"
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd"
"github.com/grafana/loki/v3/pkg/dataobj/internal/streamio"
)
// A compressWriter is a [streamio.Writer] that compresses data passed to it.
type compressWriter struct {
// To be able to implement [io.ByteWriter], we always write directly to buf,
// which then flushes to w once it's full.
w io.WriteCloser // Compressing writer.
buf *bufio.Writer // Buffered writer in front of w to be able to call WriteByte.
compression datasetmd.CompressionType // Compression type being used.
rawBytes int // Number of uncompressed bytes written.
}
var _ streamio.Writer = (*compressWriter)(nil)
func newCompressWriter(w io.Writer, ty datasetmd.CompressionType) *compressWriter {
c := compressWriter{compression: ty}
c.Reset(w)
return &c
}
// Write writes p to c.
func (c *compressWriter) Write(p []byte) (n int, err error) {
n, err = c.buf.Write(p)
c.rawBytes += n
return
}
// WriteByte writes a single byte to c.
func (c *compressWriter) WriteByte(b byte) error {
if err := c.buf.WriteByte(b); err != nil {
return err
}
c.rawBytes++
return nil
}
// Flush compresses any pending uncompressed data in the buffer.
func (c *compressWriter) Flush() error {
// Flush our buffer first so c.w is up to date.
if err := c.buf.Flush(); err != nil {
return fmt.Errorf("flushing buffer: %w", err)
}
// c.w may not support Flush (such as when using no compression), so we check
// first.
if f, ok := c.w.(interface{ Flush() error }); ok {
if err := f.Flush(); err != nil {
return fmt.Errorf("flushing compressing writer: %w", err)
}
}
return nil
}
// Reset discards the writer's state and switches the compressor to write to w.
// This permits reusing a compressWriter rather than allocating a new one.
func (c *compressWriter) Reset(w io.Writer) {
if resetter, ok := c.w.(interface{ Reset(io.Writer) }); ok {
resetter.Reset(w)
} else {
// c.w is unset or doesn't support Reset; build a new writer.
var compressedWriter io.WriteCloser
switch c.compression {
case datasetmd.COMPRESSION_TYPE_UNSPECIFIED, datasetmd.COMPRESSION_TYPE_NONE:
compressedWriter = nopCloseWriter{w}
case datasetmd.COMPRESSION_TYPE_SNAPPY:
compressedWriter = snappy.NewBufferedWriter(w)
case datasetmd.COMPRESSION_TYPE_ZSTD:
zw, err := zstd.NewWriter(w, zstd.WithEncoderLevel(zstd.SpeedBestCompression))
if err != nil {
panic(fmt.Sprintf("compressWriter.Reset: creating zstd writer: %v", err))
}
compressedWriter = zw
default:
panic(fmt.Sprintf("compressWriter.Reset: unknown compression type %v", c.compression))
}
c.w = compressedWriter
}
if c.buf != nil {
c.buf.Reset(c.w)
} else {
c.buf = bufio.NewWriter(c.w)
}
c.rawBytes = 0
}
// BytesWritten returns the number of uncompressed bytes written to c.
func (c *compressWriter) BytesWritten() int { return c.rawBytes }
// Close flushes and then closes c.
func (c *compressWriter) Close() error {
if err := c.Flush(); err != nil {
return err
}
return c.w.Close()
}
type nopCloseWriter struct{ w io.Writer }
func (w nopCloseWriter) Write(p []byte) (n int, err error) { return w.w.Write(p) }
func (w nopCloseWriter) Close() error { return nil }
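
A quick sketch of how the page builder uses this writer (the compression
type here is illustrative):

var buf bytes.Buffer
w := newCompressWriter(&buf, datasetmd.COMPRESSION_TYPE_SNAPPY)

_, _ = w.Write([]byte("some value data"))
_ = w.WriteByte('!')
_ = w.Flush() // force buffered bytes through the compressor into buf

raw := w.BytesWritten() // 16: uncompressed bytes handed to the writer
compressed := buf.Len() // size after snappy framing/compression
_, _ = raw, compressed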

pkg/dataobj/internal/dataset/page_iter.go:
@@ -0,0 +1,53 @@
package dataset
import (
"bufio"
"errors"
"fmt"
"io"
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd"
"github.com/grafana/loki/v3/pkg/dataobj/internal/result"
)
func iterMemPage(p *MemPage, valueType datasetmd.ValueType, compressionType datasetmd.CompressionType) result.Seq[Value] {
return result.Iter(func(yield func(Value) bool) error {
presenceReader, valuesReader, err := p.reader(compressionType)
if err != nil {
return fmt.Errorf("opening page for reading: %w", err)
}
defer valuesReader.Close()
presenceDec := newBitmapDecoder(bufio.NewReader(presenceReader))
valuesDec, ok := newValueDecoder(valueType, p.Info.Encoding, bufio.NewReader(valuesReader))
if !ok {
return fmt.Errorf("no decoder available for %s/%s", valueType, p.Info.Encoding)
}
for {
var value Value
present, err := presenceDec.Decode()
if errors.Is(err, io.EOF) {
return nil
} else if err != nil {
return fmt.Errorf("decoding presence bitmap: %w", err)
} else if present.Type() != datasetmd.VALUE_TYPE_UINT64 {
return fmt.Errorf("unexpected presence type %s", present.Type())
}
// value is currently nil. If the presence bitmap says our row has a
// value, we decode it into value.
if present.Uint64() == 1 {
value, err = valuesDec.Decode()
if err != nil {
return fmt.Errorf("decoding value: %w", err)
}
}
if !yield(value) {
return nil
}
}
})
}

pkg/dataobj/internal/dataset/page_test.go:
@@ -0,0 +1,83 @@
package dataset
import (
"math/rand"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd"
)
func Test_pageBuilder_WriteRead(t *testing.T) {
in := []string{
"hello, world!",
"",
"this is a test of the emergency broadcast system",
"this is only a test",
"if this were a real emergency, you would be instructed to panic",
"but it's not, so don't",
"",
"this concludes the test",
"thank you for your cooperation",
"goodbye",
}
opts := BuilderOptions{
PageSizeHint: 1024,
Value: datasetmd.VALUE_TYPE_STRING,
Compression: datasetmd.COMPRESSION_TYPE_ZSTD,
Encoding: datasetmd.ENCODING_TYPE_PLAIN,
}
b, err := newPageBuilder(opts)
require.NoError(t, err)
for _, s := range in {
require.True(t, b.Append(StringValue(s)))
}
page, err := b.Flush()
require.NoError(t, err)
t.Log("Uncompressed size: ", page.Info.UncompressedSize)
t.Log("Compressed size: ", page.Info.CompressedSize)
var actual []string
for result := range iterMemPage(page, opts.Value, opts.Compression) {
val, err := result.Value()
require.NoError(t, err)
if val.IsNil() || val.IsZero() {
actual = append(actual, "")
} else {
require.Equal(t, datasetmd.VALUE_TYPE_STRING, val.Type())
actual = append(actual, val.String())
}
}
require.Equal(t, in, actual)
}
func Test_pageBuilder_Fill(t *testing.T) {
opts := BuilderOptions{
PageSizeHint: 1_500_000,
Value: datasetmd.VALUE_TYPE_INT64,
Compression: datasetmd.COMPRESSION_TYPE_NONE,
Encoding: datasetmd.ENCODING_TYPE_DELTA,
}
buf, err := newPageBuilder(opts)
require.NoError(t, err)
ts := time.Now().UTC()
for buf.Append(Int64Value(ts.UnixNano())) {
ts = ts.Add(time.Duration(rand.Intn(5000)) * time.Millisecond)
}
page, err := buf.Flush()
require.NoError(t, err)
require.Equal(t, page.Info.UncompressedSize, page.Info.CompressedSize)
t.Log("Uncompressed size: ", page.Info.UncompressedSize)
t.Log("Compressed size: ", page.Info.CompressedSize)
t.Log("Row count: ", page.Info.RowCount)
}

pkg/dataobj/internal/metadata/datasetmd/datasetmd.pb.go:
@@ -4,10 +4,15 @@
package datasetmd
import (
bytes "bytes"
fmt "fmt"
proto "github.com/gogo/protobuf/proto"
io "io"
math "math"
math_bits "math/bits"
reflect "reflect"
strconv "strconv"
strings "strings"
)
// Reference imports to suppress errors if they are not otherwise used.
@@ -88,9 +93,99 @@ func (EncodingType) EnumDescriptor() ([]byte, []int) {
return fileDescriptor_7ab9d5b21b743868, []int{1}
}
// CompressionType represents the valid compression types that can be used for
// compressing values in a page.
type CompressionType int32
const (
// Invalid compression type.
COMPRESSION_TYPE_UNSPECIFIED CompressionType = 0
// No compression.
COMPRESSION_TYPE_NONE CompressionType = 1
// Snappy compression.
COMPRESSION_TYPE_SNAPPY CompressionType = 2
// Zstd compression.
COMPRESSION_TYPE_ZSTD CompressionType = 3
)
var CompressionType_name = map[int32]string{
0: "COMPRESSION_TYPE_UNSPECIFIED",
1: "COMPRESSION_TYPE_NONE",
2: "COMPRESSION_TYPE_SNAPPY",
3: "COMPRESSION_TYPE_ZSTD",
}
var CompressionType_value = map[string]int32{
"COMPRESSION_TYPE_UNSPECIFIED": 0,
"COMPRESSION_TYPE_NONE": 1,
"COMPRESSION_TYPE_SNAPPY": 2,
"COMPRESSION_TYPE_ZSTD": 3,
}
func (CompressionType) EnumDescriptor() ([]byte, []int) {
return fileDescriptor_7ab9d5b21b743868, []int{2}
}
// Statistics about a column or a page. All statistics are optional and are
// conditionally set depending on the column type.
type Statistics struct {
// Minimum value.
MinValue []byte `protobuf:"bytes,1,opt,name=min_value,json=minValue,proto3" json:"min_value,omitempty"`
// Maximum value.
MaxValue []byte `protobuf:"bytes,2,opt,name=max_value,json=maxValue,proto3" json:"max_value,omitempty"`
}
func (m *Statistics) Reset() { *m = Statistics{} }
func (*Statistics) ProtoMessage() {}
func (*Statistics) Descriptor() ([]byte, []int) {
return fileDescriptor_7ab9d5b21b743868, []int{0}
}
func (m *Statistics) XXX_Unmarshal(b []byte) error {
return m.Unmarshal(b)
}
func (m *Statistics) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
if deterministic {
return xxx_messageInfo_Statistics.Marshal(b, m, deterministic)
} else {
b = b[:cap(b)]
n, err := m.MarshalToSizedBuffer(b)
if err != nil {
return nil, err
}
return b[:n], nil
}
}
func (m *Statistics) XXX_Merge(src proto.Message) {
xxx_messageInfo_Statistics.Merge(m, src)
}
func (m *Statistics) XXX_Size() int {
return m.Size()
}
func (m *Statistics) XXX_DiscardUnknown() {
xxx_messageInfo_Statistics.DiscardUnknown(m)
}
var xxx_messageInfo_Statistics proto.InternalMessageInfo
func (m *Statistics) GetMinValue() []byte {
if m != nil {
return m.MinValue
}
return nil
}
func (m *Statistics) GetMaxValue() []byte {
if m != nil {
return m.MaxValue
}
return nil
}
func init() {
proto.RegisterEnum("dataobj.metadata.dataset.v1.ValueType", ValueType_name, ValueType_value)
proto.RegisterEnum("dataobj.metadata.dataset.v1.EncodingType", EncodingType_name, EncodingType_value)
proto.RegisterEnum("dataobj.metadata.dataset.v1.CompressionType", CompressionType_name, CompressionType_value)
proto.RegisterType((*Statistics)(nil), "dataobj.metadata.dataset.v1.Statistics")
}
func init() {
@@ -98,27 +193,34 @@ func init() {
}
var fileDescriptor_7ab9d5b21b743868 = []byte{
// 310 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x32, 0x2f, 0xc8, 0x4e, 0xd7,
0x4f, 0x49, 0x2c, 0x49, 0xcc, 0x4f, 0xca, 0xd2, 0xcf, 0xcc, 0x2b, 0x49, 0x2d, 0xca, 0x4b, 0xcc,
0xd1, 0xcf, 0x4d, 0x2d, 0x49, 0x04, 0x09, 0x82, 0x65, 0x8a, 0x53, 0x4b, 0x72, 0x53, 0x10, 0x2c,
0xbd, 0x82, 0xa2, 0xfc, 0x92, 0x7c, 0x21, 0x69, 0xa8, 0x26, 0x3d, 0x98, 0x5a, 0x3d, 0xa8, 0x0a,
0xbd, 0x32, 0x43, 0xad, 0x6c, 0x2e, 0xce, 0xb0, 0xc4, 0x9c, 0xd2, 0xd4, 0x90, 0xca, 0x82, 0x54,
0x21, 0x29, 0x2e, 0xb1, 0x30, 0x47, 0x9f, 0x50, 0xd7, 0xf8, 0x90, 0xc8, 0x00, 0xd7, 0xf8, 0x50,
0xbf, 0xe0, 0x00, 0x57, 0x67, 0x4f, 0x37, 0x4f, 0x57, 0x17, 0x01, 0x06, 0x21, 0x11, 0x2e, 0x01,
0x24, 0x39, 0x4f, 0xbf, 0x10, 0x33, 0x13, 0x01, 0x46, 0x21, 0x51, 0x2e, 0x41, 0x64, 0x1d, 0x10,
0x61, 0x26, 0x34, 0xe1, 0xe0, 0x90, 0x20, 0x4f, 0x3f, 0x77, 0x01, 0x66, 0xad, 0x4a, 0x2e, 0x1e,
0xd7, 0xbc, 0xe4, 0xfc, 0x94, 0xcc, 0xbc, 0x74, 0xb0, 0x7d, 0xb2, 0x5c, 0x92, 0xae, 0x7e, 0xce,
0xfe, 0x2e, 0x9e, 0x7e, 0xee, 0xd8, 0xac, 0x14, 0xe7, 0x12, 0x46, 0x95, 0x0e, 0xf0, 0x71, 0xf4,
0xf4, 0x13, 0x60, 0xc4, 0x94, 0x70, 0x71, 0xf5, 0x09, 0x71, 0x14, 0x60, 0x12, 0x92, 0xe0, 0x12,
0x41, 0x95, 0x70, 0xf2, 0x0c, 0xf1, 0x75, 0x0c, 0x10, 0x60, 0x76, 0xaa, 0xb8, 0xf0, 0x50, 0x8e,
0xe1, 0xc6, 0x43, 0x39, 0x86, 0x0f, 0x0f, 0xe5, 0x18, 0x1b, 0x1e, 0xc9, 0x31, 0xae, 0x78, 0x24,
0xc7, 0x78, 0xe2, 0x91, 0x1c, 0xe3, 0x85, 0x47, 0x72, 0x8c, 0x0f, 0x1e, 0xc9, 0x31, 0xbe, 0x78,
0x24, 0xc7, 0xf0, 0xe1, 0x91, 0x1c, 0xe3, 0x84, 0xc7, 0x72, 0x0c, 0x17, 0x1e, 0xcb, 0x31, 0xdc,
0x78, 0x2c, 0xc7, 0x10, 0xe5, 0x94, 0x9e, 0x59, 0x92, 0x51, 0x9a, 0xa4, 0x97, 0x9c, 0x9f, 0xab,
0x9f, 0x5e, 0x94, 0x98, 0x96, 0x98, 0x97, 0xa8, 0x9f, 0x93, 0x9f, 0x9d, 0xa9, 0x5f, 0x66, 0xac,
0x4f, 0x64, 0x74, 0x24, 0xb1, 0x81, 0x63, 0xc1, 0x18, 0x10, 0x00, 0x00, 0xff, 0xff, 0xee, 0xfd,
0x13, 0xfe, 0xc0, 0x01, 0x00, 0x00,
// 420 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x92, 0x31, 0x6f, 0xd3, 0x40,
0x1c, 0xc5, 0x7d, 0x89, 0x84, 0xe8, 0x5f, 0x95, 0x38, 0x4c, 0x4b, 0x5b, 0x02, 0xa7, 0x8a, 0x09,
0x65, 0xb0, 0x85, 0x8a, 0x60, 0x76, 0x92, 0x6b, 0x75, 0x52, 0x7a, 0xb1, 0x62, 0xb7, 0x52, 0xbb,
0x44, 0x97, 0xc4, 0x84, 0x23, 0xf1, 0x9d, 0x65, 0x5f, 0xa3, 0x74, 0x63, 0x62, 0xe6, 0x63, 0xf0,
0x51, 0x18, 0x33, 0x76, 0x24, 0xce, 0xc2, 0xd8, 0x8f, 0x80, 0x6a, 0x8c, 0x68, 0x48, 0x86, 0x6e,
0xff, 0x7b, 0xbf, 0xf7, 0xf4, 0x6e, 0x78, 0xf0, 0x21, 0x19, 0x8f, 0xdc, 0xa1, 0x30, 0x42, 0xf7,
0x3f, 0xbb, 0x52, 0x99, 0x28, 0x55, 0x62, 0xe2, 0xc6, 0x91, 0x11, 0x77, 0x62, 0x41, 0xb2, 0xc8,
0xc4, 0xc3, 0x7f, 0x97, 0x93, 0xa4, 0xda, 0x68, 0xbb, 0x56, 0x86, 0x9c, 0xbf, 0x5e, 0xa7, 0x74,
0x38, 0xd3, 0xb7, 0xaf, 0x8f, 0x01, 0x02, 0x23, 0x8c, 0xcc, 0x8c, 0x1c, 0x64, 0x76, 0x0d, 0xb6,
0x62, 0xa9, 0x7a, 0x53, 0x31, 0xb9, 0x8a, 0xf6, 0xd1, 0x21, 0x7a, 0xb3, 0xdd, 0x7d, 0x1c, 0x4b,
0x75, 0x7e, 0xf7, 0x2e, 0xa0, 0x98, 0x95, 0xb0, 0x52, 0x42, 0x31, 0x2b, 0x60, 0x7d, 0x0c, 0x5b,
0xc5, 0x11, 0x5e, 0x27, 0x91, 0xfd, 0x02, 0x9e, 0x9f, 0x7b, 0xed, 0x33, 0xda, 0x0b, 0x2f, 0x7c,
0xda, 0x3b, 0xe3, 0x81, 0x4f, 0x9b, 0xec, 0x98, 0xd1, 0x16, 0xb6, 0xec, 0x1d, 0xc0, 0xf7, 0x18,
0xe3, 0xe1, 0xfb, 0x77, 0x18, 0xd9, 0xbb, 0xf0, 0xf4, 0x7e, 0xe2, 0x8f, 0x5c, 0xf9, 0x4f, 0x0e,
0xc2, 0x2e, 0xe3, 0x27, 0xb8, 0x5a, 0xbf, 0x86, 0x6d, 0xaa, 0x06, 0x7a, 0x28, 0xd5, 0xa8, 0xe8,
0x7b, 0x05, 0x07, 0x94, 0x37, 0x3b, 0x2d, 0xc6, 0x4f, 0x36, 0x55, 0xee, 0xc1, 0xb3, 0x55, 0xec,
0xb7, 0x3d, 0xc6, 0x31, 0x5a, 0x07, 0x2d, 0xda, 0x0e, 0x3d, 0x5c, 0xb1, 0xf7, 0x61, 0x67, 0x15,
0x34, 0x58, 0x78, 0xea, 0xf9, 0xb8, 0x5a, 0xff, 0x8a, 0xe0, 0x49, 0x53, 0xc7, 0x49, 0x1a, 0x65,
0x99, 0xd4, 0xaa, 0xa8, 0x3f, 0x84, 0x97, 0xcd, 0xce, 0xa9, 0xdf, 0xa5, 0x41, 0xc0, 0x3a, 0x7c,
0xd3, 0x0f, 0x0e, 0x60, 0x77, 0xcd, 0xc1, 0x3b, 0x9c, 0x62, 0x64, 0xd7, 0x60, 0x6f, 0x0d, 0x05,
0xdc, 0xf3, 0xfd, 0x0b, 0x5c, 0xd9, 0x98, 0xbb, 0x0c, 0xc2, 0x16, 0xae, 0x36, 0x66, 0xf3, 0x05,
0xb1, 0x6e, 0x16, 0xc4, 0xba, 0x5d, 0x10, 0xf4, 0x25, 0x27, 0xe8, 0x7b, 0x4e, 0xd0, 0x8f, 0x9c,
0xa0, 0x79, 0x4e, 0xd0, 0xcf, 0x9c, 0xa0, 0x5f, 0x39, 0xb1, 0x6e, 0x73, 0x82, 0xbe, 0x2d, 0x89,
0x35, 0x5f, 0x12, 0xeb, 0x66, 0x49, 0xac, 0xcb, 0xc6, 0x48, 0x9a, 0x4f, 0x57, 0x7d, 0x67, 0xa0,
0x63, 0x77, 0x94, 0x8a, 0x8f, 0x42, 0x09, 0x77, 0xa2, 0xc7, 0xd2, 0x9d, 0x1e, 0xb9, 0x0f, 0xdc,
0x57, 0xff, 0x51, 0x31, 0xab, 0xa3, 0xdf, 0x01, 0x00, 0x00, 0xff, 0xff, 0x79, 0x22, 0x11, 0xce,
0x91, 0x02, 0x00, 0x00,
}
func (x ValueType) String() string {
@@ -135,3 +237,377 @@ func (x EncodingType) String() string {
}
return strconv.Itoa(int(x))
}
func (x CompressionType) String() string {
s, ok := CompressionType_name[int32(x)]
if ok {
return s
}
return strconv.Itoa(int(x))
}
func (this *Statistics) Equal(that interface{}) bool {
if that == nil {
return this == nil
}
that1, ok := that.(*Statistics)
if !ok {
that2, ok := that.(Statistics)
if ok {
that1 = &that2
} else {
return false
}
}
if that1 == nil {
return this == nil
} else if this == nil {
return false
}
if !bytes.Equal(this.MinValue, that1.MinValue) {
return false
}
if !bytes.Equal(this.MaxValue, that1.MaxValue) {
return false
}
return true
}
func (this *Statistics) GoString() string {
if this == nil {
return "nil"
}
s := make([]string, 0, 6)
s = append(s, "&datasetmd.Statistics{")
s = append(s, "MinValue: "+fmt.Sprintf("%#v", this.MinValue)+",\n")
s = append(s, "MaxValue: "+fmt.Sprintf("%#v", this.MaxValue)+",\n")
s = append(s, "}")
return strings.Join(s, "")
}
func valueToGoStringDatasetmd(v interface{}, typ string) string {
rv := reflect.ValueOf(v)
if rv.IsNil() {
return "nil"
}
pv := reflect.Indirect(rv).Interface()
return fmt.Sprintf("func(v %v) *%v { return &v } ( %#v )", typ, typ, pv)
}
func (m *Statistics) Marshal() (dAtA []byte, err error) {
size := m.Size()
dAtA = make([]byte, size)
n, err := m.MarshalToSizedBuffer(dAtA[:size])
if err != nil {
return nil, err
}
return dAtA[:n], nil
}
func (m *Statistics) MarshalTo(dAtA []byte) (int, error) {
size := m.Size()
return m.MarshalToSizedBuffer(dAtA[:size])
}
func (m *Statistics) MarshalToSizedBuffer(dAtA []byte) (int, error) {
i := len(dAtA)
_ = i
var l int
_ = l
if len(m.MaxValue) > 0 {
i -= len(m.MaxValue)
copy(dAtA[i:], m.MaxValue)
i = encodeVarintDatasetmd(dAtA, i, uint64(len(m.MaxValue)))
i--
dAtA[i] = 0x12
}
if len(m.MinValue) > 0 {
i -= len(m.MinValue)
copy(dAtA[i:], m.MinValue)
i = encodeVarintDatasetmd(dAtA, i, uint64(len(m.MinValue)))
i--
dAtA[i] = 0xa
}
return len(dAtA) - i, nil
}
func encodeVarintDatasetmd(dAtA []byte, offset int, v uint64) int {
offset -= sovDatasetmd(v)
base := offset
for v >= 1<<7 {
dAtA[offset] = uint8(v&0x7f | 0x80)
v >>= 7
offset++
}
dAtA[offset] = uint8(v)
return base
}
func (m *Statistics) Size() (n int) {
if m == nil {
return 0
}
var l int
_ = l
l = len(m.MinValue)
if l > 0 {
n += 1 + l + sovDatasetmd(uint64(l))
}
l = len(m.MaxValue)
if l > 0 {
n += 1 + l + sovDatasetmd(uint64(l))
}
return n
}
func sovDatasetmd(x uint64) (n int) {
return (math_bits.Len64(x|1) + 6) / 7
}
func sozDatasetmd(x uint64) (n int) {
return sovDatasetmd(uint64((x << 1) ^ uint64((int64(x) >> 63))))
}
func (this *Statistics) String() string {
if this == nil {
return "nil"
}
s := strings.Join([]string{`&Statistics{`,
`MinValue:` + fmt.Sprintf("%v", this.MinValue) + `,`,
`MaxValue:` + fmt.Sprintf("%v", this.MaxValue) + `,`,
`}`,
}, "")
return s
}
func valueToStringDatasetmd(v interface{}) string {
rv := reflect.ValueOf(v)
if rv.IsNil() {
return "nil"
}
pv := reflect.Indirect(rv).Interface()
return fmt.Sprintf("*%v", pv)
}
func (m *Statistics) Unmarshal(dAtA []byte) error {
l := len(dAtA)
iNdEx := 0
for iNdEx < l {
preIndex := iNdEx
var wire uint64
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return ErrIntOverflowDatasetmd
}
if iNdEx >= l {
return io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
wire |= uint64(b&0x7F) << shift
if b < 0x80 {
break
}
}
fieldNum := int32(wire >> 3)
wireType := int(wire & 0x7)
if wireType == 4 {
return fmt.Errorf("proto: Statistics: wiretype end group for non-group")
}
if fieldNum <= 0 {
return fmt.Errorf("proto: Statistics: illegal tag %d (wire type %d)", fieldNum, wire)
}
switch fieldNum {
case 1:
if wireType != 2 {
return fmt.Errorf("proto: wrong wireType = %d for field MinValue", wireType)
}
var byteLen int
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return ErrIntOverflowDatasetmd
}
if iNdEx >= l {
return io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
byteLen |= int(b&0x7F) << shift
if b < 0x80 {
break
}
}
if byteLen < 0 {
return ErrInvalidLengthDatasetmd
}
postIndex := iNdEx + byteLen
if postIndex < 0 {
return ErrInvalidLengthDatasetmd
}
if postIndex > l {
return io.ErrUnexpectedEOF
}
m.MinValue = append(m.MinValue[:0], dAtA[iNdEx:postIndex]...)
if m.MinValue == nil {
m.MinValue = []byte{}
}
iNdEx = postIndex
case 2:
if wireType != 2 {
return fmt.Errorf("proto: wrong wireType = %d for field MaxValue", wireType)
}
var byteLen int
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return ErrIntOverflowDatasetmd
}
if iNdEx >= l {
return io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
byteLen |= int(b&0x7F) << shift
if b < 0x80 {
break
}
}
if byteLen < 0 {
return ErrInvalidLengthDatasetmd
}
postIndex := iNdEx + byteLen
if postIndex < 0 {
return ErrInvalidLengthDatasetmd
}
if postIndex > l {
return io.ErrUnexpectedEOF
}
m.MaxValue = append(m.MaxValue[:0], dAtA[iNdEx:postIndex]...)
if m.MaxValue == nil {
m.MaxValue = []byte{}
}
iNdEx = postIndex
default:
iNdEx = preIndex
skippy, err := skipDatasetmd(dAtA[iNdEx:])
if err != nil {
return err
}
if skippy < 0 {
return ErrInvalidLengthDatasetmd
}
if (iNdEx + skippy) < 0 {
return ErrInvalidLengthDatasetmd
}
if (iNdEx + skippy) > l {
return io.ErrUnexpectedEOF
}
iNdEx += skippy
}
}
if iNdEx > l {
return io.ErrUnexpectedEOF
}
return nil
}
func skipDatasetmd(dAtA []byte) (n int, err error) {
l := len(dAtA)
iNdEx := 0
for iNdEx < l {
var wire uint64
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return 0, ErrIntOverflowDatasetmd
}
if iNdEx >= l {
return 0, io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
wire |= (uint64(b) & 0x7F) << shift
if b < 0x80 {
break
}
}
wireType := int(wire & 0x7)
switch wireType {
case 0:
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return 0, ErrIntOverflowDatasetmd
}
if iNdEx >= l {
return 0, io.ErrUnexpectedEOF
}
iNdEx++
if dAtA[iNdEx-1] < 0x80 {
break
}
}
return iNdEx, nil
case 1:
iNdEx += 8
return iNdEx, nil
case 2:
var length int
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return 0, ErrIntOverflowDatasetmd
}
if iNdEx >= l {
return 0, io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
length |= (int(b) & 0x7F) << shift
if b < 0x80 {
break
}
}
if length < 0 {
return 0, ErrInvalidLengthDatasetmd
}
iNdEx += length
if iNdEx < 0 {
return 0, ErrInvalidLengthDatasetmd
}
return iNdEx, nil
case 3:
for {
var innerWire uint64
var start int = iNdEx
for shift := uint(0); ; shift += 7 {
if shift >= 64 {
return 0, ErrIntOverflowDatasetmd
}
if iNdEx >= l {
return 0, io.ErrUnexpectedEOF
}
b := dAtA[iNdEx]
iNdEx++
innerWire |= (uint64(b) & 0x7F) << shift
if b < 0x80 {
break
}
}
innerWireType := int(innerWire & 0x7)
if innerWireType == 4 {
break
}
next, err := skipDatasetmd(dAtA[start:])
if err != nil {
return 0, err
}
iNdEx = start + next
if iNdEx < 0 {
return 0, ErrInvalidLengthDatasetmd
}
}
return iNdEx, nil
case 4:
return iNdEx, nil
case 5:
iNdEx += 4
return iNdEx, nil
default:
return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
}
}
panic("unreachable")
}
var (
ErrInvalidLengthDatasetmd = fmt.Errorf("proto: negative length found during unmarshaling")
ErrIntOverflowDatasetmd = fmt.Errorf("proto: integer overflow")
)

pkg/dataobj/internal/metadata/datasetmd/datasetmd.proto:
@@ -37,3 +37,29 @@ enum EncodingType {
// integers using a combination of run-length encoding and bitpacking.
ENCODING_TYPE_BITMAP = 3;
}
// CompressionType represents the valid compression types that can be used for
// compressing values in a page.
enum CompressionType {
// Invalid compression type.
COMPRESSION_TYPE_UNSPECIFIED = 0;
// No compression.
COMPRESSION_TYPE_NONE = 1;
// Snappy compression.
COMPRESSION_TYPE_SNAPPY = 2;
// Zstd compression.
COMPRESSION_TYPE_ZSTD = 3;
}
// Statistics about a column or a page. All statistics are optional and are
// conditionally set depending on the column type.
message Statistics {
// Minimum value.
bytes min_value = 1;
// Maximum value.
bytes max_value = 2;
}
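
The generated Go type for this message backs the optional Statistics fields
on ColumnInfo and PageInfo above. Min/max are raw bytes whose interpretation
depends on the column's value type; the values here are illustrative:

stats := &datasetmd.Statistics{
    MinValue: []byte("apple"), // encoded minimum value in the column/page
    MaxValue: []byte("zebra"), // encoded maximum value in the column/page
}
_ = stats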

pkg/dataobj/internal/result/result.go:
@@ -0,0 +1,101 @@
// Package result provides utilities for dealing with iterators that can fail
// during iteration.
//
// Result is useful to make it harder for callers to ignore errors. Using
// iter.Seq2[V, error] can make it easy to accidentally ignore errors:
//
// func myIter() iter.Seq2[V, error] { ... }
//
// func main() {
// for v := range myIter() { /* errors are ignored! */ }
// }
package result
import (
"errors"
"iter"
)
// Result is a type used for representing a result from an operation that can
// fail.
type Result[V any] struct {
value V // Valid only if err is nil.
err error
}
// Value returns a successful result with the given value.
func Value[V any](v V) Result[V] {
return Result[V]{value: v}
}
// Error returns a failed result with the given error.
func Error[V any](err error) Result[V] {
return Result[V]{err: err}
}
// Value returns r's value and error.
func (r Result[V]) Value() (V, error) {
return r.value, r.err
}
// MustValue returns r's value. If r is an error, MustValue panics.
func (r Result[V]) MustValue() V {
if r.err != nil {
panic(r.err)
}
return r.value
}
// Err returns r's error, if any.
func (r Result[V]) Err() error {
return r.err
}
// Seq is an iterator over sequences of result values. When called as
// seq(yield), seq calls yield(r) for each value r in the sequence, stopping
// early if yield returns false.
//
// See the [iter] package for more information on iterators.
type Seq[V any] func(yield func(Result[V]) bool)
// Iter produces a new Seq[V] from a given function that can fail. Values
// passed to yield are wrapped in a call to [Value], while a non-nil error is
// wrapped in a call to [Error].
//
// Iter makes it easier to write failable iterators and removes the need to
// manually wrap values and errors into a [Result].
func Iter[V any](seq func(yield func(V) bool) error) Seq[V] {
return func(yield func(Result[V]) bool) {
err := seq(func(v V) bool { return yield(Value(v)) })
if err != nil {
yield(Error[V](err))
}
}
}
// Pull converts the "push-style" Result iterator sequence seq into a
// "pull-style" iterator accessed by the two functions next and stop.
//
// Pull is a wrapper around [iter.Pull].
func Pull[V any](seq Seq[V]) (next func() (Result[V], bool), stop func()) {
iseq := iter.Seq[Result[V]](seq)
return iter.Pull(iseq)
}
// Collect collects values from seq into a new slice and returns it. Any errors
// from seq are joined and returned as the second value.
func Collect[V any](seq Seq[V]) ([]V, error) {
var (
vals []V
errs []error
)
for res := range seq {
val, err := res.Value()
if err != nil {
errs = append(errs, err)
} else {
vals = append(vals, val)
}
}
return vals, errors.Join(errs...)
}
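
Putting it together, a failable iterator and its consumption might look like
this sketch:

seq := result.Iter(func(yield func(int) bool) error {
    for i := 0; i < 3; i++ {
        if !yield(i) {
            return nil // consumer stopped early
        }
    }
    return nil // a non-nil return would surface as a final error Result
})

vals, err := result.Collect(seq) // vals == []int{0, 1, 2}, err == nil
_, _ = vals, err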

pkg/dataobj/internal/streamio/varint.go:
@@ -3,6 +3,7 @@ package streamio
import (
"encoding/binary"
"io"
"math/bits"
)
// [binary] does not have an implementation to write varints directly
@@ -10,6 +11,23 @@ import (
// encoders to stream values, we provide equivalent implementations of
// [binary.AppendUvarint] and [binary.AppendVarint] which accept a ByteWriter.
// VarintSize returns the number of bytes needed to encode x.
func VarintSize(x int64) int {
ux := uint64(x) << 1
if x < 0 {
ux = ^ux
}
return UvarintSize(ux)
}
// UvarintSize returns the number of bytes needed to encode x.
func UvarintSize(x uint64) int {
if x == 0 {
return 1
}
return 1 + (63-bits.LeadingZeros64(x))/7
}
// WriteVarint writes an encoded signed integer to w.
func WriteVarint(w io.ByteWriter, x int64) error {
// Like [binary.AppendVarint], we use zig-zag encoding so small negative
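
The size helpers can be sanity-checked against the standard library; a small
sketch:

for _, x := range []uint64{0, 1, 127, 128, 16383, 16384} {
    got := streamio.UvarintSize(x)
    want := len(binary.AppendUvarint(nil, x))
    fmt.Println(x, got, want) // sizes: 1, 1, 1, 2, 2, 3
}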
