package streams

import (
	"errors"
	"fmt"
	"sort"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/prometheus/model/labels"
	"go.uber.org/atomic"

	"github.com/grafana/loki/v3/pkg/dataobj"
	"github.com/grafana/loki/v3/pkg/dataobj/internal/dataset"
	"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd"
	"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/streamsmd"
	"github.com/grafana/loki/v3/pkg/dataobj/internal/streamio"
	"github.com/grafana/loki/v3/pkg/dataobj/internal/util/sliceclear"
)

// A Stream is an individual stream within a data object.
type Stream struct {
	// ID to uniquely represent a stream in a data object. Valid IDs start at 1.
	// IDs are used to track streams across multiple sections in the same data
	// object.
	ID int64

	// MinTimestamp and MaxTimestamp denote the range of timestamps across all
	// entries in the stream.
	MinTimestamp, MaxTimestamp time.Time

	// Uncompressed size of the log lines and structured metadata values in the
	// stream.
	UncompressedSize int64

	// Labels of the stream.
	Labels labels.Labels

	// Total number of log records in the stream.
	Rows int
}

// Reset zeroes all values in the stream struct so it can be reused.
func (s *Stream) Reset() {
	s.ID = 0
	s.Labels = nil
	s.MinTimestamp = time.Time{}
	s.MaxTimestamp = time.Time{}
	s.UncompressedSize = 0
	s.Rows = 0
}

var streamPool = sync.Pool{
	New: func() interface{} { return &Stream{} },
}

// Builder builds a streams section.
type Builder struct {
	metrics  *Metrics
	pageSize int
	lastID   atomic.Int64
	lookup   map[uint64][]*Stream

	// Size of all label values across all streams; used for
	// [Builder.EstimatedSize]. Reset by [Builder.Reset].
	currentLabelsSize int

	globalMinTimestamp time.Time // Minimum timestamp across all streams, used for metrics.
	globalMaxTimestamp time.Time // Maximum timestamp across all streams, used for metrics.

	// ordered is used for consistently iterating over the list of streams. It
	// contains streams added in append order.
	ordered []*Stream
}

// NewBuilder creates a new streams section builder. The pageSize argument
// specifies how large pages should be.
func NewBuilder(metrics *Metrics, pageSize int) *Builder {
	if metrics == nil {
		metrics = NewMetrics()
	}

	return &Builder{
		metrics:  metrics,
		pageSize: pageSize,

		lookup:  make(map[uint64][]*Stream, 1024),
		ordered: make([]*Stream, 0, 1024),
	}
}

// Type returns the [dataobj.SectionType] of the streams builder.
func (b *Builder) Type() dataobj.SectionType { return sectionType }

// TimeRange returns the minimum and maximum timestamp across all streams.
func (b *Builder) TimeRange() (time.Time, time.Time) {
	return b.globalMinTimestamp, b.globalMaxTimestamp
}
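
// Usage sketch (illustrative only, not part of the package API): the page
// size and label values below are arbitrary. Recording two entries for the
// same label set reuses the same stream and ID.
//
//	b := NewBuilder(nil, 2<<20) // nil metrics falls back to NewMetrics()
//
//	lbls := labels.FromStrings("app", "api", "env", "prod")
//	id1 := b.Record(lbls, time.Unix(10, 0), 128) // first record; stream gets ID 1
//	id2 := b.Record(lbls, time.Unix(20, 0), 256) // same label set, same stream
//	// id1 == id2 == 1; the stream now has Rows == 2, UncompressedSize == 384,
//	// and a MinTimestamp/MaxTimestamp range of [10s, 20s].
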
// Record a stream record within the section. The provided timestamp is used to
// track the minimum and maximum timestamp of a stream. The number of calls to
// Record is used to track the number of rows for a stream. The recordSize is
// used to track the uncompressed size of the stream.
//
// The stream ID of the recorded stream is returned.
func (b *Builder) Record(streamLabels labels.Labels, ts time.Time, recordSize int64) int64 {
	ts = ts.UTC()
	b.observeRecord(ts)

	stream := b.getOrAddStream(streamLabels)
	if stream.MinTimestamp.IsZero() || ts.Before(stream.MinTimestamp) {
		stream.MinTimestamp = ts
	}
	if stream.MaxTimestamp.IsZero() || ts.After(stream.MaxTimestamp) {
		stream.MaxTimestamp = ts
	}
	stream.Rows++
	stream.UncompressedSize += recordSize

	return stream.ID
}

func (b *Builder) observeRecord(ts time.Time) {
	b.metrics.recordsTotal.Inc()

	if ts.Before(b.globalMinTimestamp) || b.globalMinTimestamp.IsZero() {
		b.globalMinTimestamp = ts
		b.metrics.minTimestamp.Set(float64(ts.Unix()))
	}
	if ts.After(b.globalMaxTimestamp) || b.globalMaxTimestamp.IsZero() {
		b.globalMaxTimestamp = ts
		b.metrics.maxTimestamp.Set(float64(ts.Unix()))
	}
}

// EstimatedSize returns the estimated size of the streams section in bytes.
func (b *Builder) EstimatedSize() int {
	// Since columns are only built when encoding, we can't use
	// [dataset.ColumnBuilder.EstimatedSize] here.
	//
	// Instead, we use a basic heuristic, estimating delta encoding and
	// compression:
	//
	// 1. Assume an ID delta of 1.
	// 2. Assume a timestamp delta of 1s.
	// 3. Assume a row count delta of 500.
	// 4. Assume a (conservative) 2x compression ratio for all label values.

	var (
		idDeltaSize        = streamio.VarintSize(1)
		timestampDeltaSize = streamio.VarintSize(int64(time.Second))
		rowDeltaSize       = streamio.VarintSize(500)
	)

	var sizeEstimate int

	sizeEstimate += len(b.ordered) * idDeltaSize        // ID
	sizeEstimate += len(b.ordered) * timestampDeltaSize // Min timestamp
	sizeEstimate += len(b.ordered) * timestampDeltaSize // Max timestamp
	sizeEstimate += len(b.ordered) * rowDeltaSize       // Rows
	sizeEstimate += b.currentLabelsSize / 2             // All labels (2x compression ratio)

	return sizeEstimate
}

func (b *Builder) getOrAddStream(streamLabels labels.Labels) *Stream {
	hash := streamLabels.Hash()
	matches, ok := b.lookup[hash]
	if !ok {
		return b.addStream(hash, streamLabels)
	}

	for _, stream := range matches {
		if labels.Equal(stream.Labels, streamLabels) {
			return stream
		}
	}

	return b.addStream(hash, streamLabels)
}

func (b *Builder) addStream(hash uint64, streamLabels labels.Labels) *Stream {
	// Sort streamLabels prior to adding to ensure consistent column ordering.
	sort.Sort(streamLabels)

	for _, lbl := range streamLabels {
		b.currentLabelsSize += len(lbl.Value)
	}

	newStream := streamPool.Get().(*Stream)
	newStream.Reset()
	newStream.ID = b.lastID.Add(1)
	newStream.Labels = streamLabels

	b.lookup[hash] = append(b.lookup[hash], newStream)
	b.ordered = append(b.ordered, newStream)
	b.metrics.streamCount.Inc()
	return newStream
}

// StreamID returns the stream ID for the provided streamLabels. If the stream
// has not been recorded, StreamID returns 0.
func (b *Builder) StreamID(streamLabels labels.Labels) int64 {
	hash := streamLabels.Hash()
	matches, ok := b.lookup[hash]
	if !ok {
		return 0
	}

	for _, stream := range matches {
		if labels.Equal(stream.Labels, streamLabels) {
			return stream.ID
		}
	}

	return 0
}
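
// A possible flush policy built on EstimatedSize (illustrative; the 16 MiB
// target, streamLabels, entryTS, entrySize, and w are caller-side assumptions,
// not values defined by this package):
//
//	const targetSectionSize = 16 << 20 // 16 MiB
//
//	b.Record(streamLabels, entryTS, entrySize)
//	if b.EstimatedSize() >= targetSectionSize {
//	    if _, err := b.Flush(w); err != nil {
//	        // handle the error; the builder is only reset on success
//	    }
//	}
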
// Flush flushes the streams section to the provided writer.
//
// After successful encoding, b is reset to a fresh state and can be reused.
func (b *Builder) Flush(w dataobj.SectionWriter) (n int64, err error) {
	timer := prometheus.NewTimer(b.metrics.encodeSeconds)
	defer timer.ObserveDuration()

	var streamsEnc encoder
	defer streamsEnc.Reset()
	if err := b.encodeTo(&streamsEnc); err != nil {
		return 0, fmt.Errorf("building encoder: %w", err)
	}

	n, err = streamsEnc.Flush(w)
	if err == nil {
		b.Reset()
	}
	return n, err
}

func (b *Builder) encodeTo(enc *encoder) error {
	// TODO(rfratto): handle one section becoming too large. This can happen
	// when the number of columns is very wide. There are two approaches to
	// handle this:
	//
	// 1. Split streams into multiple sections.
	// 2. Move some columns into an aggregated column which holds multiple label
	//    keys and values.

	idBuilder, err := numberColumnBuilder(b.pageSize)
	if err != nil {
		return fmt.Errorf("creating ID column: %w", err)
	}
	minTimestampBuilder, err := numberColumnBuilder(b.pageSize)
	if err != nil {
		return fmt.Errorf("creating minimum timestamp column: %w", err)
	}
	maxTimestampBuilder, err := numberColumnBuilder(b.pageSize)
	if err != nil {
		return fmt.Errorf("creating maximum timestamp column: %w", err)
	}
	rowsCountBuilder, err := numberColumnBuilder(b.pageSize)
	if err != nil {
		return fmt.Errorf("creating rows column: %w", err)
	}
	uncompressedSizeBuilder, err := numberColumnBuilder(b.pageSize)
	if err != nil {
		return fmt.Errorf("creating uncompressed size column: %w", err)
	}

	var (
		labelBuilders      []*dataset.ColumnBuilder
		labelBuilderLookup = map[string]int{} // Name to index
	)

	getLabelColumn := func(name string) (*dataset.ColumnBuilder, error) {
		idx, ok := labelBuilderLookup[name]
		if ok {
			return labelBuilders[idx], nil
		}

		builder, err := dataset.NewColumnBuilder(name, dataset.BuilderOptions{
			PageSizeHint: b.pageSize,
			Value:        datasetmd.VALUE_TYPE_BYTE_ARRAY,
			Encoding:     datasetmd.ENCODING_TYPE_PLAIN,
			Compression:  datasetmd.COMPRESSION_TYPE_ZSTD,
			Statistics: dataset.StatisticsOptions{
				StoreRangeStats: true,
			},
		})
		if err != nil {
			return nil, fmt.Errorf("creating label column: %w", err)
		}

		labelBuilders = append(labelBuilders, builder)
		labelBuilderLookup[name] = len(labelBuilders) - 1
		return builder, nil
	}

	// Populate our column builders.
	for i, stream := range b.ordered {
		// Append only fails if the rows are out-of-order, which can't happen here.
		_ = idBuilder.Append(i, dataset.Int64Value(stream.ID))
		_ = minTimestampBuilder.Append(i, dataset.Int64Value(stream.MinTimestamp.UnixNano()))
		_ = maxTimestampBuilder.Append(i, dataset.Int64Value(stream.MaxTimestamp.UnixNano()))
		_ = rowsCountBuilder.Append(i, dataset.Int64Value(int64(stream.Rows)))
		_ = uncompressedSizeBuilder.Append(i, dataset.Int64Value(stream.UncompressedSize))

		for _, label := range stream.Labels {
			builder, err := getLabelColumn(label.Name)
			if err != nil {
				return fmt.Errorf("getting label column: %w", err)
			}
			_ = builder.Append(i, dataset.ByteArrayValue([]byte(label.Value)))
		}
	}

	// Encode our builders to sections. We ignore errors after enc.OpenColumn
	// (which may fail due to a caller error) since we guarantee correct usage
	// of the encoding API.
	{
		var errs []error
		errs = append(errs, encodeColumn(enc, streamsmd.COLUMN_TYPE_STREAM_ID, idBuilder))
		errs = append(errs, encodeColumn(enc, streamsmd.COLUMN_TYPE_MIN_TIMESTAMP, minTimestampBuilder))
		errs = append(errs, encodeColumn(enc, streamsmd.COLUMN_TYPE_MAX_TIMESTAMP, maxTimestampBuilder))
		errs = append(errs, encodeColumn(enc, streamsmd.COLUMN_TYPE_ROWS, rowsCountBuilder))
		errs = append(errs, encodeColumn(enc, streamsmd.COLUMN_TYPE_UNCOMPRESSED_SIZE, uncompressedSizeBuilder))
		if err := errors.Join(errs...); err != nil {
			return fmt.Errorf("encoding columns: %w", err)
		}
	}

	for _, labelBuilder := range labelBuilders {
		// For consistency we'll make sure each label builder has the same number
		// of rows as the other columns (which is the number of streams).
		labelBuilder.Backfill(len(b.ordered))

		err := encodeColumn(enc, streamsmd.COLUMN_TYPE_LABEL, labelBuilder)
		if err != nil {
			return fmt.Errorf("encoding label column: %w", err)
		}
	}

	return nil
}
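
// To illustrate the layout encodeTo produces: every distinct label name gets
// its own LABEL column, and Backfill pads rows for streams that don't carry
// that label so all columns have one row per stream. A hypothetical
// two-stream example (metadata columns other than STREAM_ID omitted):
//
//	row  STREAM_ID  LABEL("app")  LABEL("env")
//	0    1          "api"         "prod"
//	1    2          "worker"      (backfilled; stream 2 has no "env" label)
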
func numberColumnBuilder(pageSize int) (*dataset.ColumnBuilder, error) {
	return dataset.NewColumnBuilder("", dataset.BuilderOptions{
		PageSizeHint: pageSize,
		Value:        datasetmd.VALUE_TYPE_INT64,
		Encoding:     datasetmd.ENCODING_TYPE_DELTA,
		Compression:  datasetmd.COMPRESSION_TYPE_NONE,
		Statistics: dataset.StatisticsOptions{
			StoreRangeStats: true,
		},
	})
}

func encodeColumn(enc *encoder, columnType streamsmd.ColumnType, builder *dataset.ColumnBuilder) error {
	column, err := builder.Flush()
	if err != nil {
		return fmt.Errorf("flushing %s column: %w", columnType, err)
	}

	columnEnc, err := enc.OpenColumn(columnType, &column.Info)
	if err != nil {
		return fmt.Errorf("opening %s column encoder: %w", columnType, err)
	}
	defer func() {
		// Discard on defer for safety. Discard returns an error if we already
		// committed successfully, which we ignore.
		_ = columnEnc.Discard()
	}()

	for _, page := range column.Pages {
		err := columnEnc.AppendPage(page)
		if err != nil {
			return fmt.Errorf("appending %s page: %w", columnType, err)
		}
	}

	return columnEnc.Commit()
}

// Reset resets all state, allowing the Builder to be reused.
func (b *Builder) Reset() {
	b.lastID.Store(0)
	for _, stream := range b.ordered {
		streamPool.Put(stream)
	}
	clear(b.lookup)
	b.ordered = sliceclear.Clear(b.ordered)
	b.currentLabelsSize = 0
	b.globalMinTimestamp = time.Time{}
	b.globalMaxTimestamp = time.Time{}

	b.metrics.streamCount.Set(0)
	b.metrics.minTimestamp.Set(0)
	b.metrics.maxTimestamp.Set(0)
}
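
// Because a successful Flush resets the builder, a single Builder can be
// reused to emit streams sections for consecutive data objects. Illustrative
// sketch (lbls, t0, t1, w1, and w2 are assumed caller-side values):
//
//	b := NewBuilder(nil, 2<<20)
//	b.Record(lbls, t0, 100)
//	if _, err := b.Flush(w1); err != nil { /* handle */ } // reset on success
//
//	b.Record(lbls, t1, 200)
//	if _, err := b.Flush(w2); err != nil { /* handle */ } // stream IDs restart at 1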