perf(blooms): Use jumpmap for client side sharding of the filter requests (#12470)

**What this PR does / why we need it**:

Use the same client-side sharding mechanism for bloom gateway filter requests that we use for client side sharding in the memcached client.

This uses DNS discovery to resolve a list of addresses uses as buckets for the jumphash algorithm to shard GroupedChunkRefs across bloom gateway servers.
The ring component is still part of the bloom gateway, but it is not used any more.


**Special notes for your reviewer**:

https://arxiv.org/pdf/1406.2294.pdf

Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
Co-authored-by: Owen Diehl <ow.diehl@gmail.com>
pull/12484/head
Christian Haudum 1 year ago committed by GitHub
parent 3ece2ea6c4
commit 58c8c61905
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 18
      docs/sources/configure/_index.md
  2. 276
      pkg/bloomgateway/client.go
  3. 106
      pkg/bloomgateway/client_pool.go
  4. 35
      pkg/bloomgateway/client_pool_test.go
  5. 298
      pkg/bloomgateway/client_test.go
  6. 16
      pkg/bloomgateway/metrics.go
  7. 2
      pkg/loki/modules.go
  8. 3
      pkg/storage/chunk/cache/memcached_client.go
  9. 11
      pkg/util/discovery/dns.go
  10. 46
      pkg/util/jumphash/memcached_client_selector.go
  11. 8
      pkg/util/jumphash/memcached_client_selector_test.go

@ -2089,11 +2089,18 @@ ring:
client:
# Configures the behavior of the connection pool.
pool_config:
[client_cleanup_period: <duration>]
# How frequently to clean up clients for servers that have gone away or are
# unhealthy.
# CLI flag: -bloom-gateway-client.pool.check-interval
[check_interval: <duration> | default = 10s]
[health_check_ingesters: <boolean>]
# Run a health check on each server during periodic cleanup.
# CLI flag: -bloom-gateway-client.pool.enable-health-check
[enable_health_check: <boolean> | default = true]
[remote_timeout: <duration>]
# Timeout for the health check if health check is enabled.
# CLI flag: -bloom-gateway-client.pool.health-check-timeout
[health_check_timeout: <duration> | default = 1s]
# The grpc_client block configures the gRPC client used to communicate between
# two Loki components.
@ -2116,6 +2123,11 @@ client:
# CLI flag: -bloom-gateway-client.cache_results
[cache_results: <boolean> | default = false]
# Comma separated addresses list in DNS Service Discovery format:
# https://grafana.com/docs/mimir/latest/configure/about-dns-service-discovery/#supported-discovery-modes
# CLI flag: -bloom-gateway-client.addresses
[addresses: <string> | default = ""]
# Number of workers to use for filtering chunks concurrently.
# CLI flag: -bloom-gateway.worker-concurrency
[worker_concurrency: <int> | default = 4]

@ -6,70 +6,33 @@ import (
"fmt"
"io"
"math"
"math/rand"
"strings"
"sync"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/grafana/dskit/concurrency"
"github.com/grafana/dskit/grpcclient"
"github.com/grafana/dskit/instrument"
"github.com/grafana/dskit/ring"
ringclient "github.com/grafana/dskit/ring/client"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/common/model"
"golang.org/x/exp/slices"
"google.golang.org/grpc"
"google.golang.org/grpc/health/grpc_health_v1"
"github.com/grafana/loki/v3/pkg/bloomutils"
"github.com/grafana/loki/v3/pkg/distributor/clientpool"
"github.com/grafana/loki/v3/pkg/logproto"
"github.com/grafana/loki/v3/pkg/logqlmodel/stats"
"github.com/grafana/loki/v3/pkg/querier/plan"
"github.com/grafana/loki/v3/pkg/queue"
v1 "github.com/grafana/loki/v3/pkg/storage/bloom/v1"
"github.com/grafana/loki/v3/pkg/storage/chunk/cache"
"github.com/grafana/loki/v3/pkg/storage/chunk/cache/resultscache"
"github.com/grafana/loki/v3/pkg/util/constants"
util_log "github.com/grafana/loki/v3/pkg/util/log"
"github.com/grafana/loki/v3/pkg/util/discovery"
)
var (
// BlocksOwnerRead is the operation used to check the authoritative owners of a block
// (replicas included) that are available for queries (a bloom gateway is available for
// queries only when ACTIVE).
BlocksOwnerRead = ring.NewOp([]ring.InstanceState{ring.ACTIVE}, nil)
// groupedChunksRefPool pooling slice of logproto.GroupedChunkRefs [64, 128, 256, ..., 65536]
groupedChunksRefPool = queue.NewSlicePool[*logproto.GroupedChunkRefs](1<<6, 1<<16, 2)
// ringGetBuffersPool pooling for ringGetBuffers to avoid calling ring.MakeBuffersForGet() for each request
ringGetBuffersPool = sync.Pool{
New: func() interface{} {
descs, hosts, zones := ring.MakeBuffersForGet()
return &ringGetBuffers{
Descs: descs,
Hosts: hosts,
Zones: zones,
}
},
}
)
type ringGetBuffers struct {
Descs []ring.InstanceDesc
Hosts []string
Zones []string
}
func (buf *ringGetBuffers) Reset() {
buf.Descs = buf.Descs[:0]
buf.Hosts = buf.Hosts[:0]
buf.Zones = buf.Zones[:0]
}
// GRPCPool represents a pool of gRPC connections to different bloom gateway instances.
// Interfaces are inlined for simplicity to automatically satisfy interface functions.
type GRPCPool struct {
@ -98,19 +61,17 @@ func NewBloomGatewayGRPCPool(address string, opts []grpc.DialOption) (*GRPCPool,
type ClientConfig struct {
// PoolConfig defines the behavior of the gRPC connection pool used to communicate
// with the Bloom Gateway.
// It is defined at the distributors YAML section and reused here.
PoolConfig clientpool.PoolConfig `yaml:"pool_config,omitempty" doc:"description=Configures the behavior of the connection pool."`
PoolConfig PoolConfig `yaml:"pool_config,omitempty" doc:"description=Configures the behavior of the connection pool."`
// GRPCClientConfig configures the gRPC connection between the Bloom Gateway client and the server.
GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config"`
// Ring is the Bloom Gateway ring used to find the appropriate Bloom Gateway instance
// this client should talk to.
Ring ring.ReadRing `yaml:"-"`
// Cache configures the cache used to store the results of the Bloom Gateway server.
Cache CacheConfig `yaml:"results_cache,omitempty"`
CacheResults bool `yaml:"cache_results"`
// Client sharding using DNS disvovery and jumphash
Addresses string `yaml:"addresses,omitempty"`
}
// RegisterFlags registers flags for the Bloom Gateway client configuration.
@ -122,7 +83,9 @@ func (i *ClientConfig) RegisterFlags(f *flag.FlagSet) {
func (i *ClientConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
i.GRPCClientConfig.RegisterFlagsWithPrefix(prefix+"grpc", f)
i.Cache.RegisterFlagsWithPrefix(prefix+"cache.", f)
i.PoolConfig.RegisterFlagsWithPrefix(prefix+"pool.", f)
f.BoolVar(&i.CacheResults, prefix+"cache_results", false, "Flag to control whether to cache bloom gateway client requests/responses.")
f.StringVar(&i.Addresses, prefix+"addresses", "", "Comma separated addresses list in DNS Service Discovery format: https://grafana.com/docs/mimir/latest/configure/about-dns-service-discovery/#supported-discovery-modes")
}
func (i *ClientConfig) Validate() error {
@ -130,12 +93,20 @@ func (i *ClientConfig) Validate() error {
return errors.Wrap(err, "grpc client config")
}
if err := i.PoolConfig.Validate(); err != nil {
return errors.Wrap(err, "pool config")
}
if i.CacheResults {
if err := i.Cache.Validate(); err != nil {
return errors.Wrap(err, "cache config")
}
}
if i.Addresses == "" {
return errors.New("addresses requires a list of comma separated strings in DNS service discovery format with at least one item")
}
return nil
}
@ -144,33 +115,25 @@ type Client interface {
}
type GatewayClient struct {
cfg ClientConfig
limits Limits
logger log.Logger
metrics *clientMetrics
pool *ringclient.Pool
ring ring.ReadRing
cfg ClientConfig
limits Limits
logger log.Logger
metrics *clientMetrics
pool *JumpHashClientPool
dnsProvider *discovery.DNS
}
func NewClient(
cfg ClientConfig,
readRing ring.ReadRing,
limits Limits,
registerer prometheus.Registerer,
logger log.Logger,
metricsNamespace string,
cacheGen resultscache.CacheGenNumberLoader,
retentionEnabled bool,
) (*GatewayClient, error) {
latency := promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
Namespace: constants.Loki,
Subsystem: "bloom_gateway",
Name: "request_duration_seconds",
Help: "Time (in seconds) spent serving requests when using the bloom gateway",
Buckets: instrument.DefBuckets,
}, []string{"operation", "status_code"})
dialOpts, err := cfg.GRPCClientConfig.DialOption(grpcclient.Instrument(latency))
metrics := newClientMetrics(registerer)
dialOpts, err := cfg.GRPCClientConfig.DialOption(grpcclient.Instrument(metrics.requestLatency))
if err != nil {
return nil, err
}
@ -206,29 +169,35 @@ func NewClient(
return pool, nil
}
dnsProvider := discovery.NewDNS(logger, cfg.PoolConfig.CheckInterval, cfg.Addresses, nil)
// Make an attempt to do one DNS lookup so we can start with addresses
dnsProvider.RunOnce()
clientPool := ringclient.NewPool(
"bloom-gateway",
ringclient.PoolConfig(cfg.PoolConfig),
func() ([]string, error) { return dnsProvider.Addresses(), nil },
ringclient.PoolAddrFunc(poolFactory),
metrics.clients,
logger,
)
pool := NewJumpHashClientPool(clientPool, dnsProvider, cfg.PoolConfig.CheckInterval, logger)
pool.Start()
return &GatewayClient{
cfg: cfg,
logger: logger,
limits: limits,
metrics: newClientMetrics(registerer),
pool: clientpool.NewPool("bloom-gateway", cfg.PoolConfig, cfg.Ring, ringclient.PoolAddrFunc(poolFactory), logger, metricsNamespace),
ring: readRing,
cfg: cfg,
logger: logger,
limits: limits,
metrics: metrics,
pool: pool,
dnsProvider: dnsProvider, // keep reference so we can stop it when the client is closed
}, nil
}
func JoinFunc[S ~[]E, E any](elems S, sep string, f func(e E) string) string {
res := make([]string, len(elems))
for i := range elems {
res[i] = f(elems[i])
}
return strings.Join(res, sep)
}
func shuffleAddrs(addrs []string) []string {
rand.Shuffle(len(addrs), func(i, j int) {
addrs[i], addrs[j] = addrs[j], addrs[i]
})
return addrs
func (c *GatewayClient) Close() {
c.pool.Stop()
c.dnsProvider.Stop()
}
// FilterChunkRefs implements Client
@ -237,19 +206,23 @@ func (c *GatewayClient) FilterChunks(ctx context.Context, tenant string, from, t
return groups, nil
}
subRing := GetShuffleShardingSubring(c.ring, tenant, c.limits)
rs, err := subRing.GetAllHealthy(BlocksOwnerRead)
if err != nil {
return nil, errors.Wrap(err, "bloom gateway get healthy instances")
clients := make(map[string][]*logproto.GroupedChunkRefs)
for _, g := range groups {
addr, err := c.pool.AddrForFingerprint(g.Fingerprint)
if err != nil {
return nil, errors.Wrap(err, "server address for fingerprint")
}
clients[addr] = append(clients[addr], g)
}
servers, err := replicationSetsWithBounds(subRing, rs.Instances)
if err != nil {
return nil, errors.Wrap(err, "bloom gateway get replication sets")
servers := make([]addrWithGroups, 0, len(clients))
for k, v := range clients {
servers = append(servers, addrWithGroups{
groups: v,
addr: k,
})
}
servers = partitionByReplicationSet(groups, servers)
if len(servers) > 0 {
// cache locality score (higher is better):
// `% keyspace / % instances`. Ideally converges to 1 (querying x% of keyspace requires x% of instances),
@ -258,23 +231,20 @@ func (c *GatewayClient) FilterChunks(ctx context.Context, tenant string, from, t
// overlap on instances to the left and right of the range.
firstFp, lastFp := groups[0].Fingerprint, groups[len(groups)-1].Fingerprint
pctKeyspace := float64(lastFp-firstFp) / float64(math.MaxUint64)
pctInstances := float64(len(servers)) / float64(len(rs.Instances))
pctInstances := float64(len(servers)) / float64(max(1, len(c.pool.Addrs())))
cacheLocalityScore := pctKeyspace / pctInstances
c.metrics.cacheLocalityScore.Observe(cacheLocalityScore)
}
results := make([][]*logproto.GroupedChunkRefs, len(servers))
count := 0
err = concurrency.ForEachJob(ctx, len(servers), len(servers), func(ctx context.Context, i int) error {
err := concurrency.ForEachJob(ctx, len(servers), len(servers), func(ctx context.Context, i int) error {
rs := servers[i]
// randomize order of addresses so we don't hotspot the first server in the list
addrs := shuffleAddrs(rs.rs.GetAddresses())
level.Info(c.logger).Log(
"msg", "do FilterChunkRefs for addresses",
"progress", fmt.Sprintf("%d/%d", i+1, len(servers)),
"bounds", JoinFunc(rs.ranges, ",", func(e v1.FingerprintBounds) string { return e.String() }),
"addrs", strings.Join(addrs, ","),
"addr", rs.addr,
"from", from.Time(),
"through", through.Time(),
"num_refs", len(rs.groups),
@ -282,7 +252,7 @@ func (c *GatewayClient) FilterChunks(ctx context.Context, tenant string, from, t
"plan_hash", plan.Hash(),
)
return c.doForAddrs(addrs, func(client logproto.BloomGatewayClient) error {
return c.doForAddrs([]string{rs.addr}, func(client logproto.BloomGatewayClient) error {
req := &logproto.FilterChunkRefRequest{
From: from,
Through: through,
@ -316,7 +286,6 @@ func flatten(input [][]*logproto.GroupedChunkRefs, n int) []*logproto.GroupedChu
// doForAddrs sequetially calls the provided callback function fn for each
// address in given slice addrs until the callback function does not return an
// error.
// TODO(owen-d): parallelism
func (c *GatewayClient) doForAddrs(addrs []string, fn func(logproto.BloomGatewayClient) error) error {
var err error
var poolClient ringclient.PoolClient
@ -337,116 +306,7 @@ func (c *GatewayClient) doForAddrs(addrs []string, fn func(logproto.BloomGateway
return err
}
func mapTokenRangeToFingerprintRange(r bloomutils.Range[uint32]) v1.FingerprintBounds {
minFp := uint64(r.Min) << 32
maxFp := uint64(r.Max) << 32
return v1.NewBounds(
model.Fingerprint(minFp),
model.Fingerprint(maxFp|math.MaxUint32),
)
}
type rsWithRanges struct {
rs ring.ReplicationSet
ranges []v1.FingerprintBounds
type addrWithGroups struct {
addr string
groups []*logproto.GroupedChunkRefs
}
func replicationSetsWithBounds(subRing ring.ReadRing, instances []ring.InstanceDesc) ([]rsWithRanges, error) {
bufDescs, bufHosts, bufZones := ring.MakeBuffersForGet()
servers := make([]rsWithRanges, 0, len(instances))
for _, inst := range instances {
tr, err := bloomutils.TokenRangesForInstance(inst.Id, instances)
if err != nil {
return nil, errors.Wrap(err, "bloom gateway get ring")
}
if len(tr) == 0 {
level.Warn(util_log.Logger).Log(
"subroutine", "replicationSetsWithBounds",
"msg", "instance has no token ranges - should not be possible",
"instance", inst.Id,
"n_instances", len(instances),
)
continue
}
// NB(owen-d): this will send requests to the wrong nodes if RF>1 since it only checks the
// first token when assigning replicasets
rs, err := subRing.Get(tr[0], BlocksOwnerRead, bufDescs, bufHosts, bufZones)
if err != nil {
return nil, errors.Wrap(err, "bloom gateway get ring")
}
bounds := make([]v1.FingerprintBounds, 0, len(tr)/2)
for i := 0; i < len(tr); i += 2 {
b := v1.NewBounds(
model.Fingerprint(uint64(tr[i])<<32),
model.Fingerprint(uint64(tr[i+1])<<32|math.MaxUint32),
)
bounds = append(bounds, b)
}
servers = append(servers, rsWithRanges{
rs: rs,
ranges: bounds,
})
}
return servers, nil
}
func partitionByReplicationSet(fingerprints []*logproto.GroupedChunkRefs, rs []rsWithRanges) (result []rsWithRanges) {
for _, inst := range rs {
for _, bounds := range inst.ranges {
min, _ := slices.BinarySearchFunc(fingerprints, bounds, func(g *logproto.GroupedChunkRefs, b v1.FingerprintBounds) int {
if g.Fingerprint < uint64(b.Min) {
return -1
} else if g.Fingerprint > uint64(b.Min) {
return 1
}
return 0
})
max, _ := slices.BinarySearchFunc(fingerprints, bounds, func(g *logproto.GroupedChunkRefs, b v1.FingerprintBounds) int {
if g.Fingerprint <= uint64(b.Max) {
return -1
} else if g.Fingerprint > uint64(b.Max) {
return 1
}
return 0
})
// fingerprint is out of boundaries
if min == len(fingerprints) || max == 0 {
continue
}
inst.groups = append(inst.groups, fingerprints[min:max]...)
}
if len(inst.groups) > 0 {
result = append(result, inst)
}
}
return result
}
// GetShuffleShardingSubring returns the subring to be used for a given user.
// This function should be used both by index gateway servers and clients in
// order to guarantee the same logic is used.
func GetShuffleShardingSubring(ring ring.ReadRing, tenantID string, limits Limits) ring.ReadRing {
shardSize := limits.BloomGatewayShardSize(tenantID)
// A shard size of 0 means shuffle sharding is disabled for this specific user,
// so we just return the full ring so that indexes will be sharded across all index gateways.
// Since we set the shard size to replication factor if shard size is 0, this
// can only happen if both the shard size and the replication factor are set
// to 0.
if shardSize <= 0 {
return ring
}
return ring.ShuffleShard(tenantID, shardSize)
}

@ -0,0 +1,106 @@
package bloomgateway
import (
"context"
"flag"
"sort"
"time"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/grafana/dskit/ring/client"
"github.com/grafana/dskit/services"
"github.com/grafana/loki/v3/pkg/util/jumphash"
)
// PoolConfig is config for creating a Pool.
// It has the same fields as "github.com/grafana/dskit/ring/client.PoolConfig" so it can be cast.
type PoolConfig struct {
CheckInterval time.Duration `yaml:"check_interval"`
HealthCheckEnabled bool `yaml:"enable_health_check"`
HealthCheckTimeout time.Duration `yaml:"health_check_timeout"`
MaxConcurrentHealthChecks int `yaml:"-"`
}
// RegisterFlags adds the flags required to config this to the given FlagSet.
func (cfg *PoolConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
f.DurationVar(&cfg.CheckInterval, prefix+"check-interval", 10*time.Second, "How frequently to clean up clients for servers that have gone away or are unhealthy.")
f.BoolVar(&cfg.HealthCheckEnabled, prefix+"enable-health-check", true, "Run a health check on each server during periodic cleanup.")
f.DurationVar(&cfg.HealthCheckTimeout, prefix+"health-check-timeout", 1*time.Second, "Timeout for the health check if health check is enabled.")
}
func (cfg *PoolConfig) Validate() error {
return nil
}
type JumpHashClientPool struct {
*client.Pool
*jumphash.Selector
done chan struct{}
logger log.Logger
}
type AddressProvider interface {
Addresses() []string
}
func NewJumpHashClientPool(pool *client.Pool, dnsProvider AddressProvider, updateInterval time.Duration, logger log.Logger) *JumpHashClientPool {
selector := jumphash.DefaultSelector()
err := selector.SetServers(dnsProvider.Addresses()...)
if err != nil {
level.Warn(logger).Log("msg", "error updating servers", "err", err)
}
p := &JumpHashClientPool{
Pool: pool,
Selector: selector,
done: make(chan struct{}),
logger: logger,
}
go p.updateLoop(dnsProvider, updateInterval)
return p
}
func (p *JumpHashClientPool) AddrForFingerprint(fp uint64) (string, error) {
addr, err := p.FromUInt64(fp)
if err != nil {
return "", err
}
return addr.String(), nil
}
func (p *JumpHashClientPool) Start() {
ctx := context.Background()
_ = services.StartAndAwaitRunning(ctx, p.Pool)
}
func (p *JumpHashClientPool) Stop() {
ctx := context.Background()
_ = services.StopAndAwaitTerminated(ctx, p.Pool)
close(p.done)
}
func (p *JumpHashClientPool) updateLoop(provider AddressProvider, updateInterval time.Duration) {
ticker := time.NewTicker(updateInterval)
defer ticker.Stop()
for {
select {
case <-p.done:
return
case <-ticker.C:
servers := provider.Addresses()
// ServerList deterministically maps keys to _index_ of the server list.
// Since DNS returns records in different order each time, we sort to
// guarantee best possible match between nodes.
sort.Strings(servers)
err := p.SetServers(servers...)
if err != nil {
level.Warn(p.logger).Log("msg", "error updating servers", "err", err)
}
}
}
}

@ -0,0 +1,35 @@
package bloomgateway
import (
"testing"
"time"
"github.com/go-kit/log"
"github.com/stretchr/testify/require"
)
type provider struct {
addresses []string
}
func (p *provider) Addresses() []string {
return p.addresses
}
func TestJumpHashClientPool_UpdateLoop(t *testing.T) {
interval := 100 * time.Millisecond
provider := &provider{[]string{"localhost:9095"}}
pool := NewJumpHashClientPool(nil, provider, interval, log.NewNopLogger())
require.Len(t, pool.Addrs(), 1)
require.Equal(t, "127.0.0.1:9095", pool.Addrs()[0].String())
// update address list
provider.addresses = []string{"localhost:9095", "localhost:9096"}
// wait refresh interval
time.Sleep(2 * interval)
// pool has been updated
require.Len(t, pool.Addrs(), 2)
require.Equal(t, "127.0.0.1:9095", pool.Addrs()[0].String())
require.Equal(t, "127.0.0.1:9096", pool.Addrs()[1].String())
}

@ -2,35 +2,19 @@ package bloomgateway
import (
"context"
"fmt"
"math"
"testing"
"time"
"github.com/go-kit/log"
"github.com/grafana/dskit/flagext"
"github.com/grafana/dskit/ring"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"github.com/grafana/loki/v3/pkg/bloomutils"
"github.com/grafana/loki/v3/pkg/logproto"
"github.com/grafana/loki/v3/pkg/logql/syntax"
"github.com/grafana/loki/v3/pkg/querier/plan"
v1 "github.com/grafana/loki/v3/pkg/storage/bloom/v1"
"github.com/grafana/loki/v3/pkg/validation"
)
func rs(id int, tokens ...uint32) ring.ReplicationSet {
inst := ring.InstanceDesc{
Id: fmt.Sprintf("instance-%d", id),
Addr: fmt.Sprintf("10.0.0.%d", id),
Tokens: tokens,
}
return ring.ReplicationSet{Instances: []ring.InstanceDesc{inst}}
}
func TestBloomGatewayClient(t *testing.T) {
logger := log.NewNopLogger()
reg := prometheus.NewRegistry()
@ -42,7 +26,7 @@ func TestBloomGatewayClient(t *testing.T) {
flagext.DefaultValues(&cfg)
t.Run("FilterChunks returns response", func(t *testing.T) {
c, err := NewClient(cfg, &mockRing{}, l, reg, logger, "loki", nil, false)
c, err := NewClient(cfg, l, reg, logger, nil, false)
require.NoError(t, err)
expr, err := syntax.ParseExpr(`{foo="bar"}`)
require.NoError(t, err)
@ -51,283 +35,3 @@ func TestBloomGatewayClient(t *testing.T) {
require.Equal(t, 0, len(res))
})
}
func TestBloomGatewayClient_ReplicationSetsWithBounds(t *testing.T) {
testCases := map[string]struct {
instances []ring.InstanceDesc
expected []rsWithRanges
}{
"single instance covers full range": {
instances: []ring.InstanceDesc{
{Id: "instance-1", Addr: "10.0.0.1", Tokens: []uint32{(1 << 31)}}, // 0x80000000
},
expected: []rsWithRanges{
{rs: rs(1, (1 << 31)), ranges: []v1.FingerprintBounds{
v1.NewBounds(0, math.MaxUint64),
}},
},
},
"one token per instance": {
instances: []ring.InstanceDesc{
{Id: "instance-1", Addr: "10.0.0.1", Tokens: []uint32{(1 << 30) * 1}}, // 0x40000000
{Id: "instance-2", Addr: "10.0.0.2", Tokens: []uint32{(1 << 30) * 2}}, // 0x80000000
{Id: "instance-3", Addr: "10.0.0.3", Tokens: []uint32{(1 << 30) * 3}}, // 0xc0000000
},
expected: []rsWithRanges{
{rs: rs(1, (1<<30)*1), ranges: []v1.FingerprintBounds{
v1.NewBounds(0, 4611686018427387903),
v1.NewBounds(13835058055282163712, 18446744073709551615),
}},
{rs: rs(2, (1<<30)*2), ranges: []v1.FingerprintBounds{
v1.NewBounds(4611686018427387904, 9223372036854775807),
}},
{rs: rs(3, (1<<30)*3), ranges: []v1.FingerprintBounds{
v1.NewBounds(9223372036854775808, 13835058055282163711),
}},
},
},
"extreme tokens in ring": {
instances: []ring.InstanceDesc{
{Id: "instance-1", Addr: "10.0.0.1", Tokens: []uint32{0}},
{Id: "instance-2", Addr: "10.0.0.2", Tokens: []uint32{math.MaxUint32}},
},
expected: []rsWithRanges{
{rs: rs(1, 0), ranges: []v1.FingerprintBounds{
v1.NewBounds(math.MaxUint64-math.MaxUint32, math.MaxUint64),
}},
{rs: rs(2, math.MaxUint32), ranges: []v1.FingerprintBounds{
v1.NewBounds(0, math.MaxUint64-math.MaxUint32-1),
}},
},
},
}
for name, tc := range testCases {
tc := tc
t.Run(name, func(t *testing.T) {
subRing := newMockRing(t, tc.instances)
res, err := replicationSetsWithBounds(subRing, tc.instances)
require.NoError(t, err)
require.Equal(t, tc.expected, res)
})
}
}
func TestBloomGatewayClient_PartitionByReplicationSet(t *testing.T) {
// Create 10 fingerprints [0, 2, 4, ... 18]
groups := make([]*logproto.GroupedChunkRefs, 0, 10)
for i := 0; i < 20; i += 2 {
groups = append(groups, &logproto.GroupedChunkRefs{Fingerprint: uint64(i)})
}
// instance token ranges do not overlap
t.Run("non-overlapping", func(t *testing.T) {
servers := []rsWithRanges{
{rs: rs(1), ranges: []v1.FingerprintBounds{v1.NewBounds(0, 4)}},
{rs: rs(2), ranges: []v1.FingerprintBounds{v1.NewBounds(5, 9), v1.NewBounds(15, 19)}},
{rs: rs(3), ranges: []v1.FingerprintBounds{v1.NewBounds(10, 14)}},
}
// partition fingerprints
expected := [][]*logproto.GroupedChunkRefs{
{
{Fingerprint: 0},
{Fingerprint: 2},
{Fingerprint: 4},
},
{
{Fingerprint: 6},
{Fingerprint: 8},
{Fingerprint: 16},
{Fingerprint: 18},
},
{
{Fingerprint: 10},
{Fingerprint: 12},
{Fingerprint: 14},
},
}
partitioned := partitionByReplicationSet(groups, servers)
for i := range partitioned {
require.Equal(t, expected[i], partitioned[i].groups)
}
})
// instance token ranges overlap -- this should not happen in a real ring, though
t.Run("overlapping", func(t *testing.T) {
servers := []rsWithRanges{
{rs: rs(1), ranges: []v1.FingerprintBounds{v1.NewBounds(0, 9)}},
{rs: rs(2), ranges: []v1.FingerprintBounds{v1.NewBounds(5, 14)}},
{rs: rs(3), ranges: []v1.FingerprintBounds{v1.NewBounds(10, 19)}},
}
// partition fingerprints
expected := [][]*logproto.GroupedChunkRefs{
{
{Fingerprint: 0},
{Fingerprint: 2},
{Fingerprint: 4},
{Fingerprint: 6},
{Fingerprint: 8},
},
{
{Fingerprint: 6},
{Fingerprint: 8},
{Fingerprint: 10},
{Fingerprint: 12},
{Fingerprint: 14},
},
{
{Fingerprint: 10},
{Fingerprint: 12},
{Fingerprint: 14},
{Fingerprint: 16},
{Fingerprint: 18},
},
}
partitioned := partitionByReplicationSet(groups, servers)
for i := range partitioned {
require.Equal(t, expected[i], partitioned[i].groups)
}
})
}
func BenchmarkPartitionFingerprintsByAddresses(b *testing.B) {
numFp := 100000
fpStep := math.MaxUint64 / uint64(numFp)
groups := make([]*logproto.GroupedChunkRefs, 0, numFp)
for i := uint64(0); i < math.MaxUint64-fpStep; i += fpStep {
groups = append(groups, &logproto.GroupedChunkRefs{Fingerprint: i})
}
numServers := 100
tokenStep := math.MaxUint32 / uint32(numServers)
servers := make([]rsWithRanges, 0, numServers)
for i := uint32(0); i < math.MaxUint32-tokenStep; i += tokenStep {
servers = append(servers, rsWithRanges{
rs: rs(int(i)),
ranges: []v1.FingerprintBounds{
v1.NewBounds(model.Fingerprint(i)<<32, model.Fingerprint(i+tokenStep)<<32),
},
})
}
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = partitionByReplicationSet(groups, servers)
}
}
func TestBloomGatewayClient_MapTokenRangeToFingerprintRange(t *testing.T) {
testCases := map[string]struct {
lshift int
inp bloomutils.Range[uint32]
exp v1.FingerprintBounds
}{
"single token expands to multiple fingerprints": {
inp: bloomutils.NewTokenRange(0, 0),
exp: v1.NewBounds(0, 0xffffffff),
},
"max value expands to max value of new range": {
inp: bloomutils.NewTokenRange((1 << 31), math.MaxUint32),
exp: v1.NewBounds((1 << 63), 0xffffffffffffffff),
},
}
for desc, tc := range testCases {
t.Run(desc, func(t *testing.T) {
actual := mapTokenRangeToFingerprintRange(tc.inp)
require.Equal(t, tc.exp, actual)
})
}
}
// make sure mockRing implements the ring.ReadRing interface
var _ ring.ReadRing = &mockRing{}
func newMockRing(t *testing.T, instances []ring.InstanceDesc) *mockRing {
ranges := make([]ring.TokenRanges, 0)
for i := range instances {
tr, err := bloomutils.TokenRangesForInstance(instances[i].Id, instances)
if err != nil {
t.Fatal(err)
}
ranges = append(ranges, tr)
}
return &mockRing{
instances: instances,
ranges: ranges,
}
}
type mockRing struct {
instances []ring.InstanceDesc
ranges []ring.TokenRanges
}
// Get implements ring.ReadRing.
func (r *mockRing) Get(key uint32, _ ring.Operation, _ []ring.InstanceDesc, _ []string, _ []string) (rs ring.ReplicationSet, err error) {
for i := range r.ranges {
if r.ranges[i].IncludesKey(key) {
rs.Instances = append(rs.Instances, r.instances[i])
}
}
return
}
// GetAllHealthy implements ring.ReadRing.
func (r *mockRing) GetAllHealthy(_ ring.Operation) (ring.ReplicationSet, error) {
return ring.ReplicationSet{
Instances: r.instances,
}, nil
}
// GetInstanceState implements ring.ReadRing.
func (*mockRing) GetInstanceState(_ string) (ring.InstanceState, error) {
panic("unimplemented")
}
// GetReplicationSetForOperation implements ring.ReadRing.
func (*mockRing) GetReplicationSetForOperation(_ ring.Operation) (ring.ReplicationSet, error) {
panic("unimplemented")
}
// HasInstance implements ring.ReadRing.
func (*mockRing) HasInstance(_ string) bool {
panic("unimplemented")
}
// InstancesCount implements ring.ReadRing.
func (r *mockRing) InstancesCount() int {
return len(r.instances)
}
// ReplicationFactor implements ring.ReadRing.
func (*mockRing) ReplicationFactor() int {
return 1
}
// ShuffleShard implements ring.ReadRing.
func (r *mockRing) ShuffleShard(_ string, _ int) ring.ReadRing {
return r
}
// ShuffleShardWithLookback implements ring.ReadRing.
func (*mockRing) ShuffleShardWithLookback(_ string, _ int, _ time.Duration, _ time.Time) ring.ReadRing {
panic("unimplemented")
}
// CleanupShuffleShardCache implements ring.ReadRing.
func (*mockRing) CleanupShuffleShardCache(_ string) {
panic("unimplemented")
}
func (r *mockRing) GetTokenRangesForInstance(id string) (ring.TokenRanges, error) {
return bloomutils.TokenRangesForInstance(id, r.instances)
}

@ -3,6 +3,7 @@ package bloomgateway
import (
"time"
"github.com/grafana/dskit/instrument"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
@ -16,6 +17,8 @@ type metrics struct {
type clientMetrics struct {
cacheLocalityScore prometheus.Histogram
requestLatency *prometheus.HistogramVec
clients prometheus.Gauge
}
func newClientMetrics(registerer prometheus.Registerer) *clientMetrics {
@ -27,6 +30,19 @@ func newClientMetrics(registerer prometheus.Registerer) *clientMetrics {
Help: "Cache locality score of the bloom filter, as measured by % of keyspace touched / % of bloom_gws required",
Buckets: prometheus.LinearBuckets(0.01, 0.2, 5),
}),
requestLatency: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
Namespace: constants.Loki,
Subsystem: "bloom_gateway_client",
Name: "request_duration_seconds",
Help: "Time (in seconds) spent serving requests when using the bloom gateway",
Buckets: instrument.DefBuckets,
}, []string{"operation", "status_code"}),
clients: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
Namespace: constants.Loki,
Subsystem: "bloom_gateway",
Name: "clients",
Help: "The current number of bloom gateway clients.",
}),
}
}

@ -1451,11 +1451,9 @@ func (t *Loki) initIndexGateway() (services.Service, error) {
if t.Cfg.BloomGateway.Enabled {
bloomGatewayClient, err := bloomgateway.NewClient(
t.Cfg.BloomGateway.Client,
t.bloomGatewayRingManager.Ring,
t.Overrides,
prometheus.DefaultRegisterer,
logger,
t.Cfg.MetricsNamespace,
t.cacheGenerationLoader,
t.Cfg.CompactorConfig.RetentionEnabled,
)

@ -22,6 +22,7 @@ import (
"github.com/sony/gobreaker"
"github.com/grafana/loki/v3/pkg/util/constants"
"github.com/grafana/loki/v3/pkg/util/jumphash"
)
// MemcachedClient interface exists for mocking memcacheClient.
@ -113,7 +114,7 @@ func (cfg *MemcachedClientConfig) RegisterFlagsWithPrefix(prefix, description st
func NewMemcachedClient(cfg MemcachedClientConfig, name string, r prometheus.Registerer, logger log.Logger, metricsNamespace string) MemcachedClient {
var selector serverSelector
if cfg.ConsistentHash {
selector = DefaultMemcachedJumpHashSelector()
selector = jumphash.DefaultSelector()
} else {
selector = &memcache.ServerList{}
}

@ -3,6 +3,7 @@ package discovery
import (
"context"
"fmt"
"strings"
"sync"
"time"
@ -15,7 +16,7 @@ import (
type DNS struct {
logger log.Logger
cleanupPeriod time.Duration
address string
addresses []string
stop chan struct{}
done sync.WaitGroup
once sync.Once
@ -27,7 +28,7 @@ func NewDNS(logger log.Logger, cleanupPeriod time.Duration, address string, reg
d := &DNS{
logger: logger,
cleanupPeriod: cleanupPeriod,
address: address,
addresses: strings.Split(address, ","),
stop: make(chan struct{}),
done: sync.WaitGroup{},
dnsProvider: dnsProvider,
@ -70,10 +71,10 @@ func (d *DNS) discoveryLoop() {
}
func (d *DNS) runDiscovery() {
ctx, cancel := context.WithTimeoutCause(context.Background(), 5*time.Second, fmt.Errorf("DNS lookup timeout: %s", d.address))
ctx, cancel := context.WithTimeoutCause(context.Background(), 5*time.Second, fmt.Errorf("DNS lookup timeout: %v", d.addresses))
defer cancel()
err := d.dnsProvider.Resolve(ctx, []string{d.address})
err := d.dnsProvider.Resolve(ctx, d.addresses)
if err != nil {
level.Error(d.logger).Log("msg", "failed to resolve index gateway address", "err", err)
level.Error(d.logger).Log("msg", "failed to resolve server addresses", "err", err)
}
}

@ -1,4 +1,4 @@
package cache
package jumphash
import (
"net"
@ -13,16 +13,16 @@ import (
util_log "github.com/grafana/loki/v3/pkg/util/log"
)
// MemcachedJumpHashSelector implements the memcache.ServerSelector
// interface. MemcachedJumpHashSelector utilizes a jump hash to
// Selector implements the memcache.ServerSelector
// interface. Selector utilizes a jump hash to
// distribute keys to servers.
//
// While adding or removing servers only requires 1/N keys to move,
// servers are treated as a stack and can only be pushed/popped.
// Therefore, MemcachedJumpHashSelector works best for servers
// Therefore, Selector works best for servers
// with consistent DNS names where the naturally sorted order
// is predictable.
type MemcachedJumpHashSelector struct {
type Selector struct {
mu sync.RWMutex
addrs []net.Addr
resolveUnixAddr UnixResolver
@ -33,15 +33,15 @@ type UnixResolver func(network, address string) (*net.UnixAddr, error)
type TCPResolver func(network, address string) (*net.TCPAddr, error)
func NewMemcachedJumpHashSelector(resolveUnixAddr UnixResolver, resolveTCPAddr TCPResolver) *MemcachedJumpHashSelector {
return &MemcachedJumpHashSelector{
func NewSelector(resolveUnixAddr UnixResolver, resolveTCPAddr TCPResolver) *Selector {
return &Selector{
resolveUnixAddr: resolveUnixAddr,
resolveTCPAddr: resolveTCPAddr,
}
}
func DefaultMemcachedJumpHashSelector() *MemcachedJumpHashSelector {
return &MemcachedJumpHashSelector{
func DefaultSelector() *Selector {
return &Selector{
resolveUnixAddr: net.ResolveUnixAddr,
resolveTCPAddr: net.ResolveTCPAddr,
}
@ -78,7 +78,7 @@ func (a *staticAddr) String() string { return a.str }
// To minimize the number of rehashes for keys when scaling the
// number of servers in subsequent calls to SetServers, servers
// are stored in natural sort order.
func (s *MemcachedJumpHashSelector) SetServers(servers ...string) error {
func (s *Selector) SetServers(servers ...string) error {
sortedServers := make([]string, len(servers))
copy(sortedServers, servers)
natsort.Sort(sortedServers)
@ -138,7 +138,11 @@ func jumpHash(key uint64, numBuckets int) int32 {
// PickServer returns the server address that a given item
// should be shared onto.
func (s *MemcachedJumpHashSelector) PickServer(key string) (net.Addr, error) {
func (s *Selector) PickServer(key string) (net.Addr, error) {
return s.FromString(key)
}
func (s *Selector) FromString(key string) (net.Addr, error) {
s.mu.RLock()
defer s.mu.RUnlock()
if len(s.addrs) == 0 {
@ -151,10 +155,22 @@ func (s *MemcachedJumpHashSelector) PickServer(key string) (net.Addr, error) {
return s.addrs[idx], nil
}
func (s *Selector) FromUInt64(key uint64) (net.Addr, error) {
s.mu.RLock()
defer s.mu.RUnlock()
if len(s.addrs) == 0 {
return nil, memcache.ErrNoServers
} else if len(s.addrs) == 1 {
return s.addrs[0], nil
}
idx := jumpHash(key, len(s.addrs))
return s.addrs[idx], nil
}
// Each iterates over each server and calls the given function.
// If f returns a non-nil error, iteration will stop and that
// error will be returned.
func (s *MemcachedJumpHashSelector) Each(f func(net.Addr) error) error {
func (s *Selector) Each(f func(net.Addr) error) error {
s.mu.RLock()
defer s.mu.RUnlock()
for _, def := range s.addrs {
@ -164,3 +180,9 @@ func (s *MemcachedJumpHashSelector) Each(f func(net.Addr) error) error {
}
return nil
}
func (s *Selector) Addrs() []net.Addr {
s.mu.RLock()
defer s.mu.RUnlock()
return s.addrs
}

@ -1,4 +1,4 @@
package cache_test
package jumphash
import (
"fmt"
@ -8,8 +8,6 @@ import (
"github.com/facette/natsort"
"github.com/grafana/gomemcache/memcache"
"github.com/stretchr/testify/require"
"github.com/grafana/loki/v3/pkg/storage/chunk/cache"
)
func TestNatSort(t *testing.T) {
@ -58,7 +56,7 @@ var mockTCPResolver = func(network, address string) (*net.TCPAddr, error) {
}
func TestMemcachedJumpHashSelector_PickSever(t *testing.T) {
s := cache.NewMemcachedJumpHashSelector(
s := NewSelector(
mockUnixResolver,
mockTCPResolver,
)
@ -85,7 +83,7 @@ func TestMemcachedJumpHashSelector_PickSever(t *testing.T) {
}
func TestMemcachedJumpHashSelector_PickSever_ErrNoServers(t *testing.T) {
s := cache.NewMemcachedJumpHashSelector(
s := NewSelector(
mockUnixResolver,
mockTCPResolver,
)
Loading…
Cancel
Save