@ -14,6 +14,7 @@
package remote
import (
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
@ -21,16 +22,6 @@ import (
"github.com/prometheus/common/model"
)
// Limits that govern how samples are batched and sent to the remote storage.
const (
	// maxConcurrentSends caps the number of send requests to the remote
	// storage that may be in flight at once.
	maxConcurrentSends = 10

	// maxSamplesPerSend caps the number of samples packed into a single
	// request to the remote storage.
	maxSamplesPerSend = 100

	// batchSendDeadline is how long queued samples may wait before being
	// sent, even when a full batch has not accumulated yet.
	batchSendDeadline = 5 * time.Second
)
// String constants for instrumentation.
const (
namespace = "prometheus"
@ -51,14 +42,28 @@ type StorageClient interface {
Name ( ) string
}
// StorageQueueManagerConfig holds the tunable parameters of a
// StorageQueueManager.
type StorageQueueManagerConfig struct {
	// QueueCapacity is the number of samples to buffer per shard before we
	// start dropping them.
	QueueCapacity int
	// Shards is the number of shards, i.e. the amount of concurrency.
	Shards int
	// MaxSamplesPerSend is the maximum number of samples per send.
	MaxSamplesPerSend int
	// BatchSendDeadline is the maximum time a sample will wait in the buffer.
	BatchSendDeadline time.Duration
}
// defaultConfig is the configuration NewStorageQueueManager falls back to
// when it is handed a nil config.
var defaultConfig = StorageQueueManagerConfig{
	// Per-shard buffer size. NOTE(review): presumably 100*1024 samples
	// overall divided across the 10 default shards — confirm if Shards
	// is ever changed independently.
	QueueCapacity:     100 * 1024 / 10,
	Shards:            10,
	MaxSamplesPerSend: 100,
	BatchSendDeadline: 5 * time.Second,
}
// StorageQueueManager manages a queue of samples to be sent to the Storage
// indicated by the provided StorageClient.
type StorageQueueManager struct {
tsdb StorageClient
queue chan * model . Sample
pendingSamples model . Samples
sendSemaphore chan bool
drained chan bool
cfg StorageQueueManagerConfig
tsdb StorageClient
shards [ ] chan * model . Sample
wg sync . WaitGroup
done chan struct { }
samplesCount * prometheus . CounterVec
sendLatency prometheus . Summary
@ -69,16 +74,25 @@ type StorageQueueManager struct {
}
// NewStorageQueueManager builds a new StorageQueueManager.
func NewStorageQueueManager ( tsdb StorageClient , queueCapacity int ) * StorageQueueManager {
func NewStorageQueueManager ( tsdb StorageClient , cfg * StorageQueueManagerConfig ) * StorageQueueManager {
constLabels := prometheus . Labels {
"type" : tsdb . Name ( ) ,
}
return & StorageQueueManager {
tsdb : tsdb ,
queue : make ( chan * model . Sample , queueCapacity ) ,
sendSemaphore : make ( chan bool , maxConcurrentSends ) ,
drained : make ( chan bool ) ,
if cfg == nil {
cfg = & defaultConfig
}
shards := make ( [ ] chan * model . Sample , cfg . Shards )
for i := 0 ; i < cfg . Shards ; i ++ {
shards [ i ] = make ( chan * model . Sample , cfg . QueueCapacity )
}
t := & StorageQueueManager {
cfg : * cfg ,
tsdb : tsdb ,
shards : shards ,
done : make ( chan struct { } ) ,
samplesCount : prometheus . NewCounterVec (
prometheus . CounterOpts {
@ -126,17 +140,23 @@ func NewStorageQueueManager(tsdb StorageClient, queueCapacity int) *StorageQueue
constLabels ,
) ,
prometheus . GaugeValue ,
float64 ( q ueueCapacity) ,
float64 ( cfg . Q ueueCapacity) ,
) ,
}
t . wg . Add ( cfg . Shards )
return t
}
// Append queues a sample to be sent to the remote storage. It drops the
// sample on the floor if the queue is full.
// Always returns nil.
func ( t * StorageQueueManager ) Append ( s * model . Sample ) error {
fp := s . Metric . FastFingerprint ( )
shard := uint64 ( fp ) % uint64 ( t . cfg . Shards )
select {
case t . queue <- s :
case t . shards [ shard ] <- s :
default :
t . samplesCount . WithLabelValues ( dropped ) . Inc ( )
log . Warn ( "Remote storage queue full, discarding sample." )
@ -144,16 +164,11 @@ func (t *StorageQueueManager) Append(s *model.Sample) error {
return nil
}
// Stop stops sending samples to the remote storage and waits for pending
// sends to complete.
func ( t * StorageQueueManager ) Stop ( ) {
log . Infof ( "Stopping remote storage..." )
close ( t . queue )
<- t . drained
for i := 0 ; i < maxConcurrentSends ; i ++ {
t . sendSemaphore <- true
}
log . Info ( "Remote storage stopped." )
// NeedsThrottling implements storage.SampleAppender. The answer is always
// false: when the remote storage backlogs, samples are dropped on the floor
// rather than throttling being requested.
func (t *StorageQueueManager) NeedsThrottling() bool {
	return false
}
// Describe implements prometheus.Collector.
@ -166,79 +181,96 @@ func (t *StorageQueueManager) Describe(ch chan<- *prometheus.Desc) {
ch <- t . queueCapacity . Desc ( )
}
// queueLen returns the number of outstanding samples in the queue, summed
// over every shard channel.
func (t *StorageQueueManager) queueLen() int {
	total := 0
	for i := range t.shards {
		total += len(t.shards[i])
	}
	return total
}
// Collect implements prometheus.Collector. It forwards the samplesCount and
// sendLatency collectors to ch, refreshes the queueLength gauge, and then
// sends the failedBatches, failedSamples, queueLength and queueCapacity
// metrics on ch.
func ( t * StorageQueueManager ) Collect ( ch chan <- prometheus . Metric ) {
t . samplesCount . Collect ( ch )
t . sendLatency . Collect ( ch )
// NOTE(review): queueLength is set twice in a row, first from len(t.queue)
// and then from t.queueLen(). These look like the pre- and post-change
// variants of the same line sitting side by side — confirm which is intended.
t . queueLength . Set ( float64 ( len ( t . queue ) ) )
t . queueLength . Set ( float64 ( t . queueLen ( ) ) )
ch <- t . failedBatches
ch <- t . failedSamples
ch <- t . queueLength
ch <- t . queueCapacity
}
// sendSamples ships one batch to the remote storage in its own goroutine.
// A slot in t.sendSemaphore is taken before the goroutine starts (blocking
// the caller while the semaphore is full) and released when the goroutine
// finishes.
func (t *StorageQueueManager) sendSamples(s model.Samples) {
	t.sendSemaphore <- true

	go func() {
		// Give the slot back no matter how the send turns out.
		defer func() { <-t.sendSemaphore }()

		// Delivery is best-effort: a batch that fails on its first attempt
		// is not retried, it is simply dropped on the floor.
		start := time.Now()
		err := t.tsdb.Store(s)
		elapsed := time.Since(start).Seconds()

		batchSize := float64(len(s))
		outcome := success
		if err != nil {
			log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
			outcome = failure
			t.failedBatches.Inc()
			t.failedSamples.Add(batchSize)
		}
		t.samplesCount.WithLabelValues(outcome).Add(batchSize)
		t.sendLatency.Observe(elapsed)
	}()
}
// Run continuously sends samples to the remote storage. It starts one
// runShard goroutine per configured shard (t.cfg.Shards) and then blocks on
// t.wg until the wait group reaches zero.
func ( t * StorageQueueManager ) Run ( ) {
// NOTE(review): t.drained is closed when Run returns; confirm this field is
// still part of the struct alongside the wait-group based shutdown.
defer func ( ) {
close ( t . drained )
} ( )
for i := 0 ; i < t . cfg . Shards ; i ++ {
go t . runShard ( i )
}
t . wg . Wait ( )
}
// Stop shuts down sending to the remote storage: it closes every shard
// channel and then blocks on t.wg, so pending sends get a chance to
// complete before it returns.
func (t *StorageQueueManager) Stop() {
	log.Infof("Stopping remote storage...")

	for i := range t.shards {
		close(t.shards[i])
	}
	t.wg.Wait()

	log.Info("Remote storage stopped.")
}
// runShard is the worker loop for shard i. It receives samples from
// t.shards[i], accumulates them, and hands them to sendSamples in batches.
// It calls t.wg.Done when it returns.
//
// NOTE(review): several statements below appear twice, once against
// t.queue / t.pendingSamples / maxSamplesPerSend / batchSendDeadline and once
// against the local shard / pendingSamples / t.cfg values. This looks like
// unmerged pre- and post-change lines — confirm which set is intended.
func ( t * StorageQueueManager ) runShard ( i int ) {
defer t . wg . Done ( )
shard := t . shards [ i ]
// Send batches of at most MaxSamplesPerSend samples to the remote storage.
// If we have fewer samples than that, flush them out after a deadline
// anyways.
pendingSamples := model . Samples { }
for {
select {
case s , ok := <- t . queue :
case s , ok := <- shard :
// ok is false once the channel has been closed and drained: send whatever
// is still pending and exit the worker.
if ! ok {
log . Infof ( "Flushing %d samples to remote storage..." , len ( t . pendingSamples ) )
t . flush ( )
log . Infof ( "Done flushing." )
if len ( pendingSamples ) > 0 {
log . Infof ( "Flushing %d samples to remote storage..." , len ( pendingSamples ) )
t . sendSamples ( pendingSamples )
log . Infof ( "Done flushing." )
}
return
}
t . pendingSamples = append ( t . pendingSamples , s )
pendingSamples = append ( pendingSamples , s )
// Peel off full batches from the front for as long as enough samples are
// pending.
for len ( t . pendingSamples ) >= maxSamplesPerSend {
t . sendSamples ( t . pendingSamples [ : maxSamplesPerSend ] )
t . pendingSamples = t . pendingSamples [ maxSamplesPerSend : ]
for len ( pendingSamples ) >= t . cfg . MaxSamplesPerSend {
t . sendSamples ( pendingSamples [ : t . cfg . MaxSamplesPerSend ] )
pendingSamples = pendingSamples [ t . cfg . MaxSamplesPerSend : ]
}
// time.After is evaluated each time the select is entered, so the deadline
// is measured from the start of the current loop iteration rather than from
// the arrival of the oldest pending sample.
case <- time . After ( t . cfg . BatchSendDeadline ) :
if len ( pendingSamples ) > 0 {
t . sendSamples ( pendingSamples )
// Truncate to length zero while keeping the backing array.
pendingSamples = pendingSamples [ : 0 ]
}
case <- time . After ( batchSendDeadline ) :
t . flush ( )
}
}
}
// Flush flushes remaining queued samples.
//
// NOTE(review): the flush body below is not closed before the sendSamples
// definition begins, and a t.pendingSamples reset appears inside sendSamples.
// This looks like interleaved pre- and post-change lines — confirm the
// intended final form.
func ( t * StorageQueueManager ) flush ( ) {
if len ( t . pendingSamples ) > 0 {
t . sendSamples ( t . pendingSamples )
// sendSamples stores the batch s via t.tsdb.Store, timing the call, and then
// records the outcome: on error it logs a warning and bumps failedBatches and
// failedSamples; in all cases it adds len(s) to samplesCount under the
// success/failure label and observes the elapsed seconds in sendLatency.
func ( t * StorageQueueManager ) sendSamples ( s model . Samples ) {
// Samples are sent to the remote storage on a best-effort basis. If a
// sample isn't sent correctly the first time, it's simply dropped on the
// floor.
begin := time . Now ( )
err := t . tsdb . Store ( s )
// Elapsed wall time of the Store call, in seconds.
duration := time . Since ( begin ) . Seconds ( )
labelValue := success
if err != nil {
log . Warnf ( "error sending %d samples to remote storage: %s" , len ( s ) , err )
labelValue = failure
t . failedBatches . Inc ( )
t . failedSamples . Add ( float64 ( len ( s ) ) )
}
t . pendingSamples = t . pendingSamples [ : 0 ]
t . samplesCount . WithLabelValues ( labelValue ) . Add ( float64 ( len ( s ) ) )
t . sendLatency . Observe ( duration )
}