Lambda-promtail: Add support for processing SQS messages, add promtailClient Type, add logger, upgrade dependencies and fix unexpected flushing behaviors (#8231)

pull/3493/head^2
Christophe Collot 2 years ago committed by GitHub
parent c6542e61fa
commit a013e9f342
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 2
      CHANGELOG.md
  2. 6
      docs/sources/clients/lambda-promtail/_index.md
  3. 2
      tools/lambda-promtail/go.mod
  4. 2
      tools/lambda-promtail/go.sum
  5. 9
      tools/lambda-promtail/lambda-promtail/cw.go
  6. 8
      tools/lambda-promtail/lambda-promtail/kinesis.go
  7. 15
      tools/lambda-promtail/lambda-promtail/logger.go
  8. 40
      tools/lambda-promtail/lambda-promtail/main.go
  9. 11
      tools/lambda-promtail/lambda-promtail/main_test.go
  10. 41
      tools/lambda-promtail/lambda-promtail/promtail.go
  11. 53
      tools/lambda-promtail/lambda-promtail/promtail_client.go
  12. 49
      tools/lambda-promtail/lambda-promtail/s3.go
  13. 32
      tools/lambda-promtail/lambda-promtail/sqs.go
  14. 68
      tools/lambda-promtail/lambda-promtail/sqs_test.go

@ -6,6 +6,7 @@
##### Enhancements
* [8231](https://github.com/grafana/loki/pull/8231) **CCOLLOT**: Lambda-promtail: add support for AWS SQS message ingestion.
* [8532](https://github.com/grafana/loki/pull/8532) **justcompile**: Adds Storage Class option to S3 objects
* [7951](https://github.com/grafana/loki/pull/7951) **MichelHollands**: Add a count template function to line_format and label_format.
* [7380](https://github.com/grafana/loki/pull/7380) **liguozhong**: metrics query: range vector support streaming agg when no overlap.
@ -33,6 +34,7 @@
##### Fixes
* [8231](https://github.com/grafana/loki/pull/8231) **CCOLLOT**: Lambda-promtail: fix flushing behavior of batches, leading to a significant increase in performance.
* [7784](https://github.com/grafana/loki/pull/7784) **isodude**: Fix default values of connect addresses for compactor and querier workers to work with IPv6.
* [7880](https://github.com/grafana/loki/pull/7880) **sandeepsukhani**: consider range and offset in queries while looking for schema config for query sharding.
* [7937](https://github.com/grafana/loki/pull/7937) **ssncferreira**: Deprecate CLI flag `-ruler.wal-cleaer.period` and replace it with `-ruler.wal-cleaner.period`.

@ -109,6 +109,12 @@ This workflow allows ingesting AWS loadbalancer logs stored on S3 to Loki.
Cloudfront [real-time logs](https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/real-time-logs.html) can be sent to a Kinesis data stream. The data stream can be mapped to be an [event source](https://docs.aws.amazon.com/lambda/latest/dg/invocation-eventsourcemapping.html) for lambda-promtail to deliver the logs to Loki.
### Triggering Lambda-Promtail via SQS
For AWS services that support sending messages to SQS (for example, S3 with an S3 Notification to SQS), events can be processed through an [SQS queue using a lambda trigger](https://docs.aws.amazon.com/lambda/latest/dg/with-sqs.html) instead of directly configuring the source service to trigger lambda. Lambda-promtail will retrieve the nested events from the SQS messages' body and process them as if they came directly from the source service.
### On-Failure log recovery using SQS
Triggering lambda-promtail through SQS allows handling on-failure recovery of the logs using a secondary SQS queue as a dead-letter-queue (DLQ). You can configure lambda so that unsuccessfully processed messages will be sent to the DLQ. After fixing the issue, operators will be able to reprocess the messages by sending back messages from the DLQ to the source queue using the [SQS DLQ redrive](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-configure-dead-letter-queue-redrive.html) feature.
## Propagated Labels
Incoming logs can have seven special labels assigned to them which can be used in [relabeling]({{<relref "../promtail/configuration#relabel_configs">}}) or later stages in a Promtail [pipeline]({{<relref "../promtail/pipelines/">}}):

@ -7,6 +7,7 @@ require (
github.com/aws/aws-sdk-go-v2 v1.16.0
github.com/aws/aws-sdk-go-v2/config v1.15.1
github.com/aws/aws-sdk-go-v2/service/s3 v1.22.0
github.com/go-kit/log v0.2.1
github.com/gogo/protobuf v1.3.2
github.com/golang/snappy v0.0.4
github.com/grafana/dskit v0.0.0-20230201083518-528d8a7d52f2
@ -40,7 +41,6 @@ require (
github.com/dustin/go-humanize v1.0.0 // indirect
github.com/fatih/color v1.14.1 // indirect
github.com/felixge/httpsnoop v1.0.3 // indirect
github.com/go-kit/log v0.2.1 // indirect
github.com/go-logfmt/logfmt v0.6.0 // indirect
github.com/gogo/googleapis v1.4.0 // indirect
github.com/gogo/status v1.1.1 // indirect

@ -1100,4 +1100,4 @@ rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0=
rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA=
sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc=
sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc=

@ -13,7 +13,6 @@ import (
func parseCWEvent(ctx context.Context, b *batch, ev *events.CloudwatchLogsEvent) error {
data, err := ev.AWSLogs.Parse()
if err != nil {
fmt.Println("error parsing log event: ", err)
return err
}
@ -42,18 +41,18 @@ func parseCWEvent(ctx context.Context, b *batch, ev *events.CloudwatchLogsEvent)
return nil
}
func processCWEvent(ctx context.Context, ev *events.CloudwatchLogsEvent) error {
batch, err := newBatch(ctx)
func processCWEvent(ctx context.Context, ev *events.CloudwatchLogsEvent, pClient Client) error {
batch, err := newBatch(ctx, pClient)
if err != nil {
return err
}
err = parseCWEvent(ctx, batch, ev)
if err != nil {
return err
return fmt.Errorf("error parsing log event: %s", err)
}
err = sendToPromtail(ctx, batch)
err = pClient.sendToPromtail(ctx, batch)
if err != nil {
return err
}

@ -33,17 +33,17 @@ func parseKinesisEvent(ctx context.Context, b batchIf, ev *events.KinesisEvent)
return nil
}
func processKinesisEvent(ctx context.Context, ev *events.KinesisEvent) error {
batch, _ := newBatch(ctx)
func processKinesisEvent(ctx context.Context, ev *events.KinesisEvent, pClient Client) error {
batch, _ := newBatch(ctx, pClient)
err := parseKinesisEvent(ctx, batch, ev)
if err != nil {
return err
}
err = sendToPromtail(ctx, batch)
err = pClient.sendToPromtail(ctx, batch)
if err != nil {
return err
}
return nil
}
}

@ -0,0 +1,15 @@
package main
import (
"os"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
)
// NewLogger returns a go-kit logfmt logger writing to stderr.
// Records below logLevel are dropped; when logLevel does not parse as a
// known level the filter falls back to debug (allow everything). Each
// record is annotated with the caller's file:line.
//
// NOTE(review): this returns a pointer to the log.Logger interface, which
// forces callers to dereference it; returning the interface value directly
// would be more idiomatic. Signature kept unchanged for compatibility.
func NewLogger(logLevel string) *log.Logger {
	threshold := level.ParseDefault(logLevel, level.DebugValue())
	l := log.With(
		level.NewFilter(log.NewLogfmtLogger(os.Stderr), level.Allow(threshold)),
		"caller", log.DefaultCaller,
	)
	return &l
}

@ -10,6 +10,8 @@ import (
"strconv"
"strings"
"github.com/go-kit/log/level"
"github.com/grafana/dskit/backoff"
"github.com/prometheus/common/model"
"github.com/aws/aws-lambda-go/events"
@ -23,7 +25,7 @@ const (
maxErrMsgLen = 1024
invalidExtraLabelsError = "Invalid value for environment variable EXTRA_LABELS. Expected a comma seperated list with an even number of entries. "
invalidExtraLabelsError = "Invalid value for environment variable EXTRA_LABELS. Expected a comma separated list with an even number of entries. "
)
var (
@ -97,7 +99,6 @@ func setupArguments() {
if strings.EqualFold(print, "false") {
printLogLine = false
}
s3Clients = make(map[string]*s3.Client)
}
@ -133,10 +134,12 @@ func applyExtraLabels(labels model.LabelSet) model.LabelSet {
func checkEventType(ev map[string]interface{}) (interface{}, error) {
var s3Event events.S3Event
var s3TestEvent events.S3TestEvent
var cwEvent events.CloudwatchLogsEvent
var kinesisEvent events.KinesisEvent
var sqsEvent events.SQSEvent
types := [...]interface{}{&s3Event, &cwEvent, &kinesisEvent}
types := [...]interface{}{&s3Event, &s3TestEvent, &cwEvent, &kinesisEvent, &sqsEvent}
j, _ := json.Marshal(ev)
reader := strings.NewReader(string(j))
@ -157,21 +160,42 @@ func checkEventType(ev map[string]interface{}) (interface{}, error) {
}
func handler(ctx context.Context, ev map[string]interface{}) error {
lvl, ok := os.LookupEnv("LOG_LEVEL")
if !ok {
lvl = "info"
}
log := NewLogger(lvl)
pClient := NewPromtailClient(&promtailClientConfig{
backoff: &backoff.Config{
MinBackoff: minBackoff,
MaxBackoff: maxBackoff,
MaxRetries: maxRetries,
},
http: &httpClientConfig{
timeout: timeout,
skipTlsVerify: skipTlsVerify,
},
}, log)
event, err := checkEventType(ev)
if err != nil {
fmt.Printf("invalid event: %s\n", ev)
level.Error(*pClient.log).Log("err", fmt.Errorf("invalid event: %s\n", ev))
return err
}
switch evt := event.(type) {
case *events.S3Event:
return processS3Event(ctx, evt)
return processS3Event(ctx, evt, pClient, pClient.log)
case *events.CloudwatchLogsEvent:
return processCWEvent(ctx, evt)
return processCWEvent(ctx, evt, pClient)
case *events.KinesisEvent:
return processKinesisEvent(ctx, evt)
return processKinesisEvent(ctx, evt, pClient)
case *events.SQSEvent:
return processSQSEvent(ctx, evt)
// When setting up S3 Notification on a bucket, a test event is first sent, see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/notification-content-structure.html
case *events.S3TestEvent:
return nil
}
return err
}

@ -1,13 +1,14 @@
package main
import (
"testing"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"testing"
)
func TestLambdaPromtail_ExtraLabelsValid(t *testing.T) {
extraLabels, err := parseExtraLabels("A1,a,B2,b,C3,c,D4,d")
extraLabels, err := parseExtraLabels("A1,a,B2,b,C3,c,D4,d", false)
require.Nil(t, err)
require.Len(t, extraLabels, 4)
require.Equal(t, model.LabelValue("a"), extraLabels["__extra_A1"])
@ -17,19 +18,19 @@ func TestLambdaPromtail_ExtraLabelsValid(t *testing.T) {
}
func TestLambdaPromtail_ExtraLabelsMissingValue(t *testing.T) {
extraLabels, err := parseExtraLabels("A,a,B,b,C,c,D")
extraLabels, err := parseExtraLabels("A,a,B,b,C,c,D",false)
require.Nil(t, extraLabels)
require.Errorf(t, err, invalidExtraLabelsError)
}
func TestLambdaPromtail_ExtraLabelsInvalidNames(t *testing.T) {
extraLabels, err := parseExtraLabels("A!,%a,B?,$b,C-,c^")
extraLabels, err := parseExtraLabels("A!,%a,B?,$b,C-,c^", false)
require.Nil(t, extraLabels)
require.Errorf(t, err, "invalid name \"__extra_A!\"")
}
func TestLambdaPromtail_TestParseLabelsNoneProvided(t *testing.T) {
extraLabels, err := parseExtraLabels("")
extraLabels, err := parseExtraLabels("", false)
require.Len(t, extraLabels, 0)
require.Nil(t, err)
}

@ -4,7 +4,6 @@ import (
"bufio"
"bytes"
"context"
"crypto/tls"
"fmt"
"io"
"net/http"
@ -12,6 +11,7 @@ import (
"strings"
"time"
"github.com/go-kit/log/level"
"github.com/gogo/protobuf/proto"
"github.com/golang/snappy"
"github.com/grafana/dskit/backoff"
@ -38,6 +38,7 @@ type entry struct {
type batch struct {
streams map[string]*logproto.Stream
size int
client Client
}
type batchIf interface {
@ -47,9 +48,10 @@ type batchIf interface {
flushBatch(ctx context.Context) error
}
func newBatch(ctx context.Context, entries ...entry) (*batch, error) {
func newBatch(ctx context.Context, pClient Client, entries ...entry) (*batch, error) {
b := &batch{
streams: map[string]*logproto.Stream{},
client: pClient,
}
for _, entry := range entries {
@ -123,34 +125,37 @@ func (b *batch) createPushRequest() (*logproto.PushRequest, int) {
}
func (b *batch) flushBatch(ctx context.Context) error {
err := sendToPromtail(ctx, b)
err := b.client.sendToPromtail(ctx, b)
if err != nil {
return err
}
b.streams = make(map[string]*logproto.Stream)
b.resetBatch()
return nil
}
func sendToPromtail(ctx context.Context, b *batch) error {
func (b *batch) resetBatch() {
b.streams = make(map[string]*logproto.Stream)
b.size = 0
}
func (c *promtailClient) sendToPromtail(ctx context.Context, b *batch) error {
buf, _, err := b.encode()
if err != nil {
return err
}
backoff := backoff.New(ctx, backoff.Config{minBackoff, maxBackoff, maxRetries})
backoff := backoff.New(ctx, *c.config.backoff)
var status int
for {
// send uses `timeout` internally, so `context.Background` is good enough.
status, err = send(context.Background(), buf)
status, err = c.send(context.Background(), buf)
// Only retry 429s, 500s and connection-level errors.
if status > 0 && status != 429 && status/100 != 5 {
break
}
fmt.Printf("error sending batch, will retry, status: %d error: %s\n", status, err)
level.Error(*c.log).Log("err", fmt.Errorf("error sending batch, will retry, status: %d error: %s\n", status, err))
backoff.Wait()
// Make sure it sends at least once before checking for retry.
@ -160,15 +165,15 @@ func sendToPromtail(ctx context.Context, b *batch) error {
}
if err != nil {
fmt.Printf("Failed to send logs! %s\n", err)
level.Error(*c.log).Log("err", fmt.Errorf("Failed to send logs! %s\n", err))
return err
}
return nil
}
func send(ctx context.Context, buf []byte) (int, error) {
ctx, cancel := context.WithTimeout(ctx, timeout)
func (c *promtailClient) send(ctx context.Context, buf []byte) (int, error) {
ctx, cancel := context.WithTimeout(ctx, c.config.http.timeout)
defer cancel()
req, err := http.NewRequest("POST", writeAddress.String(), bytes.NewReader(buf))
@ -190,17 +195,11 @@ func send(ctx context.Context, buf []byte) (int, error) {
req.Header.Set("Authorization", "Bearer "+bearerToken)
}
promtailClient := &http.Client{}
if skipTlsVerify == true {
promtailClient = &http.Client{Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}}
}
resp, err := promtailClient.Do(req.WithContext(ctx))
resp, err := c.http.Do(req.WithContext(ctx))
if err != nil {
return -1, err
}
defer resp.Body.Close()
if resp.StatusCode/100 != 2 {
scanner := bufio.NewScanner(io.LimitReader(resp.Body, maxErrMsgLen))
line := ""

@ -0,0 +1,53 @@
package main
import (
"context"
"crypto/tls"
"net/http"
"net/url"
"time"
"github.com/go-kit/log"
"github.com/grafana/dskit/backoff"
)
// Client is the minimal interface the event processors (S3, CloudWatch,
// Kinesis, SQS) need to push a finished batch of log entries to Loki.
type Client interface {
sendToPromtail(ctx context.Context, b *batch) error
}
// Implements Client.
// promtailClient bundles the retry/HTTP configuration, the http.Client
// used for push requests, and the logger shared with the processors.
type promtailClient struct {
config *promtailClientConfig
http *http.Client
log *log.Logger
}
// promtailClientConfig carries the retry backoff settings, the HTTP client
// settings, and a target URL for a promtailClient.
// NOTE(review): url is not read by the constructors in this file — confirm
// whether the write address still comes from package-level state.
type promtailClientConfig struct {
backoff *backoff.Config
http *httpClientConfig
url *url.URL
}
// httpClientConfig controls construction of the push HTTP client: the
// per-request timeout and whether TLS certificate verification is skipped.
type httpClientConfig struct {
timeout time.Duration
skipTlsVerify bool
}
// NewPromtailClient wires a promtailClient from the given configuration
// and logger, building the underlying HTTP client from cfg.http.
func NewPromtailClient(cfg *promtailClientConfig, log *log.Logger) *promtailClient {
	client := &promtailClient{
		log:    log,
		config: cfg,
		http:   NewHTTPClient(cfg.http),
	}
	return client
}
// NewHTTPClient builds the http.Client used for push requests. The request
// timeout comes from cfg; when cfg.skipTlsVerify is set, a dedicated
// transport that accepts any server certificate is used, otherwise the
// process-wide default transport is shared.
func NewHTTPClient(cfg *httpClientConfig) *http.Client {
	var rt http.RoundTripper = http.DefaultTransport
	if cfg.skipTlsVerify {
		// Insecure by explicit operator choice (SKIP_TLS_VERIFY).
		rt = &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		}
	}
	return &http.Client{
		Transport: rt,
		Timeout:   cfg.timeout,
	}
}

@ -10,6 +10,8 @@ import (
"time"
"github.com/aws/aws-lambda-go/events"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/grafana/loki/pkg/logproto"
"github.com/prometheus/common/model"
@ -38,33 +40,20 @@ const (
LB_LOG_TYPE string = "elasticloadbalancing"
)
func getS3Object(ctx context.Context, labels map[string]string) (io.ReadCloser, error) {
func getS3Client(ctx context.Context, region string) (*s3.Client, error) {
var s3Client *s3.Client
if c, ok := s3Clients[labels["bucket_region"]]; ok {
if c, ok := s3Clients[region]; ok {
s3Client = c
} else {
cfg, err := config.LoadDefaultConfig(ctx, config.WithRegion(labels["bucket_region"]))
cfg, err := config.LoadDefaultConfig(ctx, config.WithRegion(region))
if err != nil {
return nil, err
}
s3Client = s3.NewFromConfig(cfg)
s3Clients[labels["bucket_region"]] = s3Client
s3Clients[region] = s3Client
}
obj, err := s3Client.GetObject(ctx,
&s3.GetObjectInput{
Bucket: aws.String(labels["bucket"]),
Key: aws.String(labels["key"]),
ExpectedBucketOwner: aws.String(labels["bucketOwner"]),
})
if err != nil {
fmt.Printf("Failed to get object %s from bucket %s on account %s\n", labels["key"], labels["bucket"], labels["bucketOwner"])
return nil, err
}
return obj.Body, nil
return s3Client, nil
}
func parseS3Log(ctx context.Context, b *batch, labels map[string]string, obj io.ReadCloser) error {
@ -142,31 +131,37 @@ func getLabels(record events.S3EventRecord) (map[string]string, error) {
return labels, nil
}
func processS3Event(ctx context.Context, ev *events.S3Event) error {
batch, err := newBatch(ctx)
func processS3Event(ctx context.Context, ev *events.S3Event, pc Client, log *log.Logger) error {
batch, err := newBatch(ctx, pc)
if err != nil {
return err
}
for _, record := range ev.Records {
labels, err := getLabels(record)
if err != nil {
return err
}
obj, err := getS3Object(ctx, labels)
level.Info(*log).Log("msg", fmt.Sprintf("fetching s3 file: %s", labels["key"]))
s3Client, err := getS3Client(ctx, labels["bucket_region"])
if err != nil {
return err
}
err = parseS3Log(ctx, batch, labels, obj)
obj, err := s3Client.GetObject(ctx,
&s3.GetObjectInput{
Bucket: aws.String(labels["bucket"]),
Key: aws.String(labels["key"]),
ExpectedBucketOwner: aws.String(labels["bucketOwner"]),
})
if err != nil {
return fmt.Errorf("Failed to get object %s from bucket %s on account %s\n, %s", labels["key"], labels["bucket"], labels["bucketOwner"], err)
}
err = parseS3Log(ctx, batch, labels, obj.Body)
if err != nil {
return err
}
}
err = sendToPromtail(ctx, batch)
err = pc.sendToPromtail(ctx, batch)
if err != nil {
return err
}

@ -0,0 +1,32 @@
package main
import (
"context"
"encoding/json"
"github.com/aws/aws-lambda-go/events"
)
// processSQSEvent unwraps each SQS message in evt: the message body is
// decoded as a raw JSON event and handed back to handler, which dispatches
// it to the matching processor (S3, CloudWatch, Kinesis, ...). Processing
// stops at the first failing record, so the Lambda invocation fails and
// the batch can be retried or sent to a dead-letter queue.
func processSQSEvent(ctx context.Context, evt *events.SQSEvent) error {
	for _, msg := range evt.Records {
		// Retrieve the nested event carried in the SQS message body.
		nested, err := stringToRawEvent(msg.Body)
		if err != nil {
			return err
		}
		if err := handler(ctx, nested); err != nil {
			return err
		}
	}
	return nil
}
// stringToRawEvent decodes body as JSON into a generic map so the payload
// can be routed through checkEventType. Returns a nil map and an error
// when body is not valid JSON.
func stringToRawEvent(body string) (map[string]interface{}, error) {
	raw := make(map[string]interface{})
	if err := json.Unmarshal([]byte(body), &raw); err != nil {
		return nil, err
	}
	return raw, nil
}

@ -0,0 +1,68 @@
package main
import (
"testing"
"github.com/aws/aws-lambda-go/events"
)
// TestStringToRawEvent verifies that a realistic SQS message body — here an
// S3 ObjectCreated:Put notification embedded as JSON — decodes via
// stringToRawEvent and is then recognized by checkEventType. The fixture
// mirrors the nested-event shape lambda-promtail receives when S3
// notifications are delivered through an SQS queue.
func TestStringToRawEvent(t *testing.T) {
tc := &events.SQSEvent{
Records: []events.SQSMessage{
{
AWSRegion: "eu-west-3",
MessageId: "someID",
Body: `{
"Records": [
{
"eventVersion": "2.1",
"eventSource": "aws:s3",
"awsRegion": "us-east-1",
"eventTime": "2023-01-18T01:52:46.432Z",
"eventName": "ObjectCreated:Put",
"userIdentity": {
"principalId": "AWS:SOMEID:AWS.INTERNAL.URL"
},
"requestParameters": {
"sourceIPAddress": "172.15.15.15"
},
"responseElements": {
"x-amz-request-id": "SOMEID",
"x-amz-id-2": "SOMEID"
},
"s3": {
"s3SchemaVersion": "1.0",
"configurationId": "tf-s3-queue-SOMEID",
"bucket": {
"name": "some-bucket-name",
"ownerIdentity": {
"principalId": "SOMEID"
},
"arn": "arn:aws:s3:::SOME-BUCKET-ARN"
},
"object": {
"key": "SOME-PREFIX/AWSLogs/ACCOUNT-ID/vpcflowlogs/us-east-1/2023/01/18/end-of-filename.log.gz",
"size": 1042577,
"eTag": "SOMEID",
"versionId": "SOMEID",
"sequencer": "SOMEID"
}
}
}
]
}`,
},
},
}
for _, record := range tc.Records {
// The body must decode into a generic JSON map...
event, err := stringToRawEvent(record.Body)
if err != nil {
t.Error(err)
}
// ...and that map must be classified as a known event type.
_, err = checkEventType(event)
if err != nil {
t.Error(err)
}
}
}
Loading…
Cancel
Save