Like Prometheus, but for logs.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
loki/integration/loki_rule_eval_test.go

181 lines
5.2 KiB

//go:build integration
package integration
import (
	"context"
	"fmt"
	"net/http"
	"net/http/httptest"
	"sync/atomic"
	"testing"
	"time"

	"github.com/prometheus/prometheus/storage/remote"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/grafana/loki/v3/integration/client"
	"github.com/grafana/loki/v3/integration/cluster"
	"github.com/grafana/loki/v3/pkg/ruler"
)
// TestLocalRuleEval verifies that rules evaluated locally, with the embedded
// query engine, produce results that are remote-written to the backend correctly.
func TestLocalRuleEval(t *testing.T) {
	testRuleEval(t, ruler.EvalModeLocal)
}
// TestRemoteRuleEval verifies that rules evaluated remotely, against a
// configured query-frontend, produce results that are remote-written to the
// backend correctly.
func TestRemoteRuleEval(t *testing.T) {
	testRuleEval(t, ruler.EvalModeRemote)
}
// The only way we can test rule evaluation in an integration test is to use the remote-write feature.
// In this test we stub out a remote-write receiver and check that the expected data is sent to it.
// Both the local and the remote rule evaluation modes should produce the same result.
func testRuleEval(t *testing.T, mode string) {
clu := cluster.New(nil, cluster.SchemaWithTSDB, func(c *cluster.Cluster) {
c.SetSchemaVer("v13")
})
t.Cleanup(func() {
assert.NoError(t, clu.Cleanup())
})
// initialise a write component and ingest some logs
tWrite := clu.AddComponent(
"write",
"-target=write",
)
now := time.Now()
tenantID := randStringRunes()
require.NoError(t, clu.Run())
job := "accesslog"
cliWrite := client.New(tenantID, "", tWrite.HTTPURL())
cliWrite.Now = now
// 1. Ingest some logs
ingestion: native otlp ingestion support (#10727) **What this PR does / why we need it**: Add support for natively supporting logs ingestion in OTLP format. `/otlp/v1/logs` is the new endpoint where users can push logs in OTLP format. It accepts logs serialized in JSON or proto format. Since OTEL format is very different than what Loki storage model, here is how data in OTEL format will be mapped to Loki data model: * Index labels: The Resource Attributes map quite well to Index labels in Loki since both usually identify the source of the logs. The problem however is that Resource attributes in OTLP can have an unbounded number of values while Loki has a default limit of having up to 30 labels. Since Index labels in Loki can largely drive the kind of querying experience the users are going to have, we have chosen select attributes which would be picked as Index Labels. The ones that are not picked up as Index labels would be stored as Structured Metadata with each log entry. * Timestamp: LogRecord.TimeUnixNano * LogLine: LogRecord.Body holds the body of the log. However, since Loki only supports Log body in string format, we will stringify non-string values using [AsString method from OTEL collector lib](https://github.com/open-telemetry/opentelemetry-collector/blob/ab3d6c5b64701e690aaa340b0a63f443ff22c1f0/pdata/pcommon/value.go#L353). * Structured Metadata: Anything which can’t be stored in Index labels and LogLine. Here is a non-exhaustive list of what will be stored in Structured Metadata to give a sense of what it will hold: * Resource Attributes not stored as Index labels is replicated and stored with each log entry. * Everything under InstrumentationScope is replicated and stored with each log entry. * Everything under LogRecord except LogRecord.Body, LogRecord.TimeUnixNano and sometimes LogRecord.ObservedTimestamp. 
*NOTES*: * Since Loki does not support `.` or any other special characters other than `_` in label names, we replace all non-supported characters with `_`. * Since Loki only supports string in values of Index Labels and Structured Metadata, all the complex types are converted as follows: * Map would be flattened into label keys using `_` as separator, same as how we do it in [json parser in LogQL](https://grafana.com/docs/loki/latest/query/log_queries/#json). * Everything else is stringified using [AsString method from OTEL collector lib](https://github.com/open-telemetry/opentelemetry-collector/blob/ab3d6c5b64701e690aaa340b0a63f443ff22c1f0/pdata/pcommon/value.go#L353) **Special notes for your reviewer**: I will open follow-up PRs for: * Documentation * Make blessed attributes list configurable per tenant. **Checklist** - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label
2 years ago
require.NoError(t, cliWrite.PushLogLine("HEAD /", now, nil, map[string]string{"method": "HEAD", "job": job}))
require.NoError(t, cliWrite.PushLogLine("GET /", now, nil, map[string]string{"method": "GET", "job": job}))
require.NoError(t, cliWrite.PushLogLine("GET /", now.Add(time.Second), nil, map[string]string{"method": "GET", "job": job}))
// advance time to after the last ingested log line so queries don't return empty results
now = now.Add(time.Second * 2)
// start up read component for remote rule evaluation
tRead := clu.AddComponent(
"read",
"-target=read",
// we set a fake address here because deletion is not being tested,
// and we have a circular dependency with the backend
"-common.compactor-address=http://fake",
"-legacy-read-mode=false",
query-scheduler: fix query distribution in SSD mode (#9471) **What this PR does / why we need it**: When we run the `query-scheduler` in `ring` mode, `queriers` and `query-frontend` discover the available `query-scheduler` instances using the ring. However, we have a problem when `query-schedulers` are not running in the same process as queriers and query-frontend since [we try to get the ring client interface from the scheduler instance](https://github.com/grafana/loki/blob/abd6131bba18db7f3575241c5e6dc4eed879fbc0/pkg/loki/modules.go#L358). This causes queries not to be spread across all the available queriers when running in SSD mode because [we point querier workers to query frontend when there is no ring client and scheduler address configured](https://github.com/grafana/loki/blob/b05f4fced305800b32641ae84e3bed5f1794fa7d/pkg/querier/worker_service.go#L115). I have fixed this issue by adding a new hidden target to initialize the ring client in `reader`/`member` mode based on which service is initializing it. `reader` mode will be used by `queriers` and `query-frontend` for discovering `query-scheduler` instances from the ring. `member` mode will be used by `query-schedulers` for registering themselves in the ring. I have also made a couple of changes not directly related to the issue but it fixes some problems: * [reset metric registry for each integration test](https://github.com/grafana/loki/commit/18c4fe59078b649ad6a788a48765b101d0b97618) - Previously we were reusing the same registry for all the tests and just [ignored the attempts to register same metrics](https://github.com/grafana/loki/blob/01f0ded7fcb57e3a7b26ffc1e8e3abf04a403825/integration/cluster/cluster.go#L113). This causes the registry to have metrics registered only from the first test so any updates from subsequent tests won't reflect in the metrics. 
metrics was the only reliable way for me to verify that `query-schedulers` were connected to `queriers` and `query-frontend` when running in ring mode in the integration test that I added to test my changes. This should also help with other tests where earlier it was hard to reliably check the metrics. * [load config from cli as well before applying dynamic config](https://github.com/grafana/loki/commit/f9e2448fc7e718db107165cd908054c806b84337) - Previously we were applying dynamic config considering just the config from config file. This results in unexpected config changes, for example, [this config change](https://github.com/grafana/loki/blob/4148dd2c51cb827ec3889298508b95ec7731e7fd/integration/loki_micro_services_test.go#L66) was getting ignored and [dynamic config tuning was unexpectedly turning on ring mode](https://github.com/grafana/loki/blob/52cd0a39b8266564352c61ab9b845ab597008770/pkg/loki/config_wrapper.go#L94) in the config. It is better to do any config tuning based on both file and cli args configs. **Which issue(s) this PR fixes**: Fixes #9195
2 years ago
"-query-scheduler.use-scheduler-ring=false",
)
require.NoError(t, clu.Run())
// start up a backend component which contains the ruler
tBackend := clu.AddComponent(
"backend",
"-target=backend",
"-legacy-read-mode=false",
)
rwHandler := func(called *bool, test func(w http.ResponseWriter, r *http.Request)) *httptest.Server {
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/api/v1/write" {
t.Errorf("Expected to request '/api/v1/write', got: %s", r.URL.Path)
}
test(w, r)
*called = true
w.WriteHeader(http.StatusOK)
}))
}
// this is the function that will be called when the remote-write receiver receives a request.
// it tests that the expected payload is received.
expectedResults := func(_ http.ResponseWriter, r *http.Request) {
wr, err := remote.DecodeWriteRequest(r.Body)
require.NoError(t, err)
// depending on the rule interval, we may get multiple timeseries before remote-write is triggered,
// so we just check that we have at least one that matches our requirements.
require.GreaterOrEqual(t, len(wr.Timeseries), 1)
// we expect to see two GET lines from the aggregation in the recording rule
require.Equal(t, wr.Timeseries[len(wr.Timeseries)-1].Samples[0].Value, float64(2))
}
var called bool
server1 := rwHandler(&called, expectedResults)
defer server1.Close()
// configure the backend component
tBackend.WithRulerRemoteWrite("target1", server1.URL)
if mode == ruler.EvalModeRemote {
tBackend.WithExtraConfig(fmt.Sprintf(`
ruler:
evaluation:
mode: %s
query_frontend:
address: %s
`, mode, tRead.GRPCURL()))
}
record := fmt.Sprintf(`
groups:
- name: record
interval: 1s
rules:
- record: test
expr: sum by (method) (count_over_time({job="%s", method="GET"}[1m]))
labels:
foo: bar
`, job)
require.NoError(t, tBackend.WithTenantRules(map[string]map[string]string{
tenantID: {
"record.yaml": record,
},
}))
m, e := tBackend.MergedConfig()
require.NoError(t, e)
t.Logf("starting backend with config:\n%s\n", m)
require.NoError(t, clu.Run())
cliBackend := client.New(tenantID, "", tBackend.HTTPURL())
cliBackend.Now = now
// 2. Assert rules evaluation
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
// check rules exist
resp, err := cliBackend.GetRules(ctx)
require.NoError(t, err)
require.NotNil(t, resp)
require.Equal(t, "success", resp.Status)
require.Len(t, resp.Data.Groups, 1)
require.Len(t, resp.Data.Groups[0].Rules, 1)
// ensure that both remote-write receivers were called
require.Eventually(t, func() bool {
return assert.ObjectsAreEqualValues(true, called)
}, 30*time.Second, 100*time.Millisecond, "remote-write was not called")
}