loki/pkg/pattern/stream.go

package pattern

import (
	"context"
	"sync"
	"time"

	"github.com/grafana/loki/v3/pkg/logproto"
	"github.com/grafana/loki/v3/pkg/pattern/drain"
	"github.com/grafana/loki/v3/pkg/pattern/iter"

	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/model/labels"
)

// TODO(kolesnikovae):
//
// Token preprocessing is crucial for Drain: to ensure that the first
// LogClusterDepth tokens are constant (see
// https://jiemingzhu.github.io/pub/pjhe_icws2017.pdf), we should strip any
// variables such as timestamps, IDs, IPs, counters, etc. from these tokens.
//
// Moreover, Drain is not designed for structured logs, so we should handle
// logfmt (and probably JSON) logs in a special way: the parse tree should
// have a fixed length, its depth should be determined by the number of
// fields in the logfmt message, and a separate parse tree should be
// maintained for each unique field set.
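
// drainConfig is the Drain configuration shared by all pattern streams.
// With ParamString "<_>", variable tokens are rendered as placeholders, so a
// line such as
//   ts=2024-01-01 msg=done duration=5ms
// may end up in a cluster whose pattern reads
//   ts=<_> msg=done duration=<_>
// (an illustrative example, not captured from real output).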
var drainConfig = &drain.Config{
	// At training, if at the depth of LogClusterDepth there is a cluster with
	// a similarity coefficient greater than SimTh, then the log message is
	// added to that cluster. Otherwise, a new cluster is created.
	//
	// LogClusterDepth should be equal to the number of constant tokens from
	// the beginning of the message that likely determine the message contents.
	//
	// > In this step, Drain traverses from a 1-st layer node, which
	// > is searched in step 2, to a leaf node. This step is based on
	// > the assumption that tokens in the beginning positions of a log
	// > message are more likely to be constants. Specifically, Drain
	// > selects the next internal node by the tokens in the beginning
	// > positions of the log message.
	LogClusterDepth: 8,
	// SimTh is basically the ratio of matching tokens to total tokens in
	// the cluster:
	//   Cluster tokens: "foo <*> bar fred"
	//         Log line: "foo bar baz qux"
	//                     *   *   *   x
	// Similarity of these sequences is 0.75.
	//
	// Both SimTh and LogClusterDepth impact the branching factor: the greater
	// LogClusterDepth and SimTh, the lower the chance that there will be
	// "similar" clusters, but the greater the footprint.
	SimTh:       0.3,
	MaxChildren: 100,
	ParamString: "<_>",
	MaxClusters: 300,
}
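
// stream tracks the log patterns observed for a single log stream,
// identified by its label set and fingerprint.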
type stream struct {
	fp           model.Fingerprint
	labels       labels.Labels
	labelsString string
	labelHash    uint64
	patterns     *drain.Drain
	mtx          sync.Mutex
	lastTs       int64 // timestamp of the newest accepted entry, in nanoseconds
}
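
// newStream creates a pattern stream for the given fingerprint and labels,
// backed by a fresh Drain instance.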
func newStream(
	fp model.Fingerprint,
	labels labels.Labels,
) (*stream, error) {
	return &stream{
		fp:           fp,
		labels:       labels,
		labelsString: labels.String(),
		labelHash:    labels.Hash(),
		patterns:     drain.New(drainConfig),
	}, nil
}
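
// Push trains the stream's Drain instance on the given entries. Entries
// older than the newest entry seen so far are dropped, so out-of-order
// writes are ignored.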
func (s *stream) Push(
	_ context.Context,
	entries []logproto.Entry,
) error {
	s.mtx.Lock()
	defer s.mtx.Unlock()
	for _, entry := range entries {
		// Skip out-of-order entries.
		if entry.Timestamp.UnixNano() < s.lastTs {
			continue
		}
		s.lastTs = entry.Timestamp.UnixNano()
		s.patterns.Train(entry.Line, entry.Timestamp.UnixNano())
	}
	return nil
}
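
// Iterator returns a merged iterator over the pattern samples of all
// clusters within the [from, through] time range. Clusters that render to
// an empty pattern string are skipped.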
func (s *stream) Iterator(_ context.Context, from, through model.Time) (iter.Iterator, error) {
	// TODO: we should improve locking.
	s.mtx.Lock()
	defer s.mtx.Unlock()
	clusters := s.patterns.Clusters()
	iters := make([]iter.Iterator, 0, len(clusters))
	for _, cluster := range clusters {
		if cluster.String() == "" {
			continue
		}
		iters = append(iters, cluster.Iterator(from, through))
	}
	return iter.NewMerge(iters...), nil
}
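
// prune drops samples older than olderThan from every cluster and deletes
// clusters that become empty. It reports whether the stream is left with no
// clusters at all, i.e. whether the stream itself can be discarded.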
func (s *stream) prune(olderThan time.Duration) bool {
	s.mtx.Lock()
	defer s.mtx.Unlock()
	clusters := s.patterns.Clusters()
	for _, cluster := range clusters {
		cluster.Prune(olderThan)
		if cluster.Size == 0 {
			s.patterns.Delete(cluster)
		}
	}
	return len(s.patterns.Clusters()) == 0
}