Like Prometheus, but for logs.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 
loki/pkg/ui/cluster.go

351 lines
10 KiB

package ui
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"sync"
"github.com/grafana/ckit/peer"
"golang.org/x/sync/errgroup"
"gopkg.in/yaml.v3"
"github.com/grafana/loki/v3/pkg/analytics"
)
// Cluster represents a collection of cluster members.
// fetchClusterMembers fills Members with one entry per peer, keyed by the
// peer's name.
type Cluster struct {
// Members maps a peer name to that peer's collected state; it is serialized
// under the "members" JSON key.
Members map[string]Member `json:"members"`
}
// Member represents a node in the cluster with its current state and capabilities.
// fetchMemberState populates it from the peer record and from the node's
// config, services, build-info and readiness endpoints.
type Member struct {
// Addr is copied from the peer's Addr field.
Addr string `json:"addr"`
// State is the string form of the peer's State.
State string `json:"state"`
// IsSelf is copied from the peer's Self flag.
IsSelf bool `json:"isSelf"`
// Target is the "target" value parsed out of the node's YAML config.
Target string `json:"target"`
// Services lists the service states reported by the node.
Services []ServiceState `json:"services"`
// Build is the build information reported by the node.
Build BuildInfo `json:"build"`
// Error is set by fetchClusterMembers when fetching this member's state failed.
// NOTE(review): encoding/json encodes an error value by its dynamic type, so
// errors built with fmt.Errorf presumably serialize as {} rather than as
// their message — confirm, and consider exposing a string instead.
Error error `json:"error,omitempty"`
// Ready is the result of the node's readiness check.
// NOTE(review): omitempty presumably has no effect on a struct-typed field,
// so "ready" is likely always emitted — confirm.
Ready ReadyResponse `json:"ready,omitempty"`
// configBody keeps the raw config text fetched from the node. It is
// unexported, so encoding/json does not include it in the output.
configBody string
}
// ServiceState represents the current state of a service running on a member.
// parseServices builds one value per "service => status" line.
type ServiceState struct {
// Service is the text before the " => " separator.
Service string `json:"service"`
// Status is the text after the " => " separator.
Status string `json:"status"`
}
// BuildInfo contains version and build information about a member.
// fetchBuild decodes it from the JSON body returned by the member's
// /loki/api/v1/status/buildinfo endpoint, using the JSON keys in the tags below.
type BuildInfo struct {
Version string `json:"version"`
Revision string `json:"revision"`
Branch string `json:"branch"`
BuildUser string `json:"buildUser"`
BuildDate string `json:"buildDate"`
GoVersion string `json:"goVersion"`
}
// fetchClusterMembers collects the state of every peer known to the node and
// returns them keyed by peer name.
//
// Peers are queried concurrently through an errgroup capped at 16 in-flight
// fetches. A failure to fetch one member does not fail the whole call: the
// error is recorded on that member's Error field and the member is still
// added to the result.
func (s *Service) fetchClusterMembers(ctx context.Context) (Cluster, error) {
	result := Cluster{Members: make(map[string]Member)}

	group, groupCtx := errgroup.WithContext(ctx)
	group.SetLimit(16)

	// membersMu guards result.Members, which is written from several goroutines.
	var membersMu sync.Mutex
	store := func(name string, m Member) {
		membersMu.Lock()
		defer membersMu.Unlock()
		result.Members[name] = m
	}

	for _, current := range s.node.Peers() {
		current := current // per-iteration copy captured by the closure below
		group.Go(func() error {
			state, fetchErr := s.fetchMemberState(groupCtx, current)
			if fetchErr != nil {
				state.Error = fetchErr
			}
			store(current.Name, state)
			// Per-member failures are reported on the member, never to the group.
			return nil
		})
	}

	if waitErr := group.Wait(); waitErr != nil {
		return Cluster{}, fmt.Errorf("fetching cluster members: %w", waitErr)
	}
	return result, nil
}
// fetchMemberState gathers everything known about one peer: its address and
// state from the peer record, then its config (and the target parsed from it),
// services, build info and readiness, fetched in that order.
//
// The first failing step aborts the sequence. The Member returned alongside
// the error still carries whatever was collected before the failure.
func (s *Service) fetchMemberState(ctx context.Context, peer peer.Peer) (Member, error) {
	state := Member{
		IsSelf: peer.Self,
		Addr:   peer.Addr,
		State:  peer.State.String(),
	}

	cfgBody, err := s.fetchConfig(ctx, peer)
	if err != nil {
		return state, fmt.Errorf("fetching config: %w", err)
	}
	state.configBody = cfgBody
	state.Target = parseTargetFromConfig(cfgBody)

	svcs, err := s.fetchServices(ctx, peer)
	if err != nil {
		return state, fmt.Errorf("fetching services: %w", err)
	}
	state.Services = svcs

	info, err := s.fetchBuild(ctx, peer)
	if err != nil {
		return state, fmt.Errorf("fetching build info: %w", err)
	}
	state.Build = info

	readiness, err := s.checkNodeReadiness(ctx, peer.Name)
	if err != nil {
		return state, fmt.Errorf("checking node readiness: %w", err)
	}
	state.Ready = readiness

	return state, nil
}
// buildProxyPath returns the URL used to reach endpoint on the given peer
// through this service's proxy route: the local address, the proxy prefix,
// the peer name, then endpoint appended verbatim.
func (s *Service) buildProxyPath(peer peer.Peer, endpoint string) string {
	// todo support configured server prefix.
	const proxyURLFormat = "http://%s/ui/api/v1/proxy/%s%s"
	return fmt.Sprintf(proxyURLFormat, s.localAddr, peer.Name, endpoint)
}
// readResponseError turns an unsuccessful HTTP response into an error labelled
// with operation.
//
// A nil response yields a "no response received" error. A 200 OK response
// yields nil and leaves the body untouched for the caller to read and close.
// For any other status the body is read and closed here, and its content (or
// the read failure) is included in the returned error together with the
// response status.
func readResponseError(resp *http.Response, operation string) error {
	switch {
	case resp == nil:
		return fmt.Errorf("%s: no response received", operation)
	case resp.StatusCode == http.StatusOK:
		return nil
	}

	defer resp.Body.Close()
	payload, readErr := io.ReadAll(resp.Body)
	if readErr != nil {
		return fmt.Errorf("%s failed: %s, error reading body: %v", operation, resp.Status, readErr)
	}
	return fmt.Errorf("%s failed: %s, response: %s", operation, resp.Status, string(payload))
}
// NodeDetails contains the details of a node in the cluster.
// It embeds Member (address, state, target, services, build, readiness) and
// adds the raw config, clusterID, clusterSeededAt, os, arch, edition and
// registered analytics metrics. fetchSelfDetails fills the added fields from
// the member's config body and the node's analytics report.
type NodeDetails struct {
Member
// Config is the raw config text fetched from the node.
Config string `json:"config"`
// ClusterID is taken from the analytics report.
ClusterID string `json:"clusterID"`
// ClusterSeededAt is the analytics report's CreatedAt, converted with UnixMilli.
ClusterSeededAt int64 `json:"clusterSeededAt"`
// OS, Arch and Edition are copied from the analytics report.
OS string `json:"os"`
Arch string `json:"arch"`
Edition string `json:"edition"`
// Metrics is the analytics report's Metrics map, passed through unchanged.
Metrics map[string]interface{} `json:"metrics"`
}
// fetchSelfDetails builds the NodeDetails for the local node.
//
// It locates the peer flagged as self, fetches that peer's analytics report
// and then its member state, and merges both into a NodeDetails. Any failure
// (no self peer, analytics fetch, member state fetch) returns an empty
// NodeDetails together with the error.
func (s *Service) fetchSelfDetails(ctx context.Context) (NodeDetails, error) {
	self, found := s.getSelfPeer()
	if !found {
		return NodeDetails{}, fmt.Errorf("self peer not found")
	}

	usage, err := s.fetchAnalytics(ctx, self)
	if err != nil {
		return NodeDetails{}, fmt.Errorf("fetching analytics: %w", err)
	}

	state, err := s.fetchMemberState(ctx, self)
	if err != nil {
		return NodeDetails{}, fmt.Errorf("fetching member state: %w", err)
	}

	details := NodeDetails{
		Member: state,
		Config: state.configBody,
	}
	// Everything below comes from the analytics report.
	details.ClusterID = usage.ClusterID
	details.ClusterSeededAt = usage.CreatedAt.UnixMilli()
	details.OS = usage.Os
	details.Arch = usage.Arch
	details.Edition = usage.Edition
	details.Metrics = usage.Metrics
	return details, nil
}
// getSelfPeer returns the first peer whose Self flag is set. The boolean is
// false, and the peer is the zero value, when no such peer exists.
func (s *Service) getSelfPeer() (peer.Peer, bool) {
	for _, candidate := range s.node.Peers() {
		if !candidate.Self {
			continue
		}
		return candidate, true
	}
	return peer.Peer{}, false
}
// fetchAnalytics retrieves the analytics report of a cluster member.
//
// It issues a GET for /ui/api/v1/analytics on the peer through the proxy path
// and decodes the JSON body into an analytics.Report. A non-200 response is
// reported via readResponseError under the "fetch analytics" label. On any
// failure an empty report is returned with the error.
func (s *Service) fetchAnalytics(ctx context.Context, peer peer.Peer) (analytics.Report, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, s.buildProxyPath(peer, "/ui/api/v1/analytics"), nil)
	if err != nil {
		return analytics.Report{}, fmt.Errorf("creating request: %w", err)
	}
	resp, err := s.client.Do(req)
	if err != nil {
		return analytics.Report{}, fmt.Errorf("sending request: %w", err)
	}
	// The label must name this operation; it previously said "fetch build info",
	// which made analytics failures look like build-info failures.
	if err := readResponseError(resp, "fetch analytics"); err != nil {
		return analytics.Report{}, err
	}
	// readResponseError closes the body itself on a non-200 status, so the
	// deferred close is only needed on the success path.
	defer resp.Body.Close()
	var report analytics.Report
	if err := json.NewDecoder(resp.Body).Decode(&report); err != nil {
		return analytics.Report{}, fmt.Errorf("decoding response: %w", err)
	}
	return report, nil
}
// fetchConfig downloads the raw configuration text of a cluster member by
// issuing a GET for /config through the proxy path. It returns the response
// body as a string, or an empty string and an error when the request cannot
// be built or sent, the status is not 200, or the body cannot be read.
func (s *Service) fetchConfig(ctx context.Context, peer peer.Peer) (string, error) {
	target := s.buildProxyPath(peer, "/config")
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
	if err != nil {
		return "", fmt.Errorf("creating request: %w", err)
	}

	response, err := s.client.Do(request)
	if err != nil {
		return "", fmt.Errorf("sending request: %w", err)
	}
	if statusErr := readResponseError(response, "fetch config"); statusErr != nil {
		return "", statusErr
	}
	defer response.Body.Close()

	raw, err := io.ReadAll(response.Body)
	if err != nil {
		return "", fmt.Errorf("reading response: %w", err)
	}
	return string(raw), nil
}
// fetchServices downloads the service list of a cluster member by issuing a
// GET for /services through the proxy path, then hands the response text to
// parseServices. It returns nil and an error when the request cannot be built
// or sent, the status is not 200, or the body cannot be read.
func (s *Service) fetchServices(ctx context.Context, peer peer.Peer) ([]ServiceState, error) {
	target := s.buildProxyPath(peer, "/services")
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
	if err != nil {
		return nil, fmt.Errorf("creating request: %w", err)
	}

	response, err := s.client.Do(request)
	if err != nil {
		return nil, fmt.Errorf("sending request: %w", err)
	}
	if statusErr := readResponseError(response, "fetch services"); statusErr != nil {
		return nil, statusErr
	}
	defer response.Body.Close()

	raw, err := io.ReadAll(response.Body)
	if err != nil {
		return nil, fmt.Errorf("reading response: %w", err)
	}
	return parseServices(string(raw))
}
// fetchBuild downloads the build information of a cluster member by issuing a
// GET for /loki/api/v1/status/buildinfo through the proxy path and decoding
// the JSON body. On any failure it returns an empty BuildInfo and the error.
func (s *Service) fetchBuild(ctx context.Context, peer peer.Peer) (BuildInfo, error) {
	target := s.buildProxyPath(peer, "/loki/api/v1/status/buildinfo")
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
	if err != nil {
		return BuildInfo{}, fmt.Errorf("creating request: %w", err)
	}

	response, err := s.client.Do(request)
	if err != nil {
		return BuildInfo{}, fmt.Errorf("sending request: %w", err)
	}
	if statusErr := readResponseError(response, "fetch build info"); statusErr != nil {
		return BuildInfo{}, statusErr
	}
	defer response.Body.Close()

	var info BuildInfo
	decoder := json.NewDecoder(response.Body)
	if decodeErr := decoder.Decode(&info); decodeErr != nil {
		// Return an empty value so a partially decoded struct never leaks out.
		return BuildInfo{}, fmt.Errorf("decoding response: %w", decodeErr)
	}
	return info, nil
}
// ReadyResponse is the outcome of a readiness check against a node, as
// produced by checkNodeReadiness.
type ReadyResponse struct {
// IsReady is true only when the node answered 200 OK and the trimmed
// response body equals "ready".
IsReady bool `json:"isReady"`
// Message is the raw response body returned by the node's /ready endpoint.
Message string `json:"message"`
}
// checkNodeReadiness queries the /ready endpoint of the node named nodeName
// through the proxy path and reports whether the node is ready.
//
// The node counts as ready only when it answers 200 OK and the response body,
// with surrounding whitespace trimmed, equals "ready". The raw body is always
// returned in Message, so a non-200 answer is a successful check with
// IsReady == false rather than an error.
//
// An error is returned when the peer cannot be found by name, or when the
// request cannot be created or sent, or the body cannot be read. Those last
// three are wrapped with the same context messages the other fetch helpers in
// this file use.
func (s *Service) checkNodeReadiness(ctx context.Context, nodeName string) (ReadyResponse, error) {
	peer, err := s.findPeerByName(nodeName)
	if err != nil {
		return ReadyResponse{}, err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, s.buildProxyPath(peer, "/ready"), nil)
	if err != nil {
		return ReadyResponse{}, fmt.Errorf("creating request: %w", err)
	}
	resp, err := s.client.Do(req)
	if err != nil {
		return ReadyResponse{}, fmt.Errorf("sending request: %w", err)
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return ReadyResponse{}, fmt.Errorf("reading response: %w", err)
	}
	message := string(body)
	return ReadyResponse{
		IsReady: resp.StatusCode == http.StatusOK && strings.TrimSpace(message) == "ready",
		Message: message,
	}, nil
}
// parseTargetFromConfig extracts the target value from a YAML configuration string.
// Returns "unknown" if the config cannot be parsed, or if it has no top-level
// "target" key holding a string value (this includes an empty config).
func parseTargetFromConfig(config string) string {
	const unknownTarget = "unknown"

	var cfg map[string]interface{}
	if err := yaml.Unmarshal([]byte(config), &cfg); err != nil {
		return unknownTarget
	}
	// Indexing a nil map is safe and yields nil, so a config that decodes to
	// nothing falls through to the not-found case below.
	target, ok := cfg["target"].(string)
	if !ok {
		// Previously a missing or non-string target produced "", which
		// contradicted the documented "unknown" fallback.
		return unknownTarget
	}
	return target
}
// parseServices converts a newline-separated listing of the form
//
//	service => status
//
// into a slice of ServiceState. Each line is split at the first " => ";
// lines without that separator are skipped. The returned error is always nil.
// When no line matches, the returned slice is nil.
func parseServices(body string) ([]ServiceState, error) {
	var parsed []ServiceState
	for _, entry := range strings.Split(body, "\n") {
		name, status, found := strings.Cut(entry, " => ")
		if !found {
			continue
		}
		parsed = append(parsed, ServiceState{Service: name, Status: status})
	}
	return parsed, nil
}