diff --git a/pkg/services/ngalert/metrics/multi_org_alertmanager.go b/pkg/services/ngalert/metrics/multi_org_alertmanager.go index 056479c27d8..eb8441e45c6 100644 --- a/pkg/services/ngalert/metrics/multi_org_alertmanager.go +++ b/pkg/services/ngalert/metrics/multi_org_alertmanager.go @@ -72,6 +72,22 @@ func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Reg type AlertmanagerAggregatedMetrics struct { registries *metrics.TenantRegistries + // exported metrics, gathered from Alertmanager PipelineBuilder + numNotifications *prometheus.Desc + numFailedNotifications *prometheus.Desc + numNotificationRequestsTotal *prometheus.Desc + numNotificationRequestsFailedTotal *prometheus.Desc + notificationLatencySeconds *prometheus.Desc + + // exported metrics, gathered from Alertmanager nflog + nflogGCDuration *prometheus.Desc + nflogSnapshotDuration *prometheus.Desc + nflogSnapshotSize *prometheus.Desc + nflogQueriesTotal *prometheus.Desc + nflogQueryErrorsTotal *prometheus.Desc + nflogQueryDuration *prometheus.Desc + nflogPropagatedMessagesTotal *prometheus.Desc + // exported metrics, gathered from Alertmanager Silences silencesGCDuration *prometheus.Desc silencesSnapshotDuration *prometheus.Desc @@ -81,12 +97,66 @@ type AlertmanagerAggregatedMetrics struct { silencesQueryDuration *prometheus.Desc silences *prometheus.Desc silencesPropagatedMessagesTotal *prometheus.Desc + + // exported metrics, gathered from Alertmanager Dispatcher + dispatchAggrGroups *prometheus.Desc + dispatchProcessingDuration *prometheus.Desc } func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics { aggregatedMetrics := &AlertmanagerAggregatedMetrics{ registries: registries, + numNotifications: prometheus.NewDesc( + fmt.Sprintf("%s_%s_notifications_total", Namespace, Subsystem), + "The total number of attempted notifications.", + []string{"org", "integration"}, nil), + numFailedNotifications: prometheus.NewDesc( + fmt.Sprintf("%s_%s_notifications_failed_total", Namespace, Subsystem), + "The total number of failed notifications.", + []string{"org", "integration"}, nil), + numNotificationRequestsTotal: prometheus.NewDesc( + fmt.Sprintf("%s_%s_notification_requests_total", Namespace, Subsystem), + "The total number of attempted notification requests.", + []string{"org", "integration"}, nil), + numNotificationRequestsFailedTotal: prometheus.NewDesc( + fmt.Sprintf("%s_%s_notification_requests_failed_total", Namespace, Subsystem), + "The total number of failed notification requests.", + []string{"org", "integration"}, nil), + notificationLatencySeconds: prometheus.NewDesc( + fmt.Sprintf("%s_%s_notification_latency_seconds", Namespace, Subsystem), + "The latency of notifications in seconds.", + nil, nil), + + nflogGCDuration: prometheus.NewDesc( + fmt.Sprintf("%s_%s_nflog_gc_duration_seconds", Namespace, Subsystem), + "Duration of the last notification log garbage collection cycle.", + nil, nil), + nflogSnapshotDuration: prometheus.NewDesc( + fmt.Sprintf("%s_%s_nflog_snapshot_duration_seconds", Namespace, Subsystem), + "Duration of the last notification log snapshot.", + nil, nil), + nflogSnapshotSize: prometheus.NewDesc( + fmt.Sprintf("%s_%s_nflog_snapshot_size_bytes", Namespace, Subsystem), + "Size of the last notification log snapshot in bytes.", + nil, nil), + nflogQueriesTotal: prometheus.NewDesc( + fmt.Sprintf("%s_%s_nflog_queries_total", Namespace, Subsystem), + "Number of notification log queries were received.", + nil, nil), + nflogQueryErrorsTotal: prometheus.NewDesc( + fmt.Sprintf("%s_%s_nflog_query_errors_total", Namespace, Subsystem), + "Number notification log received queries that failed.", + nil, nil), + nflogQueryDuration: prometheus.NewDesc( + fmt.Sprintf("%s_%s_nflog_query_duration_seconds", Namespace, Subsystem), + "Duration of notification log query evaluation.", + nil, nil), + nflogPropagatedMessagesTotal: prometheus.NewDesc( + fmt.Sprintf("%s_%s_nflog_gossip_messages_propagated_total", Namespace, Subsystem), + "Number of received gossip messages that have been further gossiped.", + nil, nil), + silencesGCDuration: prometheus.NewDesc( fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem), "Duration of the last silence garbage collection cycle.", @@ -119,12 +189,35 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *Ale fmt.Sprintf("%s_%s_silences", Namespace, Subsystem), "How many silences by state.", []string{"org", "state"}, nil), + + dispatchAggrGroups: prometheus.NewDesc( + fmt.Sprintf("%s_%s_dispatcher_aggregation_groups", Namespace, Subsystem), + "Number of active aggregation groups", + nil, nil), + dispatchProcessingDuration: prometheus.NewDesc( + fmt.Sprintf("%s_%s_dispatcher_alert_processing_duration_seconds", Namespace, Subsystem), + "Summary of latencies for the processing of alerts.", + nil, nil), } return aggregatedMetrics } func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) { + out <- a.numNotifications + out <- a.numFailedNotifications + out <- a.numNotificationRequestsTotal + out <- a.numNotificationRequestsFailedTotal + out <- a.notificationLatencySeconds + + out <- a.nflogGCDuration + out <- a.nflogSnapshotDuration + out <- a.nflogSnapshotSize + out <- a.nflogQueriesTotal + out <- a.nflogQueryErrorsTotal + out <- a.nflogQueryDuration + out <- a.nflogPropagatedMessagesTotal + out <- a.silencesGCDuration out <- a.silencesSnapshotDuration out <- a.silencesSnapshotSize @@ -133,11 +226,28 @@ func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) { out <- a.silencesQueryDuration out <- a.silencesPropagatedMessagesTotal out <- a.silences + + out <- a.dispatchAggrGroups + out <- a.dispatchProcessingDuration } func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) { data := a.registries.BuildMetricFamiliesPerTenant() + data.SendSumOfCountersPerTenant(out, a.numNotifications, "alertmanager_notifications_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics) + data.SendSumOfCountersPerTenant(out, a.numFailedNotifications, "alertmanager_notifications_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics) + data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsTotal, "alertmanager_notification_requests_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics) + data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics) + data.SendSumOfHistograms(out, a.notificationLatencySeconds, "alertmanager_notification_latency_seconds") + + data.SendSumOfSummaries(out, a.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds") + data.SendSumOfSummaries(out, a.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds") + data.SendSumOfGauges(out, a.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes") + data.SendSumOfCounters(out, a.nflogQueriesTotal, "alertmanager_nflog_queries_total") + data.SendSumOfCounters(out, a.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total") + data.SendSumOfHistograms(out, a.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds") + data.SendSumOfCounters(out, a.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total") + data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds") data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds") data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes") @@ -146,4 +256,7 @@ func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfHistograms(out, a.silencesQueryDuration, "alertmanager_silences_query_duration_seconds") data.SendSumOfCounters(out, a.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total") data.SendSumOfGaugesPerTenantWithLabels(out, a.silences, "alertmanager_silences", "state") + + data.SendSumOfGauges(out, a.dispatchAggrGroups, "alertmanager_dispatcher_aggregation_groups") + data.SendSumOfSummaries(out, a.dispatchProcessingDuration, "alertmanager_dispatcher_alert_processing_duration_seconds") }