@ -72,6 +72,22 @@ func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Reg
type AlertmanagerAggregatedMetrics struct {
// registries supplies the per-tenant metric families that Collect reads
// through BuildMetricFamiliesPerTenant.
registries * metrics . TenantRegistries
// Exported metrics, gathered from the Alertmanager PipelineBuilder.
// Each field below is the descriptor announced by Describe and passed to
// the matching Send* call in Collect.
numNotifications * prometheus . Desc
numFailedNotifications * prometheus . Desc
numNotificationRequestsTotal * prometheus . Desc
numNotificationRequestsFailedTotal * prometheus . Desc
notificationLatencySeconds * prometheus . Desc
// Exported metrics, gathered from the Alertmanager nflog (notification log).
nflogGCDuration * prometheus . Desc
nflogSnapshotDuration * prometheus . Desc
nflogSnapshotSize * prometheus . Desc
nflogQueriesTotal * prometheus . Desc
nflogQueryErrorsTotal * prometheus . Desc
nflogQueryDuration * prometheus . Desc
nflogPropagatedMessagesTotal * prometheus . Desc
// Exported metrics, gathered from Alertmanager Silences.
silencesGCDuration * prometheus . Desc
silencesSnapshotDuration * prometheus . Desc
@ -81,12 +97,66 @@ type AlertmanagerAggregatedMetrics struct {
silencesQueryDuration * prometheus . Desc
// silences is described with the "org" and "state" labels.
silences * prometheus . Desc
silencesPropagatedMessagesTotal * prometheus . Desc
// Exported metrics, gathered from the Alertmanager Dispatcher.
dispatchAggrGroups * prometheus . Desc
dispatchProcessingDuration * prometheus . Desc
}
// NewAlertmanagerAggregatedMetrics returns an AlertmanagerAggregatedMetrics
// that reads from the given tenant registries.
//
// It only builds descriptors; no values are gathered here. Every exported
// metric name has the form "<Namespace>_<Subsystem>_<suffix>", where Namespace
// and Subsystem are package-level identifiers defined outside this function.
func NewAlertmanagerAggregatedMetrics ( registries * metrics . TenantRegistries ) * AlertmanagerAggregatedMetrics {
aggregatedMetrics := & AlertmanagerAggregatedMetrics {
registries : registries ,
// Notification counters: described with the "org" and "integration"
// variable labels.
numNotifications : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_notifications_total" , Namespace , Subsystem ) ,
"The total number of attempted notifications." ,
[ ] string { "org" , "integration" } , nil ) ,
numFailedNotifications : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_notifications_failed_total" , Namespace , Subsystem ) ,
"The total number of failed notifications." ,
[ ] string { "org" , "integration" } , nil ) ,
numNotificationRequestsTotal : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_notification_requests_total" , Namespace , Subsystem ) ,
"The total number of attempted notification requests." ,
[ ] string { "org" , "integration" } , nil ) ,
numNotificationRequestsFailedTotal : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_notification_requests_failed_total" , Namespace , Subsystem ) ,
"The total number of failed notification requests." ,
[ ] string { "org" , "integration" } , nil ) ,
// The latency descriptor has no variable labels, unlike the notification
// counters above.
notificationLatencySeconds : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_notification_latency_seconds" , Namespace , Subsystem ) ,
"The latency of notifications in seconds." ,
nil , nil ) ,
// Notification log (nflog) descriptors: none carry variable labels.
nflogGCDuration : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_nflog_gc_duration_seconds" , Namespace , Subsystem ) ,
"Duration of the last notification log garbage collection cycle." ,
nil , nil ) ,
nflogSnapshotDuration : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_nflog_snapshot_duration_seconds" , Namespace , Subsystem ) ,
"Duration of the last notification log snapshot." ,
nil , nil ) ,
nflogSnapshotSize : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_nflog_snapshot_size_bytes" , Namespace , Subsystem ) ,
"Size of the last notification log snapshot in bytes." ,
nil , nil ) ,
nflogQueriesTotal : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_nflog_queries_total" , Namespace , Subsystem ) ,
"Number of notification log queries were received." ,
nil , nil ) ,
nflogQueryErrorsTotal : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_nflog_query_errors_total" , Namespace , Subsystem ) ,
"Number notification log received queries that failed." ,
nil , nil ) ,
nflogQueryDuration : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_nflog_query_duration_seconds" , Namespace , Subsystem ) ,
"Duration of notification log query evaluation." ,
nil , nil ) ,
nflogPropagatedMessagesTotal : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_nflog_gossip_messages_propagated_total" , Namespace , Subsystem ) ,
"Number of received gossip messages that have been further gossiped." ,
nil , nil ) ,
// Silences descriptors.
silencesGCDuration : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_silences_gc_duration_seconds" , Namespace , Subsystem ) ,
"Duration of the last silence garbage collection cycle." ,
@ -119,12 +189,35 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *Ale
fmt . Sprintf ( "%s_%s_silences" , Namespace , Subsystem ) ,
"How many silences by state." ,
[ ] string { "org" , "state" } , nil ) ,
// Dispatcher descriptors: no variable labels.
dispatchAggrGroups : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_dispatcher_aggregation_groups" , Namespace , Subsystem ) ,
"Number of active aggregation groups" ,
nil , nil ) ,
dispatchProcessingDuration : prometheus . NewDesc (
fmt . Sprintf ( "%s_%s_dispatcher_alert_processing_duration_seconds" , Namespace , Subsystem ) ,
"Summary of latencies for the processing of alerts." ,
nil , nil ) ,
}
return aggregatedMetrics
}
// Describe sends the metric descriptors held by a to out, one send per
// descriptor field, grouped in the same order as the struct: notifications,
// nflog, silences, dispatcher.
func ( a * AlertmanagerAggregatedMetrics ) Describe ( out chan <- * prometheus . Desc ) {
// Notification pipeline descriptors.
out <- a . numNotifications
out <- a . numFailedNotifications
out <- a . numNotificationRequestsTotal
out <- a . numNotificationRequestsFailedTotal
out <- a . notificationLatencySeconds
// Notification log (nflog) descriptors.
out <- a . nflogGCDuration
out <- a . nflogSnapshotDuration
out <- a . nflogSnapshotSize
out <- a . nflogQueriesTotal
out <- a . nflogQueryErrorsTotal
out <- a . nflogQueryDuration
out <- a . nflogPropagatedMessagesTotal
// Silences descriptors.
out <- a . silencesGCDuration
out <- a . silencesSnapshotDuration
out <- a . silencesSnapshotSize
@ -133,11 +226,28 @@ func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
out <- a . silencesQueryDuration
out <- a . silencesPropagatedMessagesTotal
out <- a . silences
// Dispatcher descriptors.
out <- a . dispatchAggrGroups
out <- a . dispatchProcessingDuration
}
// Collect builds the per-tenant metric families from a.registries once, then
// emits one aggregated metric per descriptor on out using the Send* helper
// that matches the metric's type (counter, gauge, histogram or summary).
//
// NOTE(review): the string passed to each Send* call is presumably the name of
// the source metric as registered by Alertmanager in the tenant registries,
// while the descriptor carries the exported name — confirm against the
// metrics package helpers.
func ( a * AlertmanagerAggregatedMetrics ) Collect ( out chan <- prometheus . Metric ) {
data := a . registries . BuildMetricFamiliesPerTenant ( )
// Notification counters are sent with SendSumOfCountersPerTenant, passing
// the "integration" label and the WithSkipZeroValueMetrics option.
data . SendSumOfCountersPerTenant ( out , a . numNotifications , "alertmanager_notifications_total" , metrics . WithLabels ( "integration" ) , metrics . WithSkipZeroValueMetrics )
data . SendSumOfCountersPerTenant ( out , a . numFailedNotifications , "alertmanager_notifications_failed_total" , metrics . WithLabels ( "integration" ) , metrics . WithSkipZeroValueMetrics )
data . SendSumOfCountersPerTenant ( out , a . numNotificationRequestsTotal , "alertmanager_notification_requests_total" , metrics . WithLabels ( "integration" ) , metrics . WithSkipZeroValueMetrics )
data . SendSumOfCountersPerTenant ( out , a . numNotificationRequestsFailedTotal , "alertmanager_notification_requests_failed_total" , metrics . WithLabels ( "integration" ) , metrics . WithSkipZeroValueMetrics )
// Notification latency is sent as a single summed histogram with no labels.
data . SendSumOfHistograms ( out , a . notificationLatencySeconds , "alertmanager_notification_latency_seconds" )
// Notification log (nflog) metrics, sent without per-tenant labels.
data . SendSumOfSummaries ( out , a . nflogGCDuration , "alertmanager_nflog_gc_duration_seconds" )
data . SendSumOfSummaries ( out , a . nflogSnapshotDuration , "alertmanager_nflog_snapshot_duration_seconds" )
data . SendSumOfGauges ( out , a . nflogSnapshotSize , "alertmanager_nflog_snapshot_size_bytes" )
data . SendSumOfCounters ( out , a . nflogQueriesTotal , "alertmanager_nflog_queries_total" )
data . SendSumOfCounters ( out , a . nflogQueryErrorsTotal , "alertmanager_nflog_query_errors_total" )
data . SendSumOfHistograms ( out , a . nflogQueryDuration , "alertmanager_nflog_query_duration_seconds" )
data . SendSumOfCounters ( out , a . nflogPropagatedMessagesTotal , "alertmanager_nflog_gossip_messages_propagated_total" )
// Silences metrics.
data . SendSumOfSummaries ( out , a . silencesGCDuration , "alertmanager_silences_gc_duration_seconds" )
data . SendSumOfSummaries ( out , a . silencesSnapshotDuration , "alertmanager_silences_snapshot_duration_seconds" )
data . SendSumOfGauges ( out , a . silencesSnapshotSize , "alertmanager_silences_snapshot_size_bytes" )
@ -146,4 +256,7 @@ func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
data . SendSumOfHistograms ( out , a . silencesQueryDuration , "alertmanager_silences_query_duration_seconds" )
data . SendSumOfCounters ( out , a . silencesPropagatedMessagesTotal , "alertmanager_silences_gossip_messages_propagated_total" )
// The silences gauge is sent per tenant with the extra "state" label,
// matching the "org" and "state" labels on its descriptor.
data . SendSumOfGaugesPerTenantWithLabels ( out , a . silences , "alertmanager_silences" , "state" )
// Dispatcher metrics: a gauge for aggregation groups and a summary for
// alert processing duration.
data . SendSumOfGauges ( out , a . dispatchAggrGroups , "alertmanager_dispatcher_aggregation_groups" )
data . SendSumOfSummaries ( out , a . dispatchProcessingDuration , "alertmanager_dispatcher_alert_processing_duration_seconds" )
}