mirror of https://github.com/grafana/grafana
Backend Plugins: Collect and expose metrics and plugin process health check (#21481)
Adds support for collecting metrics from backend plugins and exposing them through Grafana's Prometheus metrics endpoint. Enables checking the health of a backend plugin via the route `/api/plugins/<plugin id>/health`. Uses SDK v0.6.0. Closes #20984
pull/21507/head
parent
f56f54b1a3
commit
5c711bfb79
@ -0,0 +1,89 @@ |
||||
package collector |
||||
|
||||
import ( |
||||
"context" |
||||
"sync" |
||||
"time" |
||||
|
||||
"github.com/grafana/grafana/pkg/infra/log" |
||||
"github.com/prometheus/client_golang/prometheus" |
||||
) |
||||
|
||||
// Namespace is the Prometheus metric namespace that this package passes to
// prometheus.BuildFQName when naming its scrape metrics.
|
||||
const Namespace = "grafana_plugin" |
||||
|
||||
var ( |
||||
// scrapeDurationDesc describes the per-plugin metric that reports how long
// a plugin's collector took; it carries a single plugin_id label.
scrapeDurationDesc = prometheus.NewDesc( |
||||
prometheus.BuildFQName(Namespace, "scrape", "duration_seconds"), |
||||
"grafana_plugin: Duration of a plugin collector scrape.", |
||||
[]string{"plugin_id"}, |
||||
nil, |
||||
) |
||||
// scrapeSuccessDesc describes the per-plugin metric that reports whether a
// plugin's collector succeeded; it carries a single plugin_id label.
scrapeSuccessDesc = prometheus.NewDesc( |
||||
prometheus.BuildFQName(Namespace, "scrape", "success"), |
||||
"grafana_plugin: Whether a plugin collector succeeded.", |
||||
[]string{"plugin_id"}, |
||||
nil, |
||||
) |
||||
) |
||||
|
||||
// Collector is the interface a plugin collector has to implement.
|
||||
type Collector interface { |
||||
// CollectMetrics is expected to gather the plugin's current metrics and
// send them on ch. A non-nil error marks the scrape as failed.
|
||||
CollectMetrics(ctx context.Context, ch chan<- prometheus.Metric) error |
||||
} |
||||
|
||||
// PluginCollector implements the prometheus.Collector interface.
|
||||
type PluginCollector struct { |
||||
collectors map[string]Collector |
||||
logger log.Logger |
||||
} |
||||
|
||||
// NewPluginCollector creates a new PluginCollector..
|
||||
func NewPluginCollector() PluginCollector { |
||||
return PluginCollector{ |
||||
collectors: make(map[string]Collector), |
||||
logger: log.New("plugins.backend.collector"), |
||||
} |
||||
} |
||||
|
||||
func (pc PluginCollector) Register(pluginID string, c Collector) { |
||||
pc.collectors[pluginID] = c |
||||
} |
||||
|
||||
// Describe implements the prometheus.Collector interface.
|
||||
func (pc PluginCollector) Describe(ch chan<- *prometheus.Desc) { |
||||
ch <- scrapeDurationDesc |
||||
ch <- scrapeSuccessDesc |
||||
} |
||||
|
||||
// Collect implements the prometheus.Collector interface.
|
||||
func (pc PluginCollector) Collect(ch chan<- prometheus.Metric) { |
||||
ctx := context.Background() |
||||
wg := sync.WaitGroup{} |
||||
wg.Add(len(pc.collectors)) |
||||
for name, c := range pc.collectors { |
||||
go func(name string, c Collector) { |
||||
execute(ctx, name, c, ch, pc.logger) |
||||
wg.Done() |
||||
}(name, c) |
||||
} |
||||
wg.Wait() |
||||
} |
||||
|
||||
func execute(ctx context.Context, pluginID string, c Collector, ch chan<- prometheus.Metric, logger log.Logger) { |
||||
begin := time.Now() |
||||
err := c.CollectMetrics(ctx, ch) |
||||
duration := time.Since(begin) |
||||
var success float64 |
||||
|
||||
if err != nil { |
||||
logger.Error("collector failed", "pluginId", pluginID, "took", duration, "error", err) |
||||
success = 0 |
||||
} else { |
||||
logger.Debug("collector succeeded", "pluginId", pluginID, "took", duration) |
||||
success = 1 |
||||
} |
||||
ch <- prometheus.MustNewConstMetric(scrapeDurationDesc, prometheus.GaugeValue, duration.Seconds(), pluginID) |
||||
ch <- prometheus.MustNewConstMetric(scrapeSuccessDesc, prometheus.GaugeValue, success, pluginID) |
||||
} |
||||
@ -1,61 +0,0 @@ |
||||
package backend |
||||
|
||||
import ( |
||||
"bytes" |
||||
"context" |
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/genproto/pluginv2" |
||||
"github.com/prometheus/client_golang/prometheus" |
||||
"github.com/prometheus/common/expfmt" |
||||
) |
||||
|
||||
// sdkAdapter adapter between protobuf and SDK interfaces.
|
||||
type sdkAdapter struct { |
||||
handlers PluginHandlers |
||||
} |
||||
|
||||
func (a *sdkAdapter) CollectMetrics(ctx context.Context, protoReq *pluginv2.CollectMetrics_Request) (*pluginv2.CollectMetrics_Response, error) { |
||||
metrics, err := prometheus.DefaultGatherer.Gather() |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
var buf bytes.Buffer |
||||
for _, m := range metrics { |
||||
_, err := expfmt.MetricFamilyToText(&buf, m) |
||||
if err != nil { |
||||
continue |
||||
} |
||||
} |
||||
|
||||
resp := &pluginv2.CollectMetrics_Response{ |
||||
Metrics: &pluginv2.CollectMetrics_Payload{ |
||||
Prometheus: buf.Bytes(), |
||||
}, |
||||
} |
||||
|
||||
return resp, nil |
||||
} |
||||
|
||||
func (a *sdkAdapter) CheckHealth(ctx context.Context, protoReq *pluginv2.CheckHealth_Request) (*pluginv2.CheckHealth_Response, error) { |
||||
return &pluginv2.CheckHealth_Response{ |
||||
Status: pluginv2.CheckHealth_Response_OK, |
||||
}, nil |
||||
} |
||||
|
||||
func (a *sdkAdapter) DataQuery(ctx context.Context, req *pluginv2.DataQueryRequest) (*pluginv2.DataQueryResponse, error) { |
||||
resp, err := a.handlers.DataQuery(ctx, dataQueryRequestFromProto(req)) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
return resp.toProtobuf() |
||||
} |
||||
|
||||
func (a *sdkAdapter) Resource(ctx context.Context, req *pluginv2.ResourceRequest) (*pluginv2.ResourceResponse, error) { |
||||
res, err := a.handlers.Resource(ctx, resourceRequestFromProtobuf(req)) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
return res.toProtobuf(), nil |
||||
} |
||||
@ -0,0 +1,53 @@ |
||||
package backend |
||||
|
||||
import ( |
||||
"context" |
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/genproto/pluginv2" |
||||
) |
||||
|
||||
// DiagnosticsPlugin is the Grafana diagnostics plugin interface. Its two
// methods take and return the protobuf (pluginv2) request and response types.
|
||||
type DiagnosticsPlugin interface { |
||||
CollectMetrics(ctx context.Context, req *pluginv2.CollectMetrics_Request) (*pluginv2.CollectMetrics_Response, error) |
||||
CheckHealth(ctx context.Context, req *pluginv2.CheckHealth_Request) (*pluginv2.CheckHealth_Response, error) |
||||
} |
||||
|
||||
// CheckHealthHandler is the SDK-level health check hook. It reports the
// plugin's health as a CheckHealthResult rather than a protobuf response.
type CheckHealthHandler interface { |
||||
CheckHealth(ctx context.Context) (*CheckHealthResult, error) |
||||
} |
||||
|
||||
// HealthStatus is the status of the plugin.
|
||||
type HealthStatus int |
||||
|
||||
const ( |
||||
// HealthStatusUnknown means the status of the plugin is unknown.
|
||||
HealthStatusUnknown HealthStatus = iota |
||||
// HealthStatusOk means the status of the plugin is good.
|
||||
HealthStatusOk |
||||
// HealthStatusError means the plugin is in an error state.
|
||||
HealthStatusError |
||||
) |
||||
|
||||
func (ps HealthStatus) toProtobuf() pluginv2.CheckHealth_Response_HealthStatus { |
||||
switch ps { |
||||
case HealthStatusUnknown: |
||||
return pluginv2.CheckHealth_Response_UNKNOWN |
||||
case HealthStatusOk: |
||||
return pluginv2.CheckHealth_Response_OK |
||||
case HealthStatusError: |
||||
return pluginv2.CheckHealth_Response_ERROR |
||||
} |
||||
panic("unsupported protobuf health status type in sdk") |
||||
} |
||||
|
||||
type CheckHealthResult struct { |
||||
Status HealthStatus |
||||
Info string |
||||
} |
||||
|
||||
func (res *CheckHealthResult) toProtobuf() *pluginv2.CheckHealth_Response { |
||||
return &pluginv2.CheckHealth_Response{ |
||||
Status: res.Status.toProtobuf(), |
||||
Info: res.Info, |
||||
} |
||||
} |
||||
@ -0,0 +1,51 @@ |
||||
package backend |
||||
|
||||
import ( |
||||
"context" |
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/genproto/pluginv2" |
||||
plugin "github.com/hashicorp/go-plugin" |
||||
"google.golang.org/grpc" |
||||
) |
||||
|
||||
// DiagnosticsGRPCPlugin implements the GRPCPlugin interface from github.com/hashicorp/go-plugin.
|
||||
type DiagnosticsGRPCPlugin struct { |
||||
plugin.NetRPCUnsupportedPlugin |
||||
plugin.GRPCPlugin |
||||
server pluginv2.DiagnosticsServer |
||||
} |
||||
|
||||
func (p *DiagnosticsGRPCPlugin) GRPCServer(broker *plugin.GRPCBroker, s *grpc.Server) error { |
||||
pluginv2.RegisterDiagnosticsServer(s, &diagnosticsGRPCServer{ |
||||
server: p.server, |
||||
}) |
||||
return nil |
||||
} |
||||
|
||||
func (p *DiagnosticsGRPCPlugin) GRPCClient(ctx context.Context, broker *plugin.GRPCBroker, c *grpc.ClientConn) (interface{}, error) { |
||||
return &diagnosticsGRPCClient{client: pluginv2.NewDiagnosticsClient(c)}, nil |
||||
} |
||||
|
||||
type diagnosticsGRPCServer struct { |
||||
server pluginv2.DiagnosticsServer |
||||
} |
||||
|
||||
func (s *diagnosticsGRPCServer) CollectMetrics(ctx context.Context, req *pluginv2.CollectMetrics_Request) (*pluginv2.CollectMetrics_Response, error) { |
||||
return s.server.CollectMetrics(ctx, req) |
||||
} |
||||
|
||||
func (s *diagnosticsGRPCServer) CheckHealth(ctx context.Context, req *pluginv2.CheckHealth_Request) (*pluginv2.CheckHealth_Response, error) { |
||||
return s.server.CheckHealth(ctx, req) |
||||
} |
||||
|
||||
type diagnosticsGRPCClient struct { |
||||
client pluginv2.DiagnosticsClient |
||||
} |
||||
|
||||
func (s *diagnosticsGRPCClient) CollectMetrics(ctx context.Context, req *pluginv2.CollectMetrics_Request) (*pluginv2.CollectMetrics_Response, error) { |
||||
return s.client.CollectMetrics(ctx, req) |
||||
} |
||||
|
||||
func (s *diagnosticsGRPCClient) CheckHealth(ctx context.Context, req *pluginv2.CheckHealth_Request) (*pluginv2.CheckHealth_Response, error) { |
||||
return s.client.CheckHealth(ctx, req) |
||||
} |
||||
@ -1,4 +1,4 @@ |
||||
package common |
||||
package backend |
||||
|
||||
import plugin "github.com/hashicorp/go-plugin" |
||||
|
||||
@ -0,0 +1,91 @@ |
||||
package backend |
||||
|
||||
import ( |
||||
"bytes" |
||||
"context" |
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/dataframe" |
||||
"github.com/grafana/grafana-plugin-sdk-go/genproto/pluginv2" |
||||
"github.com/prometheus/client_golang/prometheus" |
||||
"github.com/prometheus/common/expfmt" |
||||
) |
||||
|
||||
// sdkAdapter adapter between protobuf and SDK interfaces.
|
||||
type sdkAdapter struct { |
||||
checkHealthHandler CheckHealthHandler |
||||
dataQueryHandler DataQueryHandler |
||||
resourceHandler ResourceHandler |
||||
transformDataHandler TransformDataHandler |
||||
} |
||||
|
||||
func (a *sdkAdapter) CollectMetrics(ctx context.Context, protoReq *pluginv2.CollectMetrics_Request) (*pluginv2.CollectMetrics_Response, error) { |
||||
mfs, err := prometheus.DefaultGatherer.Gather() |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
var buf bytes.Buffer |
||||
for _, mf := range mfs { |
||||
_, err := expfmt.MetricFamilyToText(&buf, mf) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
|
||||
return &pluginv2.CollectMetrics_Response{ |
||||
Metrics: &pluginv2.CollectMetrics_Payload{ |
||||
Prometheus: buf.Bytes(), |
||||
}, |
||||
}, nil |
||||
} |
||||
|
||||
func (a *sdkAdapter) CheckHealth(ctx context.Context, protoReq *pluginv2.CheckHealth_Request) (*pluginv2.CheckHealth_Response, error) { |
||||
if a.checkHealthHandler != nil { |
||||
res, err := a.checkHealthHandler.CheckHealth(ctx) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
return res.toProtobuf(), nil |
||||
} |
||||
|
||||
return &pluginv2.CheckHealth_Response{ |
||||
Status: pluginv2.CheckHealth_Response_OK, |
||||
}, nil |
||||
} |
||||
|
||||
func (a *sdkAdapter) DataQuery(ctx context.Context, req *pluginv2.DataQueryRequest) (*pluginv2.DataQueryResponse, error) { |
||||
resp, err := a.dataQueryHandler.DataQuery(ctx, dataQueryRequestFromProto(req)) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
return resp.toProtobuf() |
||||
} |
||||
|
||||
func (a *sdkAdapter) Resource(ctx context.Context, req *pluginv2.ResourceRequest) (*pluginv2.ResourceResponse, error) { |
||||
res, err := a.resourceHandler.Resource(ctx, resourceRequestFromProtobuf(req)) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
return res.toProtobuf(), nil |
||||
} |
||||
|
||||
func (a *sdkAdapter) TransformData(ctx context.Context, req *pluginv2.DataQueryRequest, callBack TransformCallBack) (*pluginv2.DataQueryResponse, error) { |
||||
resp, err := a.transformDataHandler.TransformData(ctx, dataQueryRequestFromProto(req), &transformCallBackWrapper{callBack}) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
encodedFrames := make([][]byte, len(resp.Frames)) |
||||
for i, frame := range resp.Frames { |
||||
encodedFrames[i], err = dataframe.MarshalArrow(frame) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
|
||||
return &pluginv2.DataQueryResponse{ |
||||
Frames: encodedFrames, |
||||
Metadata: resp.Metadata, |
||||
}, nil |
||||
} |
||||
33
vendor/github.com/grafana/grafana-plugin-sdk-go/backend/transform_sdk_adapter.go
generated
vendored
33
vendor/github.com/grafana/grafana-plugin-sdk-go/backend/transform_sdk_adapter.go
generated
vendored
@ -1,33 +0,0 @@ |
||||
package backend |
||||
|
||||
import ( |
||||
"context" |
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/dataframe" |
||||
"github.com/grafana/grafana-plugin-sdk-go/genproto/pluginv2" |
||||
) |
||||
|
||||
// transformSDKAdapter adapter between protobuf and SDK interfaces.
|
||||
type transformSDKAdapter struct { |
||||
handlers TransformHandlers |
||||
} |
||||
|
||||
func (a *transformSDKAdapter) TransformData(ctx context.Context, req *pluginv2.DataQueryRequest, callBack TransformCallBack) (*pluginv2.DataQueryResponse, error) { |
||||
resp, err := a.handlers.DataQuery(ctx, dataQueryRequestFromProto(req), &transformCallBackWrapper{callBack}) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
encodedFrames := make([][]byte, len(resp.Frames)) |
||||
for i, frame := range resp.Frames { |
||||
encodedFrames[i], err = dataframe.MarshalArrow(frame) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
|
||||
return &pluginv2.DataQueryResponse{ |
||||
Frames: encodedFrames, |
||||
Metadata: resp.Metadata, |
||||
}, nil |
||||
} |
||||
980
vendor/github.com/grafana/grafana-plugin-sdk-go/genproto/pluginv2/backend.pb.go
generated
vendored
980
vendor/github.com/grafana/grafana-plugin-sdk-go/genproto/pluginv2/backend.pb.go
generated
vendored
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,201 @@ |
||||
#vendor |
||||
vendor/ |
||||
|
||||
# Created by .ignore support plugin (hsz.mobi) |
||||
coverage.txt |
||||
### Go template |
||||
# Compiled Object files, Static and Dynamic libs (Shared Objects) |
||||
*.o |
||||
*.a |
||||
*.so |
||||
|
||||
# Folders |
||||
_obj |
||||
_test |
||||
|
||||
# Architecture specific extensions/prefixes |
||||
*.[568vq] |
||||
[568vq].out |
||||
|
||||
*.cgo1.go |
||||
*.cgo2.c |
||||
_cgo_defun.c |
||||
_cgo_gotypes.go |
||||
_cgo_export.* |
||||
|
||||
_testmain.go |
||||
|
||||
*.exe |
||||
*.test |
||||
*.prof |
||||
### Windows template |
||||
# Windows image file caches |
||||
Thumbs.db |
||||
ehthumbs.db |
||||
|
||||
# Folder config file |
||||
Desktop.ini |
||||
|
||||
# Recycle Bin used on file shares |
||||
$RECYCLE.BIN/ |
||||
|
||||
# Windows Installer files |
||||
*.cab |
||||
*.msi |
||||
*.msm |
||||
*.msp |
||||
|
||||
# Windows shortcuts |
||||
*.lnk |
||||
### Kate template |
||||
# Swap Files # |
||||
.*.kate-swp |
||||
.swp.* |
||||
### SublimeText template |
||||
# cache files for sublime text |
||||
*.tmlanguage.cache |
||||
*.tmPreferences.cache |
||||
*.stTheme.cache |
||||
|
||||
# workspace files are user-specific |
||||
*.sublime-workspace |
||||
|
||||
# project files should be checked into the repository, unless a significant |
||||
# proportion of contributors will probably not be using SublimeText |
||||
# *.sublime-project |
||||
|
||||
# sftp configuration file |
||||
sftp-config.json |
||||
### Linux template |
||||
*~ |
||||
|
||||
# temporary files which can be created if a process still has a handle open of a deleted file |
||||
.fuse_hidden* |
||||
|
||||
# KDE directory preferences |
||||
.directory |
||||
|
||||
# Linux trash folder which might appear on any partition or disk |
||||
.Trash-* |
||||
### JetBrains template |
||||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm |
||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 |
||||
|
||||
# User-specific stuff: |
||||
.idea |
||||
.idea/tasks.xml |
||||
.idea/dictionaries |
||||
.idea/vcs.xml |
||||
.idea/jsLibraryMappings.xml |
||||
|
||||
# Sensitive or high-churn files: |
||||
.idea/dataSources.ids |
||||
.idea/dataSources.xml |
||||
.idea/dataSources.local.xml |
||||
.idea/sqlDataSources.xml |
||||
.idea/dynamic.xml |
||||
.idea/uiDesigner.xml |
||||
|
||||
# Gradle: |
||||
.idea/gradle.xml |
||||
.idea/libraries |
||||
|
||||
# Mongo Explorer plugin: |
||||
.idea/mongoSettings.xml |
||||
|
||||
## File-based project format: |
||||
*.iws |
||||
|
||||
## Plugin-specific files: |
||||
|
||||
# IntelliJ |
||||
/out/ |
||||
|
||||
# mpeltonen/sbt-idea plugin |
||||
.idea_modules/ |
||||
|
||||
# JIRA plugin |
||||
atlassian-ide-plugin.xml |
||||
|
||||
# Crashlytics plugin (for Android Studio and IntelliJ) |
||||
com_crashlytics_export_strings.xml |
||||
crashlytics.properties |
||||
crashlytics-build.properties |
||||
fabric.properties |
||||
### Xcode template |
||||
# Xcode |
||||
# |
||||
# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore |
||||
|
||||
## Build generated |
||||
build/ |
||||
DerivedData/ |
||||
|
||||
## Various settings |
||||
*.pbxuser |
||||
!default.pbxuser |
||||
*.mode1v3 |
||||
!default.mode1v3 |
||||
*.mode2v3 |
||||
!default.mode2v3 |
||||
*.perspectivev3 |
||||
!default.perspectivev3 |
||||
xcuserdata/ |
||||
|
||||
## Other |
||||
*.moved-aside |
||||
*.xccheckout |
||||
*.xcscmblueprint |
||||
### Eclipse template |
||||
|
||||
.metadata |
||||
bin/ |
||||
tmp/ |
||||
*.tmp |
||||
*.bak |
||||
*.swp |
||||
*~.nib |
||||
local.properties |
||||
.settings/ |
||||
.loadpath |
||||
.recommenders |
||||
|
||||
# Eclipse Core |
||||
.project |
||||
|
||||
# External tool builders |
||||
.externalToolBuilders/ |
||||
|
||||
# Locally stored "Eclipse launch configurations" |
||||
*.launch |
||||
|
||||
# PyDev specific (Python IDE for Eclipse) |
||||
*.pydevproject |
||||
|
||||
# CDT-specific (C/C++ Development Tooling) |
||||
.cproject |
||||
|
||||
# JDT-specific (Eclipse Java Development Tools) |
||||
.classpath |
||||
|
||||
# Java annotation processor (APT) |
||||
.factorypath |
||||
|
||||
# PDT-specific (PHP Development Tools) |
||||
.buildpath |
||||
|
||||
# sbteclipse plugin |
||||
.target |
||||
|
||||
# Tern plugin |
||||
.tern-project |
||||
|
||||
# TeXlipse plugin |
||||
.texlipse |
||||
|
||||
# STS (Spring Tool Suite) |
||||
.springBeans |
||||
|
||||
# Code Recommenders |
||||
.recommenders/ |
||||
|
||||
@ -0,0 +1,24 @@ |
||||
# Changelog |
||||
All notable changes to this project will be documented in this file. |
||||
|
||||
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) |
||||
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). |
||||
|
||||
## [Unreleased] |
||||
|
||||
## [1.2.0](https://github.com/grpc-ecosystem/go-grpc-prometheus/releases/tag/v1.2.0) - 2018-06-04 |
||||
|
||||
### Added |
||||
|
||||
* Provide metrics object as `prometheus.Collector`, for conventional metric registration. |
||||
* Support non-default/global Prometheus registry. |
||||
* Allow configuring counters with `prometheus.CounterOpts`. |
||||
|
||||
### Changed |
||||
|
||||
* Remove usage of deprecated `grpc.Code()`. |
||||
* Remove usage of deprecated `grpc.Errorf` and replace with `status.Errorf`. |
||||
|
||||
--- |
||||
|
||||
This changelog was started with version `v1.2.0`, for earlier versions refer to the respective [GitHub releases](https://github.com/grpc-ecosystem/go-grpc-prometheus/releases). |
||||
@ -0,0 +1,201 @@ |
||||
Apache License |
||||
Version 2.0, January 2004 |
||||
http://www.apache.org/licenses/ |
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION |
||||
|
||||
1. Definitions. |
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction, |
||||
and distribution as defined by Sections 1 through 9 of this document. |
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by |
||||
the copyright owner that is granting the License. |
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all |
||||
other entities that control, are controlled by, or are under common |
||||
control with that entity. For the purposes of this definition, |
||||
"control" means (i) the power, direct or indirect, to cause the |
||||
direction or management of such entity, whether by contract or |
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the |
||||
outstanding shares, or (iii) beneficial ownership of such entity. |
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity |
||||
exercising permissions granted by this License. |
||||
|
||||
"Source" form shall mean the preferred form for making modifications, |
||||
including but not limited to software source code, documentation |
||||
source, and configuration files. |
||||
|
||||
"Object" form shall mean any form resulting from mechanical |
||||
transformation or translation of a Source form, including but |
||||
not limited to compiled object code, generated documentation, |
||||
and conversions to other media types. |
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or |
||||
Object form, made available under the License, as indicated by a |
||||
copyright notice that is included in or attached to the work |
||||
(an example is provided in the Appendix below). |
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object |
||||
form, that is based on (or derived from) the Work and for which the |
||||
editorial revisions, annotations, elaborations, or other modifications |
||||
represent, as a whole, an original work of authorship. For the purposes |
||||
of this License, Derivative Works shall not include works that remain |
||||
separable from, or merely link (or bind by name) to the interfaces of, |
||||
the Work and Derivative Works thereof. |
||||
|
||||
"Contribution" shall mean any work of authorship, including |
||||
the original version of the Work and any modifications or additions |
||||
to that Work or Derivative Works thereof, that is intentionally |
||||
submitted to Licensor for inclusion in the Work by the copyright owner |
||||
or by an individual or Legal Entity authorized to submit on behalf of |
||||
the copyright owner. For the purposes of this definition, "submitted" |
||||
means any form of electronic, verbal, or written communication sent |
||||
to the Licensor or its representatives, including but not limited to |
||||
communication on electronic mailing lists, source code control systems, |
||||
and issue tracking systems that are managed by, or on behalf of, the |
||||
Licensor for the purpose of discussing and improving the Work, but |
||||
excluding communication that is conspicuously marked or otherwise |
||||
designated in writing by the copyright owner as "Not a Contribution." |
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity |
||||
on behalf of whom a Contribution has been received by Licensor and |
||||
subsequently incorporated within the Work. |
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of |
||||
this License, each Contributor hereby grants to You a perpetual, |
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
copyright license to reproduce, prepare Derivative Works of, |
||||
publicly display, publicly perform, sublicense, and distribute the |
||||
Work and such Derivative Works in Source or Object form. |
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of |
||||
this License, each Contributor hereby grants to You a perpetual, |
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
(except as stated in this section) patent license to make, have made, |
||||
use, offer to sell, sell, import, and otherwise transfer the Work, |
||||
where such license applies only to those patent claims licensable |
||||
by such Contributor that are necessarily infringed by their |
||||
Contribution(s) alone or by combination of their Contribution(s) |
||||
with the Work to which such Contribution(s) was submitted. If You |
||||
institute patent litigation against any entity (including a |
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work |
||||
or a Contribution incorporated within the Work constitutes direct |
||||
or contributory patent infringement, then any patent licenses |
||||
granted to You under this License for that Work shall terminate |
||||
as of the date such litigation is filed. |
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the |
||||
Work or Derivative Works thereof in any medium, with or without |
||||
modifications, and in Source or Object form, provided that You |
||||
meet the following conditions: |
||||
|
||||
(a) You must give any other recipients of the Work or |
||||
Derivative Works a copy of this License; and |
||||
|
||||
(b) You must cause any modified files to carry prominent notices |
||||
stating that You changed the files; and |
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works |
||||
that You distribute, all copyright, patent, trademark, and |
||||
attribution notices from the Source form of the Work, |
||||
excluding those notices that do not pertain to any part of |
||||
the Derivative Works; and |
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its |
||||
distribution, then any Derivative Works that You distribute must |
||||
include a readable copy of the attribution notices contained |
||||
within such NOTICE file, excluding those notices that do not |
||||
pertain to any part of the Derivative Works, in at least one |
||||
of the following places: within a NOTICE text file distributed |
||||
as part of the Derivative Works; within the Source form or |
||||
documentation, if provided along with the Derivative Works; or, |
||||
within a display generated by the Derivative Works, if and |
||||
wherever such third-party notices normally appear. The contents |
||||
of the NOTICE file are for informational purposes only and |
||||
do not modify the License. You may add Your own attribution |
||||
notices within Derivative Works that You distribute, alongside |
||||
or as an addendum to the NOTICE text from the Work, provided |
||||
that such additional attribution notices cannot be construed |
||||
as modifying the License. |
||||
|
||||
You may add Your own copyright statement to Your modifications and |
||||
may provide additional or different license terms and conditions |
||||
for use, reproduction, or distribution of Your modifications, or |
||||
for any such Derivative Works as a whole, provided Your use, |
||||
reproduction, and distribution of the Work otherwise complies with |
||||
the conditions stated in this License. |
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise, |
||||
any Contribution intentionally submitted for inclusion in the Work |
||||
by You to the Licensor shall be under the terms and conditions of |
||||
this License, without any additional terms or conditions. |
||||
Notwithstanding the above, nothing herein shall supersede or modify |
||||
the terms of any separate license agreement you may have executed |
||||
with Licensor regarding such Contributions. |
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade |
||||
names, trademarks, service marks, or product names of the Licensor, |
||||
except as required for reasonable and customary use in describing the |
||||
origin of the Work and reproducing the content of the NOTICE file. |
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or |
||||
agreed to in writing, Licensor provides the Work (and each |
||||
Contributor provides its Contributions) on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
||||
implied, including, without limitation, any warranties or conditions |
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A |
||||
PARTICULAR PURPOSE. You are solely responsible for determining the |
||||
appropriateness of using or redistributing the Work and assume any |
||||
risks associated with Your exercise of permissions under this License. |
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory, |
||||
whether in tort (including negligence), contract, or otherwise, |
||||
unless required by applicable law (such as deliberate and grossly |
||||
negligent acts) or agreed to in writing, shall any Contributor be |
||||
liable to You for damages, including any direct, indirect, special, |
||||
incidental, or consequential damages of any character arising as a |
||||
result of this License or out of the use or inability to use the |
||||
Work (including but not limited to damages for loss of goodwill, |
||||
work stoppage, computer failure or malfunction, or any and all |
||||
other commercial damages or losses), even if such Contributor |
||||
has been advised of the possibility of such damages. |
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing |
||||
the Work or Derivative Works thereof, You may choose to offer, |
||||
and charge a fee for, acceptance of support, warranty, indemnity, |
||||
or other liability obligations and/or rights consistent with this |
||||
License. However, in accepting such obligations, You may act only |
||||
on Your own behalf and on Your sole responsibility, not on behalf |
||||
of any other Contributor, and only if You agree to indemnify, |
||||
defend, and hold each Contributor harmless for any liability |
||||
incurred by, or claims asserted against, such Contributor by reason |
||||
of your accepting any such warranty or additional liability. |
||||
|
||||
END OF TERMS AND CONDITIONS |
||||
|
||||
APPENDIX: How to apply the Apache License to your work. |
||||
|
||||
To apply the Apache License to your work, attach the following |
||||
boilerplate notice, with the fields enclosed by brackets "[]" |
||||
replaced with your own identifying information. (Don't include |
||||
the brackets!) The text should be enclosed in the appropriate |
||||
comment syntax for the file format. We also recommend that a |
||||
file or class name and description of purpose be included on the |
||||
same "printed page" as the copyright notice for easier |
||||
identification within third-party archives. |
||||
|
||||
Copyright [yyyy] [name of copyright owner] |
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); |
||||
you may not use this file except in compliance with the License. |
||||
You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
||||
@ -0,0 +1,247 @@ |
||||
# Go gRPC Interceptors for Prometheus monitoring |
||||
|
||||
[Travis Build](https://travis-ci.org/grpc-ecosystem/go-grpc-prometheus) |
||||
[Go Report Card](http://goreportcard.com/report/grpc-ecosystem/go-grpc-prometheus) |
||||
[GoDoc](https://godoc.org/github.com/grpc-ecosystem/go-grpc-prometheus) |
||||
[SourceGraph](https://sourcegraph.com/github.com/grpc-ecosystem/go-grpc-prometheus/?badge) |
||||
[codecov](https://codecov.io/gh/grpc-ecosystem/go-grpc-prometheus) |
||||
[Apache 2.0 License](LICENSE) |
||||
|
||||
[Prometheus](https://prometheus.io/) monitoring for your [gRPC Go](https://github.com/grpc/grpc-go) servers and clients. |
||||
|
||||
A sister implementation for [gRPC Java](https://github.com/grpc/grpc-java) (same metrics, same semantics) is in [grpc-ecosystem/java-grpc-prometheus](https://github.com/grpc-ecosystem/java-grpc-prometheus). |
||||
|
||||
## Interceptors |
||||
|
||||
[gRPC Go](https://github.com/grpc/grpc-go) recently acquired support for Interceptors, i.e. middleware that is executed |
||||
by a gRPC Server before the request is passed onto the user's application logic. It is a perfect way to implement |
||||
common patterns: auth, logging and... monitoring. |
||||
|
||||
To use Interceptors in chains, please see [`go-grpc-middleware`](https://github.com/mwitkow/go-grpc-middleware). |
||||
|
||||
## Usage |
||||
|
||||
There are two types of interceptors: client-side and server-side. This package provides monitoring Interceptors for both. |
||||
|
||||
### Server-side |
||||
|
||||
```go |
||||
import "github.com/grpc-ecosystem/go-grpc-prometheus" |
||||
... |
||||
// Initialize your gRPC server's interceptor. |
||||
myServer := grpc.NewServer( |
||||
grpc.StreamInterceptor(grpc_prometheus.StreamServerInterceptor), |
||||
grpc.UnaryInterceptor(grpc_prometheus.UnaryServerInterceptor), |
||||
) |
||||
// Register your gRPC service implementations. |
||||
myservice.RegisterMyServiceServer(myServer, &myServiceImpl{}) |
||||
// After all your registrations, make sure all of the Prometheus metrics are initialized. |
||||
grpc_prometheus.Register(myServer) |
||||
// Register Prometheus metrics handler. |
||||
http.Handle("/metrics", promhttp.Handler()) |
||||
... |
||||
``` |
||||
|
||||
### Client-side |
||||
|
||||
```go |
||||
import "github.com/grpc-ecosystem/go-grpc-prometheus" |
||||
... |
||||
clientConn, err = grpc.Dial( |
||||
address, |
||||
grpc.WithUnaryInterceptor(grpc_prometheus.UnaryClientInterceptor), |
||||
grpc.WithStreamInterceptor(grpc_prometheus.StreamClientInterceptor), |
||||
) |
||||
client = pb_testproto.NewTestServiceClient(clientConn) |
||||
resp, err := client.PingEmpty(s.ctx, &myservice.Request{Msg: "hello"}) |
||||
... |
||||
``` |
||||
|
||||
# Metrics |
||||
|
||||
## Labels |
||||
|
||||
All server-side metrics start with `grpc_server` as Prometheus subsystem name. All client-side metrics start with `grpc_client`. Both of them have mirror-concepts. Similarly all methods |
||||
contain the same rich labels: |
||||
|
||||
* `grpc_service` - the [gRPC service](http://www.grpc.io/docs/#defining-a-service) name, which is the combination of protobuf `package` and |
||||
the `grpc_service` section name. E.g. for `package = mwitkow.testproto` and |
||||
`service TestService` the label will be `grpc_service="mwitkow.testproto.TestService"` |
||||
* `grpc_method` - the name of the method called on the gRPC service. E.g. |
||||
`grpc_method="Ping"` |
||||
* `grpc_type` - the gRPC [type of request](http://www.grpc.io/docs/guides/concepts.html#rpc-life-cycle). |
||||
Differentiating between the two is important especially for latency measurements. |
||||
|
||||
- `unary` is single request, single response RPC |
||||
- `client_stream` is a multi-request, single response RPC |
||||
- `server_stream` is a single request, multi-response RPC |
||||
- `bidi_stream` is a multi-request, multi-response RPC |
||||
|
||||
|
||||
Additionally for completed RPCs, the following labels are used: |
||||
|
||||
* `grpc_code` - the human-readable [gRPC status code](https://github.com/grpc/grpc-go/blob/master/codes/codes.go). |
||||
The list of all statuses is too long, but here are some common ones: |
||||
|
||||
- `OK` - means the RPC was successful |
||||
- `InvalidArgument` - RPC contained bad values |
||||
- `Internal` - server-side error not disclosed to the clients |
||||
|
||||
## Counters |
||||
|
||||
The counters and their up-to-date documentation are in [server_reporter.go](server_reporter.go) and [client_reporter.go](client_reporter.go), |
||||
as well as in the respective Prometheus handler (usually `/metrics`). |
||||
|
||||
For the purpose of this documentation we will only discuss `grpc_server` metrics. The `grpc_client` ones contain mirror concepts. |
||||
|
||||
For simplicity, let's assume we're tracking a single server-side RPC call of [`mwitkow.testproto.TestService`](examples/testproto/test.proto), |
||||
calling the method `PingList`. The call succeeds and returns 20 messages in the stream. |
||||
|
||||
First, immediately after the server receives the call it will increment the |
||||
`grpc_server_started_total` and start the handling time clock (if histograms are enabled). |
||||
|
||||
```jsoniq |
||||
grpc_server_started_total{grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1 |
||||
``` |
||||
|
||||
Then the user logic gets invoked. It receives one message from the client containing the request |
||||
(it's a `server_stream`): |
||||
|
||||
```jsoniq |
||||
grpc_server_msg_received_total{grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1 |
||||
``` |
||||
|
||||
The user logic may return an error, or send multiple messages back to the client. In this case, on |
||||
each of the 20 messages sent back, a counter will be incremented: |
||||
|
||||
```jsoniq |
||||
grpc_server_msg_sent_total{grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 20 |
||||
``` |
||||
|
||||
After the call completes, its status (`OK` or other [gRPC status code](https://github.com/grpc/grpc-go/blob/master/codes/codes.go)) |
||||
and the relevant call labels increment the `grpc_server_handled_total` counter. |
||||
|
||||
```jsoniq |
||||
grpc_server_handled_total{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1 |
||||
``` |
||||
|
||||
## Histograms |
||||
|
||||
[Prometheus histograms](https://prometheus.io/docs/concepts/metric_types/#histogram) are a great way |
||||
to measure latency distributions of your RPCs. However, since it is bad practice to have metrics |
||||
of [high cardinality](https://prometheus.io/docs/practices/instrumentation/#do-not-overuse-labels) |
||||
the latency monitoring metrics are disabled by default. To enable them please call the following |
||||
in your server initialization code: |
||||
|
||||
```go |
||||
grpc_prometheus.EnableHandlingTimeHistogram() |
||||
``` |
||||
|
||||
After the call completes, its handling time will be recorded in a [Prometheus histogram](https://prometheus.io/docs/concepts/metric_types/#histogram) |
||||
variable `grpc_server_handling_seconds`. The histogram variable contains three sub-metrics: |
||||
|
||||
* `grpc_server_handling_seconds_count` - the count of all completed RPCs by status and method |
||||
* `grpc_server_handling_seconds_sum` - cumulative time of RPCs by status and method, useful for |
||||
calculating average handling times |
||||
* `grpc_server_handling_seconds_bucket` - contains the counts of RPCs by status and method in respective |
||||
handling-time buckets. These buckets can be used by Prometheus to estimate SLAs (see [here](https://prometheus.io/docs/practices/histograms/)) |
||||
|
||||
The counter values will look as follows: |
||||
|
||||
```jsoniq |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.005"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.01"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.025"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.05"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.1"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.25"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.5"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="1"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="2.5"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="5"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="10"} 1 |
||||
grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="+Inf"} 1 |
||||
grpc_server_handling_seconds_sum{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 0.0003866430000000001 |
||||
grpc_server_handling_seconds_count{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1 |
||||
``` |
||||
|
||||
|
||||
## Useful query examples |
||||
|
||||
Prometheus philosophy is to provide raw metrics to the monitoring system, and |
||||
let the aggregations be handled there. The verbosity of the above metrics makes it possible to have that |
||||
flexibility. Here's a couple of useful monitoring queries: |
||||
|
||||
|
||||
### request inbound rate |
||||
```jsoniq |
||||
sum(rate(grpc_server_started_total{job="foo"}[1m])) by (grpc_service) |
||||
``` |
||||
For `job="foo"` (common label to differentiate between Prometheus monitoring targets), calculate the |
||||
rate of requests per second (1 minute window) for each gRPC `grpc_service` that the job has. Please note |
||||
how the `grpc_method` is being omitted here: all methods of a given gRPC service will be summed together. |
||||
|
||||
### unary request error rate |
||||
```jsoniq |
||||
sum(rate(grpc_server_handled_total{job="foo",grpc_type="unary",grpc_code!="OK"}[1m])) by (grpc_service) |
||||
``` |
||||
For `job="foo"`, calculate the per-`grpc_service` rate of `unary` (1:1) RPCs that failed, i.e. the |
||||
ones that didn't finish with `OK` code. |
||||
|
||||
### unary request error percentage |
||||
```jsoniq |
||||
sum(rate(grpc_server_handled_total{job="foo",grpc_type="unary",grpc_code!="OK"}[1m])) by (grpc_service) |
||||
/ |
||||
sum(rate(grpc_server_started_total{job="foo",grpc_type="unary"}[1m])) by (grpc_service) |
||||
* 100.0 |
||||
``` |
||||
For `job="foo"`, calculate the percentage of failed requests by service. It's easy to notice that |
||||
this is a combination of the two above examples. This is an example of a query you would like to |
||||
[alert on](https://prometheus.io/docs/alerting/rules/) in your system for SLA violations, e.g. |
||||
"no more than 1% requests should fail". |
||||
|
||||
### average response stream size |
||||
```jsoniq |
||||
sum(rate(grpc_server_msg_sent_total{job="foo",grpc_type="server_stream"}[10m])) by (grpc_service) |
||||
/ |
||||
sum(rate(grpc_server_started_total{job="foo",grpc_type="server_stream"}[10m])) by (grpc_service) |
||||
``` |
||||
For `job="foo"` what is the `grpc_service`-wide `10m` average of messages returned for all |
||||
`server_stream` RPCs. This allows you to track the stream sizes returned by your system, e.g. allows |
||||
you to track when clients started to send "wide" queries that return many messages. |
||||
Note the divisor is the number of started RPCs, in order to account for in-flight requests. |
||||
|
||||
### 99%-tile latency of unary requests |
||||
```jsoniq |
||||
histogram_quantile(0.99, |
||||
sum(rate(grpc_server_handling_seconds_bucket{job="foo",grpc_type="unary"}[5m])) by (grpc_service,le) |
||||
) |
||||
``` |
||||
For `job="foo"`, returns a 99%-tile [quantile estimation](https://prometheus.io/docs/practices/histograms/#quantiles) |
||||
of the handling time of RPCs per service. Please note the `5m` rate, this means that the quantile |
||||
estimation will take samples in a rolling `5m` window. When combined with other quantiles |
||||
(e.g. 50%, 90%), this query gives you tremendous insight into the responsiveness of your system |
||||
(e.g. impact of caching). |
||||
|
||||
### percentage of slow unary queries (>250ms) |
||||
```jsoniq |
||||
100.0 - ( |
||||
sum(rate(grpc_server_handling_seconds_bucket{job="foo",grpc_type="unary",le="0.25"}[5m])) by (grpc_service) |
||||
/ |
||||
sum(rate(grpc_server_handling_seconds_count{job="foo",grpc_type="unary"}[5m])) by (grpc_service) |
||||
) * 100.0 |
||||
``` |
||||
For `job="foo"` calculate the by-`grpc_service` fraction of slow requests that took longer than `0.25` |
||||
seconds. This query is relatively complex, since the Prometheus aggregations use `le` (less or equal) |
||||
buckets, meaning that counting "fast" requests fractions is easier. However, simple maths helps. |
||||
This is an example of a query you would like to alert on in your system for SLA violations, |
||||
e.g. "less than 1% of requests are slower than 250ms". |
||||
|
||||
|
||||
## Status |
||||
|
||||
This code has been used since August 2015 as the basis for monitoring of *production* gRPC micro services at [Improbable](https://improbable.io). |
||||
|
||||
## License |
||||
|
||||
`go-grpc-prometheus` is released under the Apache 2.0 license. See the [LICENSE](LICENSE) file for details. |
||||
@ -0,0 +1,39 @@ |
||||
// Copyright 2016 Michal Witkowski. All Rights Reserved.
|
||||
// See LICENSE for licensing terms.
|
||||
|
||||
// gRPC Prometheus monitoring interceptors for client-side gRPC.
|
||||
|
||||
package grpc_prometheus |
||||
|
||||
import ( |
||||
prom "github.com/prometheus/client_golang/prometheus" |
||||
) |
||||
|
||||
var ( |
||||
// DefaultClientMetrics is the default instance of ClientMetrics. It is
|
||||
// intended to be used in conjunction with the default Prometheus metrics
|
||||
// registry.
|
||||
DefaultClientMetrics = NewClientMetrics() |
||||
|
||||
// UnaryClientInterceptor is a gRPC client-side interceptor that provides Prometheus monitoring for Unary RPCs.
|
||||
UnaryClientInterceptor = DefaultClientMetrics.UnaryClientInterceptor() |
||||
|
||||
// StreamClientInterceptor is a gRPC client-side interceptor that provides Prometheus monitoring for Streaming RPCs.
|
||||
StreamClientInterceptor = DefaultClientMetrics.StreamClientInterceptor() |
||||
) |
||||
|
||||
func init() { |
||||
prom.MustRegister(DefaultClientMetrics.clientStartedCounter) |
||||
prom.MustRegister(DefaultClientMetrics.clientHandledCounter) |
||||
prom.MustRegister(DefaultClientMetrics.clientStreamMsgReceived) |
||||
prom.MustRegister(DefaultClientMetrics.clientStreamMsgSent) |
||||
} |
||||
|
||||
// EnableClientHandlingTimeHistogram turns on recording of handling time of
|
||||
// RPCs. Histogram metrics can be very expensive for Prometheus to retain and
|
||||
// query. This function acts on the DefaultClientMetrics variable and the
|
||||
// default Prometheus metrics registry.
|
||||
func EnableClientHandlingTimeHistogram(opts ...HistogramOption) { |
||||
DefaultClientMetrics.EnableClientHandlingTimeHistogram(opts...) |
||||
prom.Register(DefaultClientMetrics.clientHandledHistogram) |
||||
} |
||||
@ -0,0 +1,170 @@ |
||||
package grpc_prometheus |
||||
|
||||
import ( |
||||
"io" |
||||
|
||||
prom "github.com/prometheus/client_golang/prometheus" |
||||
"golang.org/x/net/context" |
||||
"google.golang.org/grpc" |
||||
"google.golang.org/grpc/codes" |
||||
"google.golang.org/grpc/status" |
||||
) |
||||
|
||||
// ClientMetrics represents a collection of metrics to be registered on a
|
||||
// Prometheus metrics registry for a gRPC client.
|
||||
type ClientMetrics struct { |
||||
clientStartedCounter *prom.CounterVec |
||||
clientHandledCounter *prom.CounterVec |
||||
clientStreamMsgReceived *prom.CounterVec |
||||
clientStreamMsgSent *prom.CounterVec |
||||
clientHandledHistogramEnabled bool |
||||
clientHandledHistogramOpts prom.HistogramOpts |
||||
clientHandledHistogram *prom.HistogramVec |
||||
} |
||||
|
||||
// NewClientMetrics returns a ClientMetrics object. Use a new instance of
|
||||
// ClientMetrics when not using the default Prometheus metrics registry, for
|
||||
// example when wanting to control which metrics are added to a registry as
|
||||
// opposed to automatically adding metrics via init functions.
|
||||
func NewClientMetrics(counterOpts ...CounterOption) *ClientMetrics { |
||||
opts := counterOptions(counterOpts) |
||||
return &ClientMetrics{ |
||||
clientStartedCounter: prom.NewCounterVec( |
||||
opts.apply(prom.CounterOpts{ |
||||
Name: "grpc_client_started_total", |
||||
Help: "Total number of RPCs started on the client.", |
||||
}), []string{"grpc_type", "grpc_service", "grpc_method"}), |
||||
|
||||
clientHandledCounter: prom.NewCounterVec( |
||||
opts.apply(prom.CounterOpts{ |
||||
Name: "grpc_client_handled_total", |
||||
Help: "Total number of RPCs completed by the client, regardless of success or failure.", |
||||
}), []string{"grpc_type", "grpc_service", "grpc_method", "grpc_code"}), |
||||
|
||||
clientStreamMsgReceived: prom.NewCounterVec( |
||||
opts.apply(prom.CounterOpts{ |
||||
Name: "grpc_client_msg_received_total", |
||||
Help: "Total number of RPC stream messages received by the client.", |
||||
}), []string{"grpc_type", "grpc_service", "grpc_method"}), |
||||
|
||||
clientStreamMsgSent: prom.NewCounterVec( |
||||
opts.apply(prom.CounterOpts{ |
||||
Name: "grpc_client_msg_sent_total", |
||||
Help: "Total number of gRPC stream messages sent by the client.", |
||||
}), []string{"grpc_type", "grpc_service", "grpc_method"}), |
||||
|
||||
clientHandledHistogramEnabled: false, |
||||
clientHandledHistogramOpts: prom.HistogramOpts{ |
||||
Name: "grpc_client_handling_seconds", |
||||
Help: "Histogram of response latency (seconds) of the gRPC until it is finished by the application.", |
||||
Buckets: prom.DefBuckets, |
||||
}, |
||||
clientHandledHistogram: nil, |
||||
} |
||||
} |
||||
|
||||
// Describe sends the super-set of all possible descriptors of metrics
|
||||
// collected by this Collector to the provided channel and returns once
|
||||
// the last descriptor has been sent.
|
||||
func (m *ClientMetrics) Describe(ch chan<- *prom.Desc) { |
||||
m.clientStartedCounter.Describe(ch) |
||||
m.clientHandledCounter.Describe(ch) |
||||
m.clientStreamMsgReceived.Describe(ch) |
||||
m.clientStreamMsgSent.Describe(ch) |
||||
if m.clientHandledHistogramEnabled { |
||||
m.clientHandledHistogram.Describe(ch) |
||||
} |
||||
} |
||||
|
||||
// Collect is called by the Prometheus registry when collecting
|
||||
// metrics. The implementation sends each collected metric via the
|
||||
// provided channel and returns once the last metric has been sent.
|
||||
func (m *ClientMetrics) Collect(ch chan<- prom.Metric) { |
||||
m.clientStartedCounter.Collect(ch) |
||||
m.clientHandledCounter.Collect(ch) |
||||
m.clientStreamMsgReceived.Collect(ch) |
||||
m.clientStreamMsgSent.Collect(ch) |
||||
if m.clientHandledHistogramEnabled { |
||||
m.clientHandledHistogram.Collect(ch) |
||||
} |
||||
} |
||||
|
||||
// EnableClientHandlingTimeHistogram turns on recording of handling time of RPCs.
|
||||
// Histogram metrics can be very expensive for Prometheus to retain and query.
|
||||
func (m *ClientMetrics) EnableClientHandlingTimeHistogram(opts ...HistogramOption) { |
||||
for _, o := range opts { |
||||
o(&m.clientHandledHistogramOpts) |
||||
} |
||||
if !m.clientHandledHistogramEnabled { |
||||
m.clientHandledHistogram = prom.NewHistogramVec( |
||||
m.clientHandledHistogramOpts, |
||||
[]string{"grpc_type", "grpc_service", "grpc_method"}, |
||||
) |
||||
} |
||||
m.clientHandledHistogramEnabled = true |
||||
} |
||||
|
||||
// UnaryClientInterceptor is a gRPC client-side interceptor that provides Prometheus monitoring for Unary RPCs.
|
||||
func (m *ClientMetrics) UnaryClientInterceptor() func(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error { |
||||
return func(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error { |
||||
monitor := newClientReporter(m, Unary, method) |
||||
monitor.SentMessage() |
||||
err := invoker(ctx, method, req, reply, cc, opts...) |
||||
if err != nil { |
||||
monitor.ReceivedMessage() |
||||
} |
||||
st, _ := status.FromError(err) |
||||
monitor.Handled(st.Code()) |
||||
return err |
||||
} |
||||
} |
||||
|
||||
// StreamClientInterceptor is a gRPC client-side interceptor that provides Prometheus monitoring for Streaming RPCs.
|
||||
func (m *ClientMetrics) StreamClientInterceptor() func(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) { |
||||
return func(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) { |
||||
monitor := newClientReporter(m, clientStreamType(desc), method) |
||||
clientStream, err := streamer(ctx, desc, cc, method, opts...) |
||||
if err != nil { |
||||
st, _ := status.FromError(err) |
||||
monitor.Handled(st.Code()) |
||||
return nil, err |
||||
} |
||||
return &monitoredClientStream{clientStream, monitor}, nil |
||||
} |
||||
} |
||||
|
||||
func clientStreamType(desc *grpc.StreamDesc) grpcType { |
||||
if desc.ClientStreams && !desc.ServerStreams { |
||||
return ClientStream |
||||
} else if !desc.ClientStreams && desc.ServerStreams { |
||||
return ServerStream |
||||
} |
||||
return BidiStream |
||||
} |
||||
|
||||
// monitoredClientStream wraps grpc.ClientStream allowing each Sent/Recv of message to increment counters.
|
||||
type monitoredClientStream struct { |
||||
grpc.ClientStream |
||||
monitor *clientReporter |
||||
} |
||||
|
||||
func (s *monitoredClientStream) SendMsg(m interface{}) error { |
||||
err := s.ClientStream.SendMsg(m) |
||||
if err == nil { |
||||
s.monitor.SentMessage() |
||||
} |
||||
return err |
||||
} |
||||
|
||||
func (s *monitoredClientStream) RecvMsg(m interface{}) error { |
||||
err := s.ClientStream.RecvMsg(m) |
||||
if err == nil { |
||||
s.monitor.ReceivedMessage() |
||||
} else if err == io.EOF { |
||||
s.monitor.Handled(codes.OK) |
||||
} else { |
||||
st, _ := status.FromError(err) |
||||
s.monitor.Handled(st.Code()) |
||||
} |
||||
return err |
||||
} |
||||
@ -0,0 +1,46 @@ |
||||
// Copyright 2016 Michal Witkowski. All Rights Reserved.
|
||||
// See LICENSE for licensing terms.
|
||||
|
||||
package grpc_prometheus |
||||
|
||||
import ( |
||||
"time" |
||||
|
||||
"google.golang.org/grpc/codes" |
||||
) |
||||
|
||||
type clientReporter struct { |
||||
metrics *ClientMetrics |
||||
rpcType grpcType |
||||
serviceName string |
||||
methodName string |
||||
startTime time.Time |
||||
} |
||||
|
||||
func newClientReporter(m *ClientMetrics, rpcType grpcType, fullMethod string) *clientReporter { |
||||
r := &clientReporter{ |
||||
metrics: m, |
||||
rpcType: rpcType, |
||||
} |
||||
if r.metrics.clientHandledHistogramEnabled { |
||||
r.startTime = time.Now() |
||||
} |
||||
r.serviceName, r.methodName = splitMethodName(fullMethod) |
||||
r.metrics.clientStartedCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() |
||||
return r |
||||
} |
||||
|
||||
func (r *clientReporter) ReceivedMessage() { |
||||
r.metrics.clientStreamMsgReceived.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() |
||||
} |
||||
|
||||
func (r *clientReporter) SentMessage() { |
||||
r.metrics.clientStreamMsgSent.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() |
||||
} |
||||
|
||||
func (r *clientReporter) Handled(code codes.Code) { |
||||
r.metrics.clientHandledCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName, code.String()).Inc() |
||||
if r.metrics.clientHandledHistogramEnabled { |
||||
r.metrics.clientHandledHistogram.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Observe(time.Since(r.startTime).Seconds()) |
||||
} |
||||
} |
||||
@ -0,0 +1,16 @@ |
||||
SHELL="/bin/bash"

# Every package of the module except vendored ones.
GOFILES_NOVENDOR = $(shell go list ./... | grep -v /vendor/)

# Default target: static checks, formatting, then the test suite.
all: vet fmt test

fmt:
	go fmt $(GOFILES_NOVENDOR)

vet:
	go vet $(GOFILES_NOVENDOR)

test: vet
	./scripts/test_all.sh

# fmt is declared phony as well, so a file or directory named "fmt" cannot
# make the target look up to date.
.PHONY: all fmt vet test
||||
@ -0,0 +1,41 @@ |
||||
package grpc_prometheus |
||||
|
||||
import ( |
||||
prom "github.com/prometheus/client_golang/prometheus" |
||||
) |
||||
|
||||
// A CounterOption lets you add options to Counter metrics using With* funcs.
|
||||
type CounterOption func(*prom.CounterOpts) |
||||
|
||||
type counterOptions []CounterOption |
||||
|
||||
func (co counterOptions) apply(o prom.CounterOpts) prom.CounterOpts { |
||||
for _, f := range co { |
||||
f(&o) |
||||
} |
||||
return o |
||||
} |
||||
|
||||
// WithConstLabels allows you to add ConstLabels to Counter metrics.
|
||||
func WithConstLabels(labels prom.Labels) CounterOption { |
||||
return func(o *prom.CounterOpts) { |
||||
o.ConstLabels = labels |
||||
} |
||||
} |
||||
|
||||
// A HistogramOption lets you add options to Histogram metrics using With*
|
||||
// funcs.
|
||||
type HistogramOption func(*prom.HistogramOpts) |
||||
|
||||
// WithHistogramBuckets allows you to specify custom bucket ranges for histograms if EnableHandlingTimeHistogram is on.
|
||||
func WithHistogramBuckets(buckets []float64) HistogramOption { |
||||
return func(o *prom.HistogramOpts) { o.Buckets = buckets } |
||||
} |
||||
|
||||
// WithHistogramConstLabels allows you to add custom ConstLabels to
|
||||
// histograms metrics.
|
||||
func WithHistogramConstLabels(labels prom.Labels) HistogramOption { |
||||
return func(o *prom.HistogramOpts) { |
||||
o.ConstLabels = labels |
||||
} |
||||
} |
||||
@ -0,0 +1,48 @@ |
||||
// Copyright 2016 Michal Witkowski. All Rights Reserved.
|
||||
// See LICENSE for licensing terms.
|
||||
|
||||
// gRPC Prometheus monitoring interceptors for server-side gRPC.
|
||||
|
||||
package grpc_prometheus |
||||
|
||||
import ( |
||||
prom "github.com/prometheus/client_golang/prometheus" |
||||
"google.golang.org/grpc" |
||||
) |
||||
|
||||
var ( |
||||
// DefaultServerMetrics is the default instance of ServerMetrics. It is
|
||||
// intended to be used in conjunction with the default Prometheus metrics
|
||||
// registry.
|
||||
DefaultServerMetrics = NewServerMetrics() |
||||
|
||||
// UnaryServerInterceptor is a gRPC server-side interceptor that provides Prometheus monitoring for Unary RPCs.
|
||||
UnaryServerInterceptor = DefaultServerMetrics.UnaryServerInterceptor() |
||||
|
||||
// StreamServerInterceptor is a gRPC server-side interceptor that provides Prometheus monitoring for Streaming RPCs.
|
||||
StreamServerInterceptor = DefaultServerMetrics.StreamServerInterceptor() |
||||
) |
||||
|
||||
func init() { |
||||
prom.MustRegister(DefaultServerMetrics.serverStartedCounter) |
||||
prom.MustRegister(DefaultServerMetrics.serverHandledCounter) |
||||
prom.MustRegister(DefaultServerMetrics.serverStreamMsgReceived) |
||||
prom.MustRegister(DefaultServerMetrics.serverStreamMsgSent) |
||||
} |
||||
|
||||
// Register takes a gRPC server and pre-initializes all counters to 0. This
|
||||
// allows for easier monitoring in Prometheus (no missing metrics), and should
|
||||
// be called *after* all services have been registered with the server. This
|
||||
// function acts on the DefaultServerMetrics variable.
|
||||
func Register(server *grpc.Server) { |
||||
DefaultServerMetrics.InitializeMetrics(server) |
||||
} |
||||
|
||||
// EnableHandlingTimeHistogram turns on recording of handling time
|
||||
// of RPCs. Histogram metrics can be very expensive for Prometheus
|
||||
// to retain and query. This function acts on the DefaultServerMetrics
|
||||
// variable and the default Prometheus metrics registry.
|
||||
func EnableHandlingTimeHistogram(opts ...HistogramOption) { |
||||
DefaultServerMetrics.EnableHandlingTimeHistogram(opts...) |
||||
prom.Register(DefaultServerMetrics.serverHandledHistogram) |
||||
} |
||||
@ -0,0 +1,185 @@ |
||||
package grpc_prometheus |
||||
|
||||
import ( |
||||
prom "github.com/prometheus/client_golang/prometheus" |
||||
"golang.org/x/net/context" |
||||
"google.golang.org/grpc" |
||||
"google.golang.org/grpc/status" |
||||
) |
||||
|
||||
// ServerMetrics represents a collection of metrics to be registered on a
|
||||
// Prometheus metrics registry for a gRPC server.
|
||||
type ServerMetrics struct { |
||||
serverStartedCounter *prom.CounterVec |
||||
serverHandledCounter *prom.CounterVec |
||||
serverStreamMsgReceived *prom.CounterVec |
||||
serverStreamMsgSent *prom.CounterVec |
||||
serverHandledHistogramEnabled bool |
||||
serverHandledHistogramOpts prom.HistogramOpts |
||||
serverHandledHistogram *prom.HistogramVec |
||||
} |
||||
|
||||
// NewServerMetrics returns a ServerMetrics object. Use a new instance of
|
||||
// ServerMetrics when not using the default Prometheus metrics registry, for
|
||||
// example when wanting to control which metrics are added to a registry as
|
||||
// opposed to automatically adding metrics via init functions.
|
||||
func NewServerMetrics(counterOpts ...CounterOption) *ServerMetrics { |
||||
opts := counterOptions(counterOpts) |
||||
return &ServerMetrics{ |
||||
serverStartedCounter: prom.NewCounterVec( |
||||
opts.apply(prom.CounterOpts{ |
||||
Name: "grpc_server_started_total", |
||||
Help: "Total number of RPCs started on the server.", |
||||
}), []string{"grpc_type", "grpc_service", "grpc_method"}), |
||||
serverHandledCounter: prom.NewCounterVec( |
||||
opts.apply(prom.CounterOpts{ |
||||
Name: "grpc_server_handled_total", |
||||
Help: "Total number of RPCs completed on the server, regardless of success or failure.", |
||||
}), []string{"grpc_type", "grpc_service", "grpc_method", "grpc_code"}), |
||||
serverStreamMsgReceived: prom.NewCounterVec( |
||||
opts.apply(prom.CounterOpts{ |
||||
Name: "grpc_server_msg_received_total", |
||||
Help: "Total number of RPC stream messages received on the server.", |
||||
}), []string{"grpc_type", "grpc_service", "grpc_method"}), |
||||
serverStreamMsgSent: prom.NewCounterVec( |
||||
opts.apply(prom.CounterOpts{ |
||||
Name: "grpc_server_msg_sent_total", |
||||
Help: "Total number of gRPC stream messages sent by the server.", |
||||
}), []string{"grpc_type", "grpc_service", "grpc_method"}), |
||||
serverHandledHistogramEnabled: false, |
||||
serverHandledHistogramOpts: prom.HistogramOpts{ |
||||
Name: "grpc_server_handling_seconds", |
||||
Help: "Histogram of response latency (seconds) of gRPC that had been application-level handled by the server.", |
||||
Buckets: prom.DefBuckets, |
||||
}, |
||||
serverHandledHistogram: nil, |
||||
} |
||||
} |
||||
|
||||
// EnableHandlingTimeHistogram enables histograms being registered when
|
||||
// registering the ServerMetrics on a Prometheus registry. Histograms can be
|
||||
// expensive on Prometheus servers. It takes options to configure histogram
|
||||
// options such as the defined buckets.
|
||||
func (m *ServerMetrics) EnableHandlingTimeHistogram(opts ...HistogramOption) { |
||||
for _, o := range opts { |
||||
o(&m.serverHandledHistogramOpts) |
||||
} |
||||
if !m.serverHandledHistogramEnabled { |
||||
m.serverHandledHistogram = prom.NewHistogramVec( |
||||
m.serverHandledHistogramOpts, |
||||
[]string{"grpc_type", "grpc_service", "grpc_method"}, |
||||
) |
||||
} |
||||
m.serverHandledHistogramEnabled = true |
||||
} |
||||
|
||||
// Describe sends the super-set of all possible descriptors of metrics
|
||||
// collected by this Collector to the provided channel and returns once
|
||||
// the last descriptor has been sent.
|
||||
func (m *ServerMetrics) Describe(ch chan<- *prom.Desc) { |
||||
m.serverStartedCounter.Describe(ch) |
||||
m.serverHandledCounter.Describe(ch) |
||||
m.serverStreamMsgReceived.Describe(ch) |
||||
m.serverStreamMsgSent.Describe(ch) |
||||
if m.serverHandledHistogramEnabled { |
||||
m.serverHandledHistogram.Describe(ch) |
||||
} |
||||
} |
||||
|
||||
// Collect is called by the Prometheus registry when collecting
|
||||
// metrics. The implementation sends each collected metric via the
|
||||
// provided channel and returns once the last metric has been sent.
|
||||
func (m *ServerMetrics) Collect(ch chan<- prom.Metric) { |
||||
m.serverStartedCounter.Collect(ch) |
||||
m.serverHandledCounter.Collect(ch) |
||||
m.serverStreamMsgReceived.Collect(ch) |
||||
m.serverStreamMsgSent.Collect(ch) |
||||
if m.serverHandledHistogramEnabled { |
||||
m.serverHandledHistogram.Collect(ch) |
||||
} |
||||
} |
||||
|
||||
// UnaryServerInterceptor is a gRPC server-side interceptor that provides Prometheus monitoring for Unary RPCs.
|
||||
func (m *ServerMetrics) UnaryServerInterceptor() func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { |
||||
return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { |
||||
monitor := newServerReporter(m, Unary, info.FullMethod) |
||||
monitor.ReceivedMessage() |
||||
resp, err := handler(ctx, req) |
||||
st, _ := status.FromError(err) |
||||
monitor.Handled(st.Code()) |
||||
if err == nil { |
||||
monitor.SentMessage() |
||||
} |
||||
return resp, err |
||||
} |
||||
} |
||||
|
||||
// StreamServerInterceptor is a gRPC server-side interceptor that provides Prometheus monitoring for Streaming RPCs.
|
||||
func (m *ServerMetrics) StreamServerInterceptor() func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { |
||||
return func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { |
||||
monitor := newServerReporter(m, streamRPCType(info), info.FullMethod) |
||||
err := handler(srv, &monitoredServerStream{ss, monitor}) |
||||
st, _ := status.FromError(err) |
||||
monitor.Handled(st.Code()) |
||||
return err |
||||
} |
||||
} |
||||
|
||||
// InitializeMetrics initializes all metrics, with their appropriate null
|
||||
// value, for all gRPC methods registered on a gRPC server. This is useful, to
|
||||
// ensure that all metrics exist when collecting and querying.
|
||||
func (m *ServerMetrics) InitializeMetrics(server *grpc.Server) { |
||||
serviceInfo := server.GetServiceInfo() |
||||
for serviceName, info := range serviceInfo { |
||||
for _, mInfo := range info.Methods { |
||||
preRegisterMethod(m, serviceName, &mInfo) |
||||
} |
||||
} |
||||
} |
||||
|
||||
func streamRPCType(info *grpc.StreamServerInfo) grpcType { |
||||
if info.IsClientStream && !info.IsServerStream { |
||||
return ClientStream |
||||
} else if !info.IsClientStream && info.IsServerStream { |
||||
return ServerStream |
||||
} |
||||
return BidiStream |
||||
} |
||||
|
||||
// monitoredServerStream wraps a grpc.ServerStream so that each message that is
// sent or received without error increments the counters of its reporter.
|
||||
type monitoredServerStream struct { |
||||
// The wrapped stream; SendMsg and RecvMsg delegate to it.
grpc.ServerStream |
||||
// monitor is the reporter notified of sent and received messages.
monitor *serverReporter |
||||
} |
||||
|
||||
func (s *monitoredServerStream) SendMsg(m interface{}) error { |
||||
err := s.ServerStream.SendMsg(m) |
||||
if err == nil { |
||||
s.monitor.SentMessage() |
||||
} |
||||
return err |
||||
} |
||||
|
||||
func (s *monitoredServerStream) RecvMsg(m interface{}) error { |
||||
err := s.ServerStream.RecvMsg(m) |
||||
if err == nil { |
||||
s.monitor.ReceivedMessage() |
||||
} |
||||
return err |
||||
} |
||||
|
||||
// preRegisterMethod is invoked on Register of a Server, allowing all gRPC services labels to be pre-populated.
|
||||
func preRegisterMethod(metrics *ServerMetrics, serviceName string, mInfo *grpc.MethodInfo) { |
||||
methodName := mInfo.Name |
||||
methodType := string(typeFromMethodInfo(mInfo)) |
||||
// These are just references (no increments), as just referencing will create the labels but not set values.
|
||||
metrics.serverStartedCounter.GetMetricWithLabelValues(methodType, serviceName, methodName) |
||||
metrics.serverStreamMsgReceived.GetMetricWithLabelValues(methodType, serviceName, methodName) |
||||
metrics.serverStreamMsgSent.GetMetricWithLabelValues(methodType, serviceName, methodName) |
||||
if metrics.serverHandledHistogramEnabled { |
||||
metrics.serverHandledHistogram.GetMetricWithLabelValues(methodType, serviceName, methodName) |
||||
} |
||||
for _, code := range allCodes { |
||||
metrics.serverHandledCounter.GetMetricWithLabelValues(methodType, serviceName, methodName, code.String()) |
||||
} |
||||
} |
||||
@ -0,0 +1,46 @@ |
||||
// Copyright 2016 Michal Witkowski. All Rights Reserved.
|
||||
// See LICENSE for licensing terms.
|
||||
|
||||
package grpc_prometheus |
||||
|
||||
import ( |
||||
"time" |
||||
|
||||
"google.golang.org/grpc/codes" |
||||
) |
||||
|
||||
// serverReporter records the metrics of a single RPC on a ServerMetrics.
type serverReporter struct { |
||||
// metrics holds the counter and histogram vectors that are updated.
metrics *ServerMetrics |
||||
// rpcType is passed as the first label value of every metric.
rpcType grpcType |
||||
// serviceName and methodName come from splitMethodName on the full method.
serviceName string |
||||
methodName string |
||||
// startTime is only set when the handling-time histogram is enabled; Handled
// uses it to compute the observed duration.
startTime time.Time |
||||
} |
||||
|
||||
func newServerReporter(m *ServerMetrics, rpcType grpcType, fullMethod string) *serverReporter { |
||||
r := &serverReporter{ |
||||
metrics: m, |
||||
rpcType: rpcType, |
||||
} |
||||
if r.metrics.serverHandledHistogramEnabled { |
||||
r.startTime = time.Now() |
||||
} |
||||
r.serviceName, r.methodName = splitMethodName(fullMethod) |
||||
r.metrics.serverStartedCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() |
||||
return r |
||||
} |
||||
|
||||
func (r *serverReporter) ReceivedMessage() { |
||||
r.metrics.serverStreamMsgReceived.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() |
||||
} |
||||
|
||||
func (r *serverReporter) SentMessage() { |
||||
r.metrics.serverStreamMsgSent.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() |
||||
} |
||||
|
||||
func (r *serverReporter) Handled(code codes.Code) { |
||||
r.metrics.serverHandledCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName, code.String()).Inc() |
||||
if r.metrics.serverHandledHistogramEnabled { |
||||
r.metrics.serverHandledHistogram.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Observe(time.Since(r.startTime).Seconds()) |
||||
} |
||||
} |
||||
@ -0,0 +1,50 @@ |
||||
// Copyright 2016 Michal Witkowski. All Rights Reserved.
|
||||
// See LICENSE for licensing terms.
|
||||
|
||||
package grpc_prometheus |
||||
|
||||
import ( |
||||
"strings" |
||||
|
||||
"google.golang.org/grpc" |
||||
"google.golang.org/grpc/codes" |
||||
) |
||||
|
||||
// grpcType names the kind of an RPC; its string form is used as the value of
// the "grpc_type" metric label.
type grpcType string |
||||
|
||||
// The RPC kinds, distinguished by which side streams messages.
const ( |
||||
// Unary is an RPC where neither the client nor the server streams.
Unary grpcType = "unary" |
||||
// ClientStream is an RPC where only the client streams.
ClientStream grpcType = "client_stream" |
||||
// ServerStream is an RPC where only the server streams.
ServerStream grpcType = "server_stream" |
||||
// BidiStream is an RPC where both the client and the server stream.
BidiStream grpcType = "bidi_stream" |
||||
) |
||||
|
||||
// allCodes lists the gRPC status codes for which preRegisterMethod looks up
// the handled counter of every method.
var ( |
||||
allCodes = []codes.Code{ |
||||
codes.OK, codes.Canceled, codes.Unknown, codes.InvalidArgument, codes.DeadlineExceeded, codes.NotFound, |
||||
codes.AlreadyExists, codes.PermissionDenied, codes.Unauthenticated, codes.ResourceExhausted, |
||||
codes.FailedPrecondition, codes.Aborted, codes.OutOfRange, codes.Unimplemented, codes.Internal, |
||||
codes.Unavailable, codes.DataLoss, |
||||
} |
||||
) |
||||
|
||||
// splitMethodName splits a full method name of the form "/service/method"
// into its service and method parts. One leading slash is removed, then the
// name is cut at the first remaining slash. When no slash remains, both
// results are "unknown".
func splitMethodName(fullMethodName string) (string, string) {
	trimmed := strings.TrimPrefix(fullMethodName, "/") // remove leading slash
	sep := strings.IndexByte(trimmed, '/')
	if sep < 0 {
		return "unknown", "unknown"
	}
	return trimmed[:sep], trimmed[sep+1:]
}
||||
|
||||
func typeFromMethodInfo(mInfo *grpc.MethodInfo) grpcType { |
||||
if !mInfo.IsClientStream && !mInfo.IsServerStream { |
||||
return Unary |
||||
} |
||||
if mInfo.IsClientStream && !mInfo.IsServerStream { |
||||
return ClientStream |
||||
} |
||||
if !mInfo.IsClientStream && mInfo.IsServerStream { |
||||
return ServerStream |
||||
} |
||||
return BidiStream |
||||
} |
||||
Loading…
Reference in new issue