The open and composable observability and data visualization platform. Visualize metrics, logs, and traces from multiple sources like Prometheus, Loki, Elasticsearch, InfluxDB, Postgres and many more.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 
grafana/pkg/aggregator/availableController.go

466 lines
17 KiB

// SPDX-License-Identifier: AGPL-3.0-only
// Provenance-includes-location: https://github.com/kubernetes/kube-aggregator/blob/master/pkg/controllers/status/available_controller.go
// Provenance-includes-license: Apache-2.0
// Provenance-includes-copyright: The Kubernetes Authors.
package aggregator
import (
"context"
"fmt"
"net/http"
"net/url"
"reflect"
"sync"
"time"
"github.com/grafana/grafana/pkg/apis/service/v0alpha1"
informersservicev0alpha1 "github.com/grafana/grafana/pkg/generated/informers/externalversions/service/v0alpha1"
listersservicev0alpha1 "github.com/grafana/grafana/pkg/generated/listers/service/v0alpha1"
"k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/transport"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"
apiregistrationv1 "k8s.io/kube-aggregator/pkg/apis/apiregistration/v1"
apiregistrationv1apihelper "k8s.io/kube-aggregator/pkg/apis/apiregistration/v1/helper"
apiregistrationclient "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/typed/apiregistration/v1"
informers "k8s.io/kube-aggregator/pkg/client/informers/externalversions/apiregistration/v1"
listers "k8s.io/kube-aggregator/pkg/client/listers/apiregistration/v1"
"k8s.io/kube-aggregator/pkg/controllers"
)
type certKeyFunc func() ([]byte, []byte)
// ServiceResolver knows how to convert a service reference into an actual location.
type ServiceResolver interface {
ResolveEndpoint(namespace, name string, port int32) (*url.URL, error)
}
// AvailableConditionController handles checking the availability of registered API services.
type AvailableConditionController struct {
apiServiceClient apiregistrationclient.APIServicesGetter
apiServiceLister listers.APIServiceLister
apiServiceSynced cache.InformerSynced
// externalNameLister is used to get the IP to create the transport for
externalNameLister listersservicev0alpha1.ExternalNameLister
servicesSynced cache.InformerSynced
// proxyTransportDial specifies the dial function for creating unencrypted TCP connections.
proxyTransportDial *transport.DialHolder
proxyCurrentCertKeyContent certKeyFunc
serviceResolver ServiceResolver
// To allow injection for testing.
syncFn func(key string) error
queue workqueue.RateLimitingInterface
// map from service-namespace -> service-name -> apiservice names
cache map[string]map[string][]string
// this lock protects operations on the above cache
cacheLock sync.RWMutex
}
// NewAvailableConditionController returns a new AvailableConditionController.
func NewAvailableConditionController(
apiServiceInformer informers.APIServiceInformer,
externalNameInformer informersservicev0alpha1.ExternalNameInformer,
apiServiceClient apiregistrationclient.APIServicesGetter,
proxyTransportDial *transport.DialHolder,
proxyCurrentCertKeyContent certKeyFunc,
serviceResolver ServiceResolver,
) (*AvailableConditionController, error) {
c := &AvailableConditionController{
apiServiceClient: apiServiceClient,
apiServiceLister: apiServiceInformer.Lister(),
externalNameLister: externalNameInformer.Lister(),
serviceResolver: serviceResolver,
queue: workqueue.NewNamedRateLimitingQueue(
// We want a fairly tight requeue time. The controller listens to the API, but because it relies on the routability of the
// service network, it is possible for an external, non-watchable factor to affect availability. This keeps
// the maximum disruption time to a minimum, but it does prevent hot loops.
workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 30*time.Second),
"AvailableConditionController"),
proxyTransportDial: proxyTransportDial,
proxyCurrentCertKeyContent: proxyCurrentCertKeyContent,
}
// resync on this one because it is low cardinality and rechecking the actual discovery
// allows us to detect health in a more timely fashion when network connectivity to
// nodes is snipped, but the network still attempts to route there. See
// https://github.com/openshift/origin/issues/17159#issuecomment-341798063
apiServiceHandler, _ := apiServiceInformer.Informer().AddEventHandlerWithResyncPeriod(
cache.ResourceEventHandlerFuncs{
AddFunc: c.addAPIService,
UpdateFunc: c.updateAPIService,
DeleteFunc: c.deleteAPIService,
},
30*time.Second)
c.apiServiceSynced = apiServiceHandler.HasSynced
serviceHandler, _ := externalNameInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: c.addService,
UpdateFunc: c.updateService,
DeleteFunc: c.deleteService,
})
c.servicesSynced = serviceHandler.HasSynced
c.syncFn = c.sync
return c, nil
}
func (c *AvailableConditionController) sync(key string) error {
originalAPIService, err := c.apiServiceLister.Get(key)
if apierrors.IsNotFound(err) {
return nil
}
if err != nil {
return err
}
// if a particular transport was specified, use that otherwise build one
// construct an http client that will ignore TLS verification (if someone owns the network and messes with your status
// that's not so bad) and sets a very short timeout. This is a best effort GET that provides no additional information
transportConfig := &transport.Config{
TLS: transport.TLSConfig{
Insecure: true,
},
DialHolder: c.proxyTransportDial,
}
if c.proxyCurrentCertKeyContent != nil {
proxyClientCert, proxyClientKey := c.proxyCurrentCertKeyContent()
transportConfig.TLS.CertData = proxyClientCert
transportConfig.TLS.KeyData = proxyClientKey
}
restTransport, err := transport.New(transportConfig)
if err != nil {
return err
}
discoveryClient := &http.Client{
Transport: restTransport,
// the request should happen quickly.
Timeout: 5 * time.Second,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
},
}
apiService := originalAPIService.DeepCopy()
availableCondition := apiregistrationv1.APIServiceCondition{
Type: apiregistrationv1.Available,
Status: apiregistrationv1.ConditionTrue,
LastTransitionTime: metav1.Now(),
}
// local API services are always considered available
if apiService.Spec.Service == nil {
apiregistrationv1apihelper.SetAPIServiceCondition(apiService, apiregistrationv1apihelper.NewLocalAvailableAPIServiceCondition())
_, err := c.updateAPIServiceStatus(originalAPIService, apiService)
return err
}
_, err = c.externalNameLister.ExternalNames(apiService.Spec.Service.Namespace).Get(apiService.Spec.Service.Name)
if apierrors.IsNotFound(err) {
availableCondition.Status = apiregistrationv1.ConditionFalse
availableCondition.Reason = "ServiceNotFound"
availableCondition.Message = fmt.Sprintf("service/%s in %q is not present", apiService.Spec.Service.Name, apiService.Spec.Service.Namespace)
apiregistrationv1apihelper.SetAPIServiceCondition(apiService, availableCondition)
_, err := c.updateAPIServiceStatus(originalAPIService, apiService)
return err
} else if err != nil {
availableCondition.Status = apiregistrationv1.ConditionUnknown
availableCondition.Reason = "ServiceAccessError"
availableCondition.Message = fmt.Sprintf("service/%s in %q cannot be checked due to: %v", apiService.Spec.Service.Name, apiService.Spec.Service.Namespace, err)
apiregistrationv1apihelper.SetAPIServiceCondition(apiService, availableCondition)
_, err := c.updateAPIServiceStatus(originalAPIService, apiService)
return err
}
// actually try to hit the discovery endpoint when it isn't local and when we're routing as a service.
if apiService.Spec.Service != nil && c.serviceResolver != nil {
attempts := 5
results := make(chan error, attempts)
for i := 0; i < attempts; i++ {
go func() {
discoveryURL, err := c.serviceResolver.ResolveEndpoint(apiService.Spec.Service.Namespace, apiService.Spec.Service.Name, *apiService.Spec.Service.Port)
if err != nil {
results <- err
return
}
// render legacyAPIService health check path when it is delegated to a service
if apiService.Name == "v1." {
discoveryURL.Path = "/api/" + apiService.Spec.Version
} else {
discoveryURL.Path = "/apis/" + apiService.Spec.Group + "/" + apiService.Spec.Version
}
errCh := make(chan error, 1)
go func() {
// be sure to check a URL that the aggregated API server is required to serve
newReq, err := http.NewRequest("GET", discoveryURL.String(), nil)
if err != nil {
errCh <- err
return
}
// setting the system-masters identity ensures that we will always have access rights
transport.SetAuthProxyHeaders(newReq, "system:kube-aggregator", []string{"system:masters"}, nil)
resp, err := discoveryClient.Do(newReq)
if resp != nil {
_ = resp.Body.Close()
// we should always been in the 200s or 300s
if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
errCh <- fmt.Errorf("bad status from %v: %v", discoveryURL, resp.StatusCode)
return
}
}
errCh <- err
}()
select {
case err = <-errCh:
if err != nil {
results <- fmt.Errorf("failing or missing response from %v: %v", discoveryURL, err)
return
}
// we had trouble with slow dial and DNS responses causing us to wait too long.
// we added this as insurance
case <-time.After(6 * time.Second):
results <- fmt.Errorf("timed out waiting for %v", discoveryURL)
return
}
results <- nil
}()
}
var lastError error
for i := 0; i < attempts; i++ {
lastError = <-results
// if we had at least one success, we are successful overall and we can return now
if lastError == nil {
break
}
}
if lastError != nil {
availableCondition.Status = apiregistrationv1.ConditionFalse
availableCondition.Reason = "FailedDiscoveryCheck"
availableCondition.Message = lastError.Error()
apiregistrationv1apihelper.SetAPIServiceCondition(apiService, availableCondition)
_, updateErr := c.updateAPIServiceStatus(originalAPIService, apiService)
if updateErr != nil {
return updateErr
}
// force a requeue to make it very obvious that this will be retried at some point in the future
// along with other requeues done via service change, endpoint change, and resync
return lastError
}
}
availableCondition.Reason = "Passed"
availableCondition.Message = "all checks passed"
apiregistrationv1apihelper.SetAPIServiceCondition(apiService, availableCondition)
_, err = c.updateAPIServiceStatus(originalAPIService, apiService)
return err
}
// updateAPIServiceStatus only issues an update if a change is detected. We have a tight resync loop to quickly detect dead
// apiservices. Doing that means we don't want to quickly issue no-op updates.
func (c *AvailableConditionController) updateAPIServiceStatus(originalAPIService, newAPIService *apiregistrationv1.APIService) (*apiregistrationv1.APIService, error) {
if equality.Semantic.DeepEqual(originalAPIService.Status, newAPIService.Status) {
return newAPIService, nil
}
orig := apiregistrationv1apihelper.GetAPIServiceConditionByType(originalAPIService, apiregistrationv1.Available)
now := apiregistrationv1apihelper.GetAPIServiceConditionByType(newAPIService, apiregistrationv1.Available)
unknown := apiregistrationv1.APIServiceCondition{
Type: apiregistrationv1.Available,
Status: apiregistrationv1.ConditionUnknown,
}
if orig == nil {
orig = &unknown
}
if now == nil {
now = &unknown
}
if *orig != *now {
klog.V(2).InfoS("changing APIService availability", "name", newAPIService.Name, "oldStatus", orig.Status, "newStatus", now.Status, "message", now.Message, "reason", now.Reason)
}
newAPIService, err := c.apiServiceClient.APIServices().UpdateStatus(context.TODO(), newAPIService, metav1.UpdateOptions{})
if err != nil {
return nil, err
}
return newAPIService, nil
}
// Run starts the AvailableConditionController loop which manages the availability condition of API services.
func (c *AvailableConditionController) Run(workers int, stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
defer c.queue.ShutDown()
klog.Info("Starting AvailableConditionController")
defer klog.Info("Shutting down AvailableConditionController")
// This waits not just for the informers to sync, but for our handlers
// to be called; since the handlers are three different ways of
// enqueueing the same thing, waiting for this permits the queue to
// maximally de-duplicate the entries.
if !controllers.WaitForCacheSync("AvailableConditionCOverrideController", stopCh, c.apiServiceSynced, c.servicesSynced) {
return
}
for i := 0; i < workers; i++ {
go wait.Until(c.runWorker, time.Second, stopCh)
}
<-stopCh
}
func (c *AvailableConditionController) runWorker() {
for c.processNextWorkItem() {
}
}
// processNextWorkItem deals with one key off the queue. It returns false when it's time to quit.
func (c *AvailableConditionController) processNextWorkItem() bool {
key, quit := c.queue.Get()
if quit {
return false
}
defer c.queue.Done(key)
err := c.syncFn(key.(string))
if err == nil {
c.queue.Forget(key)
return true
}
utilruntime.HandleError(fmt.Errorf("%v failed with: %v", key, err))
c.queue.AddRateLimited(key)
return true
}
func (c *AvailableConditionController) addAPIService(obj interface{}) {
castObj := obj.(*apiregistrationv1.APIService)
klog.V(4).Infof("Adding %s", castObj.Name)
if castObj.Spec.Service != nil {
c.rebuildAPIServiceCache()
}
c.queue.Add(castObj.Name)
}
func (c *AvailableConditionController) updateAPIService(oldObj, newObj interface{}) {
castObj := newObj.(*apiregistrationv1.APIService)
oldCastObj := oldObj.(*apiregistrationv1.APIService)
klog.V(4).Infof("Updating %s", oldCastObj.Name)
if !reflect.DeepEqual(castObj.Spec.Service, oldCastObj.Spec.Service) {
c.rebuildAPIServiceCache()
}
c.queue.Add(oldCastObj.Name)
}
func (c *AvailableConditionController) deleteAPIService(obj interface{}) {
castObj, ok := obj.(*apiregistrationv1.APIService)
if !ok {
tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
klog.Errorf("Couldn't get object from tombstone %#v", obj)
return
}
castObj, ok = tombstone.Obj.(*apiregistrationv1.APIService)
if !ok {
klog.Errorf("Tombstone contained object that is not expected %#v", obj)
return
}
}
klog.V(4).Infof("Deleting %q", castObj.Name)
if castObj.Spec.Service != nil {
c.rebuildAPIServiceCache()
}
c.queue.Add(castObj.Name)
}
func (c *AvailableConditionController) getAPIServicesFor(obj runtime.Object) []string {
metadata, err := meta.Accessor(obj)
if err != nil {
utilruntime.HandleError(err)
return nil
}
c.cacheLock.RLock()
defer c.cacheLock.RUnlock()
return c.cache[metadata.GetNamespace()][metadata.GetName()]
}
// if the service/endpoint handler wins the race against the cache rebuilding, it may queue a no-longer-relevant apiservice
// (which will get processed an extra time - this doesn't matter),
// and miss a newly relevant apiservice (which will get queued by the apiservice handler)
func (c *AvailableConditionController) rebuildAPIServiceCache() {
apiServiceList, _ := c.apiServiceLister.List(labels.Everything())
newCache := map[string]map[string][]string{}
for _, apiService := range apiServiceList {
if apiService.Spec.Service == nil {
continue
}
if newCache[apiService.Spec.Service.Namespace] == nil {
newCache[apiService.Spec.Service.Namespace] = map[string][]string{}
}
newCache[apiService.Spec.Service.Namespace][apiService.Spec.Service.Name] = append(newCache[apiService.Spec.Service.Namespace][apiService.Spec.Service.Name], apiService.Name)
}
c.cacheLock.Lock()
defer c.cacheLock.Unlock()
c.cache = newCache
}
// TODO, think of a way to avoid checking on every service manipulation
func (c *AvailableConditionController) addService(obj interface{}) {
for _, apiService := range c.getAPIServicesFor(obj.(*v0alpha1.ExternalName)) {
c.queue.Add(apiService)
}
}
func (c *AvailableConditionController) updateService(obj, _ interface{}) {
for _, apiService := range c.getAPIServicesFor(obj.(*v0alpha1.ExternalName)) {
c.queue.Add(apiService)
}
}
func (c *AvailableConditionController) deleteService(obj interface{}) {
castObj, ok := obj.(*v0alpha1.ExternalName)
if !ok {
tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
klog.Errorf("Couldn't get object from tombstone %#v", obj)
return
}
castObj, ok = tombstone.Obj.(*v0alpha1.ExternalName)
if !ok {
klog.Errorf("Tombstone contained object that is not expected %#v", obj)
return
}
}
for _, apiService := range c.getAPIServicesFor(castObj) {
c.queue.Add(apiService)
}
}