From 8d75a003e979d3934e96924fa14f79cc5791aa54 Mon Sep 17 00:00:00 2001 From: Tim Ramlot <42113979+inteon@users.noreply.github.com> Date: Thu, 14 Sep 2023 13:55:44 +0200 Subject: [PATCH 1/3] add health probe that detects skew between 'real' system clock and 'monotonic' internal clock Signed-off-by: Tim Ramlot <42113979+inteon@users.noreply.github.com> --- deploy/charts/cert-manager/values.yaml | 2 +- pkg/healthz/clock_health.go | 64 ++++++++++++++++++++++++++ pkg/healthz/healthz.go | 4 +- 3 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 pkg/healthz/clock_health.go diff --git a/deploy/charts/cert-manager/values.yaml b/deploy/charts/cert-manager/values.yaml index 2d47d7141..5ccbc8ec8 100644 --- a/deploy/charts/cert-manager/values.yaml +++ b/deploy/charts/cert-manager/values.yaml @@ -284,7 +284,7 @@ topologySpreadConstraints: [] # controller-manager. See: # https://github.com/kubernetes/kubernetes/blob/806b30170c61a38fedd54cc9ede4cd6275a1ad3b/cmd/kubeadm/app/util/staticpod/utils.go#L241-L245 livenessProbe: - enabled: false + enabled: true initialDelaySeconds: 10 periodSeconds: 10 timeoutSeconds: 15 diff --git a/pkg/healthz/clock_health.go b/pkg/healthz/clock_health.go new file mode 100644 index 000000000..d4b389068 --- /dev/null +++ b/pkg/healthz/clock_health.go @@ -0,0 +1,64 @@ +/* +Copyright 2020 The cert-manager Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package healthz + +import ( + "fmt" + "net/http" + "time" + + "k8s.io/utils/clock" +) + +type clockHealthAdaptor struct { + clock clock.Clock + startTimeReal time.Time + startTimeMonotonic time.Time +} + +func NewClockHealthAdaptor(c clock.Clock) *clockHealthAdaptor { + return &clockHealthAdaptor{ + clock: c, + startTimeReal: c.Now().Round(0), // .Round(0) removes the monotonic part from the time + startTimeMonotonic: c.Now(), + } +} + +func (c *clockHealthAdaptor) skew() time.Duration { + realDuration := c.clock.Since(c.startTimeReal) + monotonicDuration := c.clock.Since(c.startTimeMonotonic) + + if monotonicDuration > realDuration { + return monotonicDuration - realDuration + } + + return realDuration - monotonicDuration +} + +// Name returns the name of the health check we are implementing. +func (l *clockHealthAdaptor) Name() string { + return "clockHealth" +} + +// Check is called by the healthz endpoint handler. +// It fails (returns an error) if we own the lease but had not been able to renew it. +func (l *clockHealthAdaptor) Check(req *http.Request) error { + if skew := l.skew(); skew > 1*time.Minute { + return fmt.Errorf("the system clock is out of sync with the internal monotonic clock by %v, which is more than the allowed 1m", skew) + } + return nil +} diff --git a/pkg/healthz/healthz.go b/pkg/healthz/healthz.go index 8e223c3e8..6f720ad75 100644 --- a/pkg/healthz/healthz.go +++ b/pkg/healthz/healthz.go @@ -26,6 +26,7 @@ import ( "golang.org/x/sync/errgroup" "k8s.io/apiserver/pkg/server/healthz" "k8s.io/client-go/tools/leaderelection" + "k8s.io/utils/clock" ) const ( @@ -51,8 +52,9 @@ type Server struct { // leader lease time, the leader election will be considered to have failed. func NewServer(leaderElectionHealthzAdaptorTimeout time.Duration) *Server { leaderHealthzAdaptor := leaderelection.NewLeaderHealthzAdaptor(leaderElectionHealthzAdaptorTimeout) + clockHealthAdaptor := NewClockHealthAdaptor(clock.RealClock{}) mux := http.NewServeMux() - healthz.InstallLivezHandler(mux, leaderHealthzAdaptor) + healthz.InstallLivezHandler(mux, leaderHealthzAdaptor, clockHealthAdaptor) return &Server{ server: &http.Server{ ReadTimeout: healthzServerReadTimeout, From 5d876c5b911821bb0bf32241cd6dd66d414dd30a Mon Sep 17 00:00:00 2001 From: Tim Ramlot <42113979+inteon@users.noreply.github.com> Date: Wed, 20 Sep 2023 18:23:13 +0200 Subject: [PATCH 2/3] improvements based on PR feedback Signed-off-by: Tim Ramlot <42113979+inteon@users.noreply.github.com> --- deploy/charts/cert-manager/values.yaml | 5 ++- pkg/healthz/clock_health.go | 43 +++++++++++++++++++------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/deploy/charts/cert-manager/values.yaml b/deploy/charts/cert-manager/values.yaml index 5ccbc8ec8..a063ca557 100644 --- a/deploy/charts/cert-manager/values.yaml +++ b/deploy/charts/cert-manager/values.yaml @@ -277,9 +277,8 @@ topologySpreadConstraints: [] # LivenessProbe settings for the controller container of the controller Pod. # -# Disabled by default, because the controller has a leader election mechanism -# which should cause it to exit if it is unable to renew its leader election -# record. +# Enabled by default, because we want to enable the clock-skew liveness probe that +# restarts the controller in case of a skew between the system clock and the monotonic clock. # LivenessProbe durations and thresholds are based on those used for the Kubernetes # controller-manager. See: # https://github.com/kubernetes/kubernetes/blob/806b30170c61a38fedd54cc9ede4cd6275a1ad3b/cmd/kubeadm/app/util/staticpod/utils.go#L241-L245 diff --git a/pkg/healthz/clock_health.go b/pkg/healthz/clock_health.go index d4b389068..e7cae61e9 100644 --- a/pkg/healthz/clock_health.go +++ b/pkg/healthz/clock_health.go @@ -24,6 +24,26 @@ import ( "k8s.io/utils/clock" ) +const maxClockSkew = 1 * time.Minute + +// The clockHealthAdaptor implements the HealthChecker interface. +// It checks the system clock is in sync with the internal monotonic clock. +// This is important because the internal monotonic clock is used to trigger certificate +// reconciles for renewals. If the monotonic clock is out of sync with the system clock +// then renewals might not be triggered in time. Ideally we would trigger renewals based +// on the system clock, but this is not (yet) possible in Go. +// See https://github.com/golang/go/issues/35012 +// +// A clock skew can be caused by: +// 1. The system clock being adjusted +// -> this eg. happens when ntp adjusts the system clock +// 2. Pausing the process (e.g. with SIGSTOP) +// -> the monotonic clock will stop, but the system clock will continue +// -> this eg. happens when you pause a VM/ hibernate a laptop +// +// Small clock skews of < 1m are allowed, because they can happen when the system clock is +// adjusted. However, we do compound the clock skew over time, so that if the clock skew +// is small but constant, it will eventually fail the health check. type clockHealthAdaptor struct { clock clock.Clock startTimeReal time.Time @@ -31,22 +51,20 @@ type clockHealthAdaptor struct { } func NewClockHealthAdaptor(c clock.Clock) *clockHealthAdaptor { + now := c.Now() return &clockHealthAdaptor{ clock: c, - startTimeReal: c.Now().Round(0), // .Round(0) removes the monotonic part from the time - startTimeMonotonic: c.Now(), + startTimeReal: now.Round(0), // .Round(0) removes the monotonic part from the time + startTimeMonotonic: now, } } func (c *clockHealthAdaptor) skew() time.Duration { - realDuration := c.clock.Since(c.startTimeReal) - monotonicDuration := c.clock.Since(c.startTimeMonotonic) + now := c.clock.Now() + realDuration := now.Sub(c.startTimeReal) + monotonicDuration := now.Sub(c.startTimeMonotonic) - if monotonicDuration > realDuration { - return monotonicDuration - realDuration - } - - return realDuration - monotonicDuration + return (realDuration - monotonicDuration).Abs() } // Name returns the name of the health check we are implementing. @@ -55,10 +73,11 @@ func (l *clockHealthAdaptor) Name() string { } // Check is called by the healthz endpoint handler. -// It fails (returns an error) if we own the lease but had not been able to renew it. +// It fails (returns an error) when the system clock is out of sync with the +// internal monotonic clock by more than the maxClockSkew. func (l *clockHealthAdaptor) Check(req *http.Request) error { - if skew := l.skew(); skew > 1*time.Minute { - return fmt.Errorf("the system clock is out of sync with the internal monotonic clock by %v, which is more than the allowed 1m", skew) + if skew := l.skew(); skew > maxClockSkew { + return fmt.Errorf("the system clock is out of sync with the internal monotonic clock by %v, which is more than the allowed %v", skew, maxClockSkew) } return nil } From 5049cd4d3541ce985afd766a3b616b013678e342 Mon Sep 17 00:00:00 2001 From: Tim Ramlot <42113979+inteon@users.noreply.github.com> Date: Wed, 27 Sep 2023 11:20:48 +0200 Subject: [PATCH 3/3] increase maxClockSkew to 5 minutes, just to be safe Signed-off-by: Tim Ramlot <42113979+inteon@users.noreply.github.com> --- pkg/healthz/clock_health.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/healthz/clock_health.go b/pkg/healthz/clock_health.go index e7cae61e9..d4ecbf234 100644 --- a/pkg/healthz/clock_health.go +++ b/pkg/healthz/clock_health.go @@ -24,7 +24,7 @@ import ( "k8s.io/utils/clock" ) -const maxClockSkew = 1 * time.Minute +const maxClockSkew = 5 * time.Minute // The clockHealthAdaptor implements the HealthChecker interface. // It checks the system clock is in sync with the internal monotonic clock. @@ -41,7 +41,7 @@ const maxClockSkew = 1 * time.Minute // -> the monotonic clock will stop, but the system clock will continue // -> this eg. happens when you pause a VM/ hibernate a laptop // -// Small clock skews of < 1m are allowed, because they can happen when the system clock is +// Small clock skews of < 5m are allowed, because they can happen when the system clock is // adjusted. However, we do compound the clock skew over time, so that if the clock skew // is small but constant, it will eventually fail the health check. type clockHealthAdaptor struct {