diff --git a/deploy/charts/cert-manager/values.yaml b/deploy/charts/cert-manager/values.yaml index 5ccbc8ec8..a063ca557 100644 --- a/deploy/charts/cert-manager/values.yaml +++ b/deploy/charts/cert-manager/values.yaml @@ -277,9 +277,8 @@ topologySpreadConstraints: [] # LivenessProbe settings for the controller container of the controller Pod. # -# Disabled by default, because the controller has a leader election mechanism -# which should cause it to exit if it is unable to renew its leader election -# record. +# Enabled by default, because we want to enable the clock-skew liveness probe that +# restarts the controller in case of a skew between the system clock and the monotonic clock. # LivenessProbe durations and thresholds are based on those used for the Kubernetes # controller-manager. See: # https://github.com/kubernetes/kubernetes/blob/806b30170c61a38fedd54cc9ede4cd6275a1ad3b/cmd/kubeadm/app/util/staticpod/utils.go#L241-L245 diff --git a/pkg/healthz/clock_health.go b/pkg/healthz/clock_health.go index d4b389068..e7cae61e9 100644 --- a/pkg/healthz/clock_health.go +++ b/pkg/healthz/clock_health.go @@ -24,6 +24,26 @@ import ( "k8s.io/utils/clock" ) +const maxClockSkew = 1 * time.Minute + +// The clockHealthAdaptor implements the HealthChecker interface. +// It checks that the system clock is in sync with the internal monotonic clock. +// This is important because the internal monotonic clock is used to trigger certificate +// reconciles for renewals. If the monotonic clock is out of sync with the system clock +// then renewals might not be triggered in time. Ideally we would trigger renewals based +// on the system clock, but this is not (yet) possible in Go. +// See https://github.com/golang/go/issues/35012 +// +// A clock skew can be caused by: +// 1. The system clock being adjusted +// -> e.g. this happens when NTP adjusts the system clock +// 2. Pausing the process (e.g. 
with SIGSTOP) +// -> the monotonic clock will stop, but the system clock will continue +// -> e.g. this happens when you pause a VM or hibernate a laptop +// +// Small clock skews of < 1m are allowed, because they can happen when the system clock is +// adjusted. However, the skew is measured since startup and therefore accumulates over time, +// so a small but persistent drift will eventually fail the health check. type clockHealthAdaptor struct { clock clock.Clock startTimeReal time.Time @@ -31,22 +51,20 @@ type clockHealthAdaptor struct { } func NewClockHealthAdaptor(c clock.Clock) *clockHealthAdaptor { + now := c.Now() return &clockHealthAdaptor{ clock: c, - startTimeReal: c.Now().Round(0), // .Round(0) removes the monotonic part from the time - startTimeMonotonic: c.Now(), + startTimeReal: now.Round(0), // .Round(0) removes the monotonic part from the time + startTimeMonotonic: now, } } func (c *clockHealthAdaptor) skew() time.Duration { - realDuration := c.clock.Since(c.startTimeReal) - monotonicDuration := c.clock.Since(c.startTimeMonotonic) + now := c.clock.Now() + realDuration := now.Sub(c.startTimeReal) + monotonicDuration := now.Sub(c.startTimeMonotonic) - if monotonicDuration > realDuration { - return monotonicDuration - realDuration - } - - return realDuration - monotonicDuration + return (realDuration - monotonicDuration).Abs() } // Name returns the name of the health check we are implementing. @@ -55,10 +73,11 @@ func (l *clockHealthAdaptor) Name() string { } // Check is called by the healthz endpoint handler. -// It fails (returns an error) if we own the lease but had not been able to renew it. +// It fails (returns an error) when the system clock is out of sync with the +// internal monotonic clock by more than maxClockSkew. 
func (l *clockHealthAdaptor) Check(req *http.Request) error { - if skew := l.skew(); skew > 1*time.Minute { - return fmt.Errorf("the system clock is out of sync with the internal monotonic clock by %v, which is more than the allowed 1m", skew) + if skew := l.skew(); skew > maxClockSkew { + return fmt.Errorf("the system clock is out of sync with the internal monotonic clock by %v, which is more than the allowed %v", skew, maxClockSkew) } return nil }