improvements based on PR feedback

Signed-off-by: Tim Ramlot <42113979+inteon@users.noreply.github.com>
This commit is contained in:
Tim Ramlot 2023-09-20 18:23:13 +02:00
parent 8d75a003e9
commit 5d876c5b91
No known key found for this signature in database
GPG Key ID: 47428728E0C2878D
2 changed files with 33 additions and 15 deletions

View File

@ -277,9 +277,8 @@ topologySpreadConstraints: []
# LivenessProbe settings for the controller container of the controller Pod.
#
# Disabled by default, because the controller has a leader election mechanism
# which should cause it to exit if it is unable to renew its leader election
# record.
# Enabled by default, because we want to enable the clock-skew liveness probe that
# restarts the controller in case of a skew between the system clock and the monotonic clock.
# LivenessProbe durations and thresholds are based on those used for the Kubernetes
# controller-manager. See:
# https://github.com/kubernetes/kubernetes/blob/806b30170c61a38fedd54cc9ede4cd6275a1ad3b/cmd/kubeadm/app/util/staticpod/utils.go#L241-L245

View File

@ -24,6 +24,26 @@ import (
"k8s.io/utils/clock"
)
// maxClockSkew is the threshold for the clock health check: a skew between the
// system clock and the monotonic clock greater than this is reported as an error.
const maxClockSkew = time.Minute
// The clockHealthAdaptor implements the HealthChecker interface.
// It checks the system clock is in sync with the internal monotonic clock.
// This is important because the internal monotonic clock is used to trigger certificate
// reconciles for renewals. If the monotonic clock is out of sync with the system clock
// then renewals might not be triggered in time. Ideally we would trigger renewals based
// on the system clock, but this is not (yet) possible in Go.
// See https://github.com/golang/go/issues/35012
//
// A clock skew can be caused by:
// 1. The system clock being adjusted
// -> this happens, e.g., when NTP adjusts the system clock
// 2. Pausing the process (e.g. with SIGSTOP)
// -> the monotonic clock will stop, but the system clock will continue
// -> this happens, e.g., when you pause a VM or hibernate a laptop
//
// Small clock skews of < 1m are allowed, because they can happen when the system clock is
// adjusted. However, we do compound the clock skew over time, so that if the clock skew
// is small but constant, it will eventually fail the health check.
type clockHealthAdaptor struct {
clock clock.Clock
startTimeReal time.Time
@ -31,22 +51,20 @@ type clockHealthAdaptor struct {
}
// NewClockHealthAdaptor returns a clockHealthAdaptor that measures clock skew
// relative to the moment of construction, as reported by c.
//
// The clock is read exactly once so that the wall-clock start time and the
// monotonic start time describe the same instant; reading it twice would
// build a small artificial skew into every later measurement.
func NewClockHealthAdaptor(c clock.Clock) *clockHealthAdaptor {
	now := c.Now()

	return &clockHealthAdaptor{
		clock:              c,
		startTimeReal:      now.Round(0), // .Round(0) removes the monotonic part from the time
		startTimeMonotonic: now,
	}
}
// skew returns the absolute difference between the time elapsed since
// construction as measured against the wall-clock start time and as measured
// against the start time that still carries its monotonic reading.
//
// The clock is sampled once so that both durations end at the same instant.
func (c *clockHealthAdaptor) skew() time.Duration {
	now := c.clock.Now()

	// startTimeReal had its monotonic reading stripped, so this subtraction
	// is based on the wall clock.
	realDuration := now.Sub(c.startTimeReal)
	// When both operands carry a monotonic reading, Sub uses it and ignores
	// wall-clock adjustments.
	monotonicDuration := now.Sub(c.startTimeMonotonic)

	return (realDuration - monotonicDuration).Abs()
}
// Name returns the name of the health check we are implementing.
@ -55,10 +73,11 @@ func (l *clockHealthAdaptor) Name() string {
}
// Check is called by the healthz endpoint handler.
// It fails (returns an error) if we own the lease but had not been able to renew it.
// It fails (returns an error) when the system clock is out of sync with the
// internal monotonic clock by more than the maxClockSkew.
func (l *clockHealthAdaptor) Check(req *http.Request) error {
if skew := l.skew(); skew > 1*time.Minute {
return fmt.Errorf("the system clock is out of sync with the internal monotonic clock by %v, which is more than the allowed 1m", skew)
if skew := l.skew(); skew > maxClockSkew {
return fmt.Errorf("the system clock is out of sync with the internal monotonic clock by %v, which is more than the allowed %v", skew, maxClockSkew)
}
return nil
}