diff --git a/deploy/charts/cert-manager/values.yaml b/deploy/charts/cert-manager/values.yaml index 2d47d7141..a063ca557 100644 --- a/deploy/charts/cert-manager/values.yaml +++ b/deploy/charts/cert-manager/values.yaml @@ -277,14 +277,13 @@ topologySpreadConstraints: [] # LivenessProbe settings for the controller container of the controller Pod. # -# Disabled by default, because the controller has a leader election mechanism -# which should cause it to exit if it is unable to renew its leader election -# record. +# Enabled by default, because we want to enable the clock-skew liveness probe that +# restarts the controller in case of a skew between the system clock and the monotonic clock. # LivenessProbe durations and thresholds are based on those used for the Kubernetes # controller-manager. See: # https://github.com/kubernetes/kubernetes/blob/806b30170c61a38fedd54cc9ede4cd6275a1ad3b/cmd/kubeadm/app/util/staticpod/utils.go#L241-L245 livenessProbe: - enabled: false + enabled: true initialDelaySeconds: 10 periodSeconds: 10 timeoutSeconds: 15 diff --git a/pkg/healthz/clock_health.go b/pkg/healthz/clock_health.go new file mode 100644 index 000000000..d4ecbf234 --- /dev/null +++ b/pkg/healthz/clock_health.go @@ -0,0 +1,83 @@ +/* +Copyright 2020 The cert-manager Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package healthz + +import ( + "fmt" + "net/http" + "time" + + "k8s.io/utils/clock" +) + +const maxClockSkew = 5 * time.Minute + +// The clockHealthAdaptor implements the HealthChecker interface. +// It checks the system clock is in sync with the internal monotonic clock. +// This is important because the internal monotonic clock is used to trigger certificate +// reconciles for renewals. If the monotonic clock is out of sync with the system clock +// then renewals might not be triggered in time. Ideally we would trigger renewals based +// on the system clock, but this is not (yet) possible in Go. +// See https://github.com/golang/go/issues/35012 +// +// A clock skew can be caused by: +// 1. The system clock being adjusted +// -> this eg. happens when ntp adjusts the system clock +// 2. Pausing the process (e.g. with SIGSTOP) +// -> the monotonic clock will stop, but the system clock will continue +// -> this eg. happens when you pause a VM/ hibernate a laptop +// +// Small clock skews of < 5m are allowed, because they can happen when the system clock is +// adjusted. However, we do compound the clock skew over time, so that if the clock skew +// is small but constant, it will eventually fail the health check. +type clockHealthAdaptor struct { + clock clock.Clock + startTimeReal time.Time + startTimeMonotonic time.Time +} + +func NewClockHealthAdaptor(c clock.Clock) *clockHealthAdaptor { + now := c.Now() + return &clockHealthAdaptor{ + clock: c, + startTimeReal: now.Round(0), // .Round(0) removes the monotonic part from the time + startTimeMonotonic: now, + } +} + +func (c *clockHealthAdaptor) skew() time.Duration { + now := c.clock.Now() + realDuration := now.Sub(c.startTimeReal) + monotonicDuration := now.Sub(c.startTimeMonotonic) + + return (realDuration - monotonicDuration).Abs() +} + +// Name returns the name of the health check we are implementing. +func (l *clockHealthAdaptor) Name() string { + return "clockHealth" +} + +// Check is called by the healthz endpoint handler. +// It fails (returns an error) when the system clock is out of sync with the +// internal monotonic clock by more than the maxClockSkew. +func (l *clockHealthAdaptor) Check(req *http.Request) error { + if skew := l.skew(); skew > maxClockSkew { + return fmt.Errorf("the system clock is out of sync with the internal monotonic clock by %v, which is more than the allowed %v", skew, maxClockSkew) + } + return nil +} diff --git a/pkg/healthz/healthz.go b/pkg/healthz/healthz.go index 8e223c3e8..6f720ad75 100644 --- a/pkg/healthz/healthz.go +++ b/pkg/healthz/healthz.go @@ -26,6 +26,7 @@ import ( "golang.org/x/sync/errgroup" "k8s.io/apiserver/pkg/server/healthz" "k8s.io/client-go/tools/leaderelection" + "k8s.io/utils/clock" ) const ( @@ -51,8 +52,9 @@ type Server struct { // leader lease time, the leader election will be considered to have failed. func NewServer(leaderElectionHealthzAdaptorTimeout time.Duration) *Server { leaderHealthzAdaptor := leaderelection.NewLeaderHealthzAdaptor(leaderElectionHealthzAdaptorTimeout) + clockHealthAdaptor := NewClockHealthAdaptor(clock.RealClock{}) mux := http.NewServeMux() - healthz.InstallLivezHandler(mux, leaderHealthzAdaptor) + healthz.InstallLivezHandler(mux, leaderHealthzAdaptor, clockHealthAdaptor) return &Server{ server: &http.Server{ ReadTimeout: healthzServerReadTimeout,