Merge pull request #6328 from inteon/add_clock_health

Add health probe that detects skew between system clock and monotonic go process clock
This commit is contained in:
jetstack-bot 2023-09-27 11:37:11 +02:00 committed by GitHub
commit 8aafddb974
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 89 additions and 5 deletions

View File

@ -277,14 +277,13 @@ topologySpreadConstraints: []
# LivenessProbe settings for the controller container of the controller Pod.
#
# Disabled by default, because the controller has a leader election mechanism
# which should cause it to exit if it is unable to renew its leader election
# record.
# Enabled by default so that the clock-skew liveness probe can restart the
# controller if the system clock and the monotonic clock drift apart.
# LivenessProbe durations and thresholds are based on those used for the Kubernetes
# controller-manager. See:
# https://github.com/kubernetes/kubernetes/blob/806b30170c61a38fedd54cc9ede4cd6275a1ad3b/cmd/kubeadm/app/util/staticpod/utils.go#L241-L245
livenessProbe:
enabled: false
enabled: true
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 15

View File

@ -0,0 +1,83 @@
/*
Copyright 2020 The cert-manager Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package healthz
import (
"fmt"
"net/http"
"time"
"k8s.io/utils/clock"
)
// maxClockSkew is the largest divergence between the system clock and the
// monotonic clock that the clock health check still reports as healthy.
const maxClockSkew time.Duration = 5 * time.Minute
// clockHealthAdaptor implements the HealthChecker interface by comparing the
// system (wall) clock against the internal monotonic clock of the process.
//
// The comparison matters because the internal monotonic clock is what
// triggers certificate reconciles for renewals: if it is out of sync with the
// system clock, renewals might not be triggered in time. Ideally renewals
// would be triggered based on the system clock, but this is not (yet)
// possible in Go. See https://github.com/golang/go/issues/35012
//
// A clock skew can be caused by:
//  1. The system clock being adjusted, e.g. when NTP corrects it.
//  2. The process being paused (e.g. with SIGSTOP): the monotonic clock
//     stops while the system clock continues. This happens, for example,
//     when a VM is paused or a laptop hibernates.
//
// Skews smaller than maxClockSkew are tolerated, because they can happen when
// the system clock is adjusted. The skew is always measured against the
// start times captured at construction, so it compounds over time: a small
// but constant skew will eventually fail the health check.
type clockHealthAdaptor struct {
	// clock is the time source that is read each time the skew is computed.
	clock clock.Clock

	// startTimeReal is the construction time with the monotonic reading
	// stripped, so durations measured from it follow the wall clock.
	startTimeReal time.Time

	// startTimeMonotonic is the construction time exactly as the clock
	// returned it, i.e. still carrying any monotonic reading it had.
	startTimeMonotonic time.Time
}
// NewClockHealthAdaptor builds a clock health check that uses c as its time
// source. The reading of c taken here becomes the reference point against
// which every later skew computation is made.
func NewClockHealthAdaptor(c clock.Clock) *clockHealthAdaptor {
	started := c.Now()

	adaptor := new(clockHealthAdaptor)
	adaptor.clock = c
	// Keep the reading untouched so it retains its monotonic part.
	adaptor.startTimeMonotonic = started
	// Round(0) removes the monotonic part, leaving a wall-clock-only copy
	// of the very same instant.
	adaptor.startTimeReal = started.Round(0)
	return adaptor
}
// skew returns the absolute difference between the time elapsed since
// construction as measured from startTimeReal and as measured from
// startTimeMonotonic, using a single reading of the clock for both.
func (c *clockHealthAdaptor) skew() time.Duration {
	current := c.clock.Now()
	// startTimeReal carries no monotonic reading, so the first Sub is
	// computed from wall-clock values; the second Sub can use the monotonic
	// readings when both operands have one.
	drift := current.Sub(c.startTimeReal) - current.Sub(c.startTimeMonotonic)
	return drift.Abs()
}
// Name reports the identifier under which this health check is registered.
func (c *clockHealthAdaptor) Name() string {
	const checkName = "clockHealth"
	return checkName
}
// Check is the hook invoked by the healthz endpoint handler. It returns nil
// while the skew between the system clock and the internal monotonic clock
// is at most maxClockSkew, and an error describing the skew once it exceeds
// that limit. The request itself is not inspected.
func (c *clockHealthAdaptor) Check(req *http.Request) error {
	drift := c.skew()
	if drift <= maxClockSkew {
		return nil
	}
	return fmt.Errorf("the system clock is out of sync with the internal monotonic clock by %v, which is more than the allowed %v", drift, maxClockSkew)
}

View File

@ -26,6 +26,7 @@ import (
"golang.org/x/sync/errgroup"
"k8s.io/apiserver/pkg/server/healthz"
"k8s.io/client-go/tools/leaderelection"
"k8s.io/utils/clock"
)
const (
@ -51,8 +52,9 @@ type Server struct {
// leader lease time, the leader election will be considered to have failed.
func NewServer(leaderElectionHealthzAdaptorTimeout time.Duration) *Server {
leaderHealthzAdaptor := leaderelection.NewLeaderHealthzAdaptor(leaderElectionHealthzAdaptorTimeout)
clockHealthAdaptor := NewClockHealthAdaptor(clock.RealClock{})
mux := http.NewServeMux()
healthz.InstallLivezHandler(mux, leaderHealthzAdaptor)
healthz.InstallLivezHandler(mux, leaderHealthzAdaptor, clockHealthAdaptor)
return &Server{
server: &http.Server{
ReadTimeout: healthzServerReadTimeout,