Merge pull request #1811 from cheukwing/issue-1679

Add Prometheus metrics for Certificate ready status
This commit is contained in:
jetstack-bot 2019-07-15 17:52:04 +01:00 committed by GitHub
commit 65138f5e8c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 242 additions and 61 deletions

View File

@ -585,6 +585,8 @@ func generateLocallySignedTemporaryCertificate(crt *v1alpha1.Certificate, pk []b
}
func (c *controller) updateCertificateStatus(ctx context.Context, old, new *v1alpha1.Certificate) (*v1alpha1.Certificate, error) {
defer c.metrics.UpdateCertificateStatus(new)
log := logf.FromContext(ctx, "updateStatus")
oldBytes, _ := json.Marshal(old.Status)
newBytes, _ := json.Marshal(new.Status)

View File

@ -17,6 +17,7 @@ limitations under the License.
// Package metrics contains global structures related to metrics collection.
// cert-manager exposes the following metrics:
// certmanager_certificate_expiration_timestamp_seconds{name, namespace}
// certmanager_certificate_ready_status{name, namespace, condition}
package metrics
import (
@ -52,6 +53,8 @@ const (
prometheusMetricsServerMaxHeaderBytes = 1 << 20 // 1 MiB
)
// readyConditionStatuses holds the condition status values (True, False and
// Unknown) as strings. Each entry is used as a value of the "condition" label
// of the certificate ready status metric, both when the metric is set and
// when it is cleaned up.
var readyConditionStatuses = [...]string{string(v1alpha1.ConditionTrue), string(v1alpha1.ConditionFalse), string(v1alpha1.ConditionUnknown)}
// Default set of metrics
var Default = New(logf.NewContext(context.Background(), logf.Log.WithName("metrics")))
@ -64,6 +67,15 @@ var CertificateExpiryTimeSeconds = prometheus.NewGaugeVec(
[]string{"name", "namespace"},
)
// CertificateReadyStatus is a Prometheus gauge vector describing the ready
// status of each certificate. It is labelled by the certificate's name and
// namespace plus a "condition" label; for a given certificate the series
// whose condition matches the current Ready status is set to 1 and the
// others to 0.
var CertificateReadyStatus = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "certificate_ready_status",
Help: "The ready status of the certificate.",
},
[]string{"name", "namespace", "condition"},
)
// ACMEClientRequestCount is a Prometheus counter to collect the number of
// requests made to each endpoint with the ACME client.
var ACMEClientRequestCount = prometheus.NewCounterVec(
@ -109,6 +121,17 @@ var registeredCertificates = &struct {
var activeCertificates cmlisters.CertificateLister
// cleanUpFunctions are the functions called to clean up metrics which refer
// to deleted certificates. Each function takes the name and then the
// namespace of the certificate, in that order. Every per-certificate metric
// needs an entry here, otherwise its series are left behind when the
// certificate is cleaned up.
var cleanUpFunctions = []func(string, string){
metricCleanUpCertificate(CertificateExpiryTimeSeconds),
metricCleanUpCertificateWith(CertificateReadyStatus, readyConditionStatuses[:]),
}
// cleanableMetric is the minimal interface a metric must satisfy so that the
// clean-up helpers can remove its series: deletion by an ordered list of
// label values.
// NOTE(review): in the Prometheus client the bool result reports whether a
// series was actually deleted; the clean-up helpers in this file ignore it.
type cleanableMetric interface {
DeleteLabelValues(...string) bool
}
type Metrics struct {
ctx context.Context
http.Server
@ -117,6 +140,7 @@ type Metrics struct {
// TODO (@dippynark): switch this to use an interface to make it testable
registry *prometheus.Registry
CertificateExpiryTimeSeconds *prometheus.GaugeVec
CertificateReadyStatus *prometheus.GaugeVec
ACMEClientRequestDurationSeconds *prometheus.SummaryVec
ACMEClientRequestCount *prometheus.CounterVec
ControllerSyncCallCount *prometheus.CounterVec
@ -138,6 +162,7 @@ func New(ctx context.Context) *Metrics {
activeCertificates: nil,
registry: prometheus.NewRegistry(),
CertificateExpiryTimeSeconds: CertificateExpiryTimeSeconds,
CertificateReadyStatus: CertificateReadyStatus,
ACMEClientRequestDurationSeconds: ACMEClientRequestDurationSeconds,
ACMEClientRequestCount: ACMEClientRequestCount,
ControllerSyncCallCount: ControllerSyncCallCount,
@ -168,6 +193,7 @@ func (m *Metrics) Start(stopCh <-chan struct{}) {
log := logf.FromContext(m.ctx)
m.registry.MustRegister(m.CertificateExpiryTimeSeconds)
m.registry.MustRegister(m.CertificateReadyStatus)
m.registry.MustRegister(m.ACMEClientRequestDurationSeconds)
m.registry.MustRegister(m.ACMEClientRequestCount)
m.registry.MustRegister(m.ControllerSyncCallCount)
@ -184,6 +210,7 @@ func (m *Metrics) Start(stopCh <-chan struct{}) {
}()
// clean up metrics referring to deleted resources every minute
go wait.Until(func() { m.cleanUp() }, time.Minute, stopCh)
m.waitShutdown(stopCh)
@ -221,6 +248,50 @@ func updateX509Expiry(crt *v1alpha1.Certificate, cert *x509.Certificate) {
CertificateExpiryTimeSeconds.With(prometheus.Labels{
"name": crt.Name,
"namespace": crt.Namespace}).Set(float64(expiryTime.Unix()))
registerCertificateKey(key)
}
// UpdateCertificateStatus publishes the Ready condition of crt to the
// certificate ready status metric. Every condition of type Ready found on the
// certificate is forwarded to updateCertificateReadyStatus; a certificate
// without a Ready condition leaves the metric untouched.
func (m *Metrics) UpdateCertificateStatus(crt *v1alpha1.Certificate) {
	log := logf.WithResource(logf.FromContext(m.ctx), crt)
	log.V(logf.DebugLevel).Info("attempting to retrieve ready status for certificate")

	for _, cond := range crt.Status.Conditions {
		// Only the Ready condition is exported as a metric.
		if cond.Type != v1alpha1.CertificateConditionReady {
			continue
		}
		updateCertificateReadyStatus(crt, cond.Status)
	}
}
// updateCertificateReadyStatus records the given Ready status of crt in the
// CertificateReadyStatus gauge. One series is written per entry of
// readyConditionStatuses: the series whose condition equals current is set to
// 1 and every other series is set to 0. The certificate's key is then
// registered so that its metrics can be cleaned up later.
//
// If no key can be derived for crt the function returns without touching any
// metric.
func updateCertificateReadyStatus(crt *v1alpha1.Certificate, current v1alpha1.ConditionStatus) {
	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(crt)
	if err != nil {
		return
	}

	// registerCertificateKey requires the mutex to be held; keep it for the
	// whole update so the gauge writes and the registration happen together.
	registeredCertificates.mtx.Lock()
	defer registeredCertificates.mtx.Unlock()

	active := string(current)
	for _, status := range readyConditionStatuses {
		var value float64
		if status == active {
			value = 1
		}
		labels := prometheus.Labels{
			"name":      crt.Name,
			"namespace": crt.Namespace,
			"condition": status,
		}
		CertificateReadyStatus.With(labels).Set(value)
	}

	registerCertificateKey(key)
}
// registerCertificateKey adds an entry to registeredCertificates to track
// which certificates have metrics stored in Prometheus, allowing for easier
// clean-up.
//
// The caller MUST hold registeredCertificates.mtx before calling this
// function; this function does not lock it itself. Holding the lock ensures
// no other function is cleaning up while a certificate is being registered.
func registerCertificateKey(key string) {
registeredCertificates.certificates[key] = struct{}{}
}
@ -228,6 +299,7 @@ func (m *Metrics) SetActiveCertificates(cl cmlisters.CertificateLister) {
m.activeCertificates = cl
}
// cleanUp removes any metrics which reference resources which no longer exist
func (m *Metrics) cleanUp() {
log := logf.FromContext(m.ctx)
log.V(logf.DebugLevel).Info("attempting to clean up metrics for recently deleted certificates")
@ -246,6 +318,7 @@ func (m *Metrics) cleanUp() {
cleanUpCertificates(activeCrts)
}
// cleanUpCertificates removes metrics for recently deleted certificates
func cleanUpCertificates(activeCrts []*v1alpha1.Certificate) {
activeMap := make(map[string]struct{}, len(activeCrts))
for _, crt := range activeCrts {
@ -267,19 +340,47 @@ func cleanUpCertificates(activeCrts []*v1alpha1.Certificate) {
}
for _, key := range toCleanUp {
namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
continue
}
CertificateExpiryTimeSeconds.Delete(prometheus.Labels{
"name": name,
"namespace": namespace,
})
delete(registeredCertificates.certificates, key)
cleanUpCertificateByKey(key)
}
}
// metricCleanUpCertificate builds a clean-up function for a metric that is
// keyed only on a certificate's name and namespace. The returned function
// takes the name and then the namespace and deletes the matching entry (if
// any) from c; the result of the deletion is ignored.
func metricCleanUpCertificate(c cleanableMetric) func(string, string) {
	cleanUp := func(crtName, crtNamespace string) {
		c.DeleteLabelValues(crtName, crtNamespace)
	}
	return cleanUp
}
// metricCleanUpCertificateWith builds a clean-up function for a metric that
// is keyed on more than a certificate's name and namespace. The returned
// function takes the name and then the namespace and, for each entry of
// additionalLabels in order, deletes the entry (if any) identified by
// (name, namespace, entry) from c. Each element of additionalLabels is
// therefore used as the value of the metric's third label.
func metricCleanUpCertificateWith(c cleanableMetric, additionalLabels []string) func(string, string) {
	cleanUp := func(crtName, crtNamespace string) {
		for i := 0; i < len(additionalLabels); i++ {
			c.DeleteLabelValues(crtName, crtNamespace, additionalLabels[i])
		}
	}
	return cleanUp
}
// cleanUpCertificateByKey removes every metric which refers to the
// certificate identified by key and then drops the key from
// registeredCertificates.
//
// The key is split into namespace and name; if it cannot be split the
// function returns early, leaving both the metrics and the registration
// entry in place.
func cleanUpCertificateByKey(key string) {
	namespace, name, splitErr := cache.SplitMetaNamespaceKey(key)
	if splitErr != nil {
		return
	}

	// Run each registered clean-up function; they take name first, then
	// namespace.
	for i := range cleanUpFunctions {
		cleanUpFunctions[i](name, namespace)
	}

	delete(registeredCertificates.certificates, key)
}
func (m *Metrics) IncrementSyncCallCount(controllerName string) {
log := logf.FromContext(m.ctx)
log.V(logf.DebugLevel).Info("incrementing controller sync call count", "controllerName", controllerName)

View File

@ -28,6 +28,33 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// getReadyConditionStatus returns the status of the first Ready condition
// found on crt, or ConditionUnknown when the certificate carries no Ready
// condition.
func getReadyConditionStatus(crt *v1alpha1.Certificate) v1alpha1.ConditionStatus {
	for _, cond := range crt.Status.Conditions {
		if cond.Type == v1alpha1.CertificateConditionReady {
			return cond.Status
		}
	}
	return v1alpha1.ConditionUnknown
}
// buildCertificate returns a test Certificate with the given name and
// namespace whose status holds exactly one condition: a Ready condition with
// the supplied status.
func buildCertificate(name, namespace string, condition v1alpha1.ConditionStatus) *v1alpha1.Certificate {
	ready := v1alpha1.CertificateCondition{
		Type:   v1alpha1.CertificateConditionReady,
		Status: condition,
	}

	crt := &v1alpha1.Certificate{}
	crt.ObjectMeta = metav1.ObjectMeta{Name: name, Namespace: namespace}
	crt.Status = v1alpha1.CertificateStatus{
		Conditions: []v1alpha1.CertificateCondition{ready},
	}
	return crt
}
func TestUpdateCertificateExpiry(t *testing.T) {
const metadata = `
# HELP certmanager_certificate_expiration_timestamp_seconds The date after which the certificate expires. Expressed as a Unix Epoch Time.
@ -72,94 +99,135 @@ func TestUpdateCertificateExpiry(t *testing.T) {
}
}
func TestCleanUp(t *testing.T) {
func TestUpdateCertificateReadyStatus(t *testing.T) {
const metadata = `
# HELP certmanager_certificate_expiration_timestamp_seconds The date after which the certificate expires. Expressed as a Unix Epoch Time.
# TYPE certmanager_certificate_expiration_timestamp_seconds gauge
# HELP certmanager_certificate_ready_status The ready status of the certificate.
# TYPE certmanager_certificate_ready_status gauge
`
type testT struct {
active map[*v1alpha1.Certificate]*x509.Certificate
inactive map[*v1alpha1.Certificate]*x509.Certificate
crt *v1alpha1.Certificate
expected string
}
tests := map[string]testT{
"active and inactive": {
"ready status true is updated correctly": {
crt: buildCertificate("something", "default", v1alpha1.ConditionTrue),
expected: `
certmanager_certificate_ready_status{condition="False",name="something",namespace="default"} 0
certmanager_certificate_ready_status{condition="True",name="something",namespace="default"} 1
certmanager_certificate_ready_status{condition="Unknown",name="something",namespace="default"} 0
`,
},
"ready status false is updated correctly": {
crt: buildCertificate("something", "default", v1alpha1.ConditionFalse),
expected: `
certmanager_certificate_ready_status{condition="False",name="something",namespace="default"} 1
certmanager_certificate_ready_status{condition="True",name="something",namespace="default"} 0
certmanager_certificate_ready_status{condition="Unknown",name="something",namespace="default"} 0
`,
},
"ready status unknown is updated correctly": {
crt: buildCertificate("something", "default", v1alpha1.ConditionUnknown),
expected: `
certmanager_certificate_ready_status{condition="False",name="something",namespace="default"} 0
certmanager_certificate_ready_status{condition="True",name="something",namespace="default"} 0
certmanager_certificate_ready_status{condition="Unknown",name="something",namespace="default"} 1
`,
},
}
for n, test := range tests {
t.Run(n, func(t *testing.T) {
updateCertificateReadyStatus(test.crt, getReadyConditionStatus(test.crt))
if err := testutil.CollectAndCompare(
CertificateReadyStatus,
strings.NewReader(metadata+test.expected),
"certmanager_certificate_ready_status",
); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
})
}
}
func TestCleanUp(t *testing.T) {
const metadataExpiry = `
# HELP certmanager_certificate_expiration_timestamp_seconds The date after which the certificate expires. Expressed as a Unix Epoch Time.
# TYPE certmanager_certificate_expiration_timestamp_seconds gauge
`
const metadataReady = `
# HELP certmanager_certificate_ready_status The ready status of the certificate.
# TYPE certmanager_certificate_ready_status gauge
`
type testT struct {
active map[*v1alpha1.Certificate]*x509.Certificate
inactive map[*v1alpha1.Certificate]*x509.Certificate
expectedExpiry string
expectedReady string
}
tests := map[string]testT{
"inactive certificate metrics cleaned up while active certificate metrics kept": {
active: map[*v1alpha1.Certificate]*x509.Certificate{
{
ObjectMeta: metav1.ObjectMeta{
Name: "something",
Namespace: "default",
},
}: {
buildCertificate("active", "default", v1alpha1.ConditionTrue): {
// fixed expiry time for testing
NotAfter: time.Unix(2208988804, 0),
},
},
inactive: map[*v1alpha1.Certificate]*x509.Certificate{
{
ObjectMeta: metav1.ObjectMeta{
Name: "something-else",
Namespace: "default",
},
}: {
buildCertificate("inactive", "default", v1alpha1.ConditionTrue): {
// fixed expiry time for testing
NotAfter: time.Unix(2208988804, 0),
},
},
expected: `
certmanager_certificate_expiration_timestamp_seconds{name="something",namespace="default"} 2.208988804e+09
expectedExpiry: `
certmanager_certificate_expiration_timestamp_seconds{name="active",namespace="default"} 2.208988804e+09
`,
expectedReady: `
certmanager_certificate_ready_status{condition="False",name="active",namespace="default"} 0
certmanager_certificate_ready_status{condition="True",name="active",namespace="default"} 1
certmanager_certificate_ready_status{condition="Unknown",name="active",namespace="default"} 0
`,
},
"only active": {
"no metrics cleaned up when only active certificate metrics": {
active: map[*v1alpha1.Certificate]*x509.Certificate{
{
ObjectMeta: metav1.ObjectMeta{
Name: "something",
Namespace: "default",
},
}: {
buildCertificate("active", "default", v1alpha1.ConditionTrue): {
// fixed expiry time for testing
NotAfter: time.Unix(2208988804, 0),
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "something-else",
Namespace: "default",
},
}: {
buildCertificate("also-active", "default", v1alpha1.ConditionTrue): {
// fixed expiry time for testing
NotAfter: time.Unix(2208988804, 0),
},
},
inactive: map[*v1alpha1.Certificate]*x509.Certificate{},
expected: `
certmanager_certificate_expiration_timestamp_seconds{name="something",namespace="default"} 2.208988804e+09
certmanager_certificate_expiration_timestamp_seconds{name="something-else",namespace="default"} 2.208988804e+09
expectedExpiry: `
certmanager_certificate_expiration_timestamp_seconds{name="active",namespace="default"} 2.208988804e+09
certmanager_certificate_expiration_timestamp_seconds{name="also-active",namespace="default"} 2.208988804e+09
`,
expectedReady: `
certmanager_certificate_ready_status{condition="False",name="active",namespace="default"} 0
certmanager_certificate_ready_status{condition="False",name="also-active",namespace="default"} 0
certmanager_certificate_ready_status{condition="True",name="active",namespace="default"} 1
certmanager_certificate_ready_status{condition="True",name="also-active",namespace="default"} 1
certmanager_certificate_ready_status{condition="Unknown",name="active",namespace="default"} 0
certmanager_certificate_ready_status{condition="Unknown",name="also-active",namespace="default"} 0
`,
},
"only inactive": {
"all metrics cleaned up when only inactive certificate metrics": {
active: map[*v1alpha1.Certificate]*x509.Certificate{},
inactive: map[*v1alpha1.Certificate]*x509.Certificate{
{
ObjectMeta: metav1.ObjectMeta{
Name: "something",
Namespace: "default",
},
}: {
buildCertificate("inactive", "default", v1alpha1.ConditionTrue): {
// fixed expiry time for testing
NotAfter: time.Unix(2208988804, 0),
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "something-else",
Namespace: "default",
},
}: {
buildCertificate("also-inactive", "default", v1alpha1.ConditionTrue): {
// fixed expiry time for testing
NotAfter: time.Unix(2208988804, 0),
},
},
expected: "",
expectedExpiry: "",
expectedReady: "",
},
}
for n, test := range tests {
@ -169,9 +237,11 @@ func TestCleanUp(t *testing.T) {
var activeCrts []*v1alpha1.Certificate
for crt, cert := range test.active {
updateX509Expiry(crt, cert)
updateCertificateReadyStatus(crt, getReadyConditionStatus(crt))
activeCrts = append(activeCrts, crt)
}
for crt, cert := range test.inactive {
updateCertificateReadyStatus(crt, getReadyConditionStatus(crt))
updateX509Expiry(crt, cert)
}
@ -179,11 +249,19 @@ func TestCleanUp(t *testing.T) {
if err := testutil.CollectAndCompare(
CertificateExpiryTimeSeconds,
strings.NewReader(metadata+test.expected),
strings.NewReader(metadataExpiry+test.expectedExpiry),
"certmanager_certificate_expiration_timestamp_seconds",
); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
if err := testutil.CollectAndCompare(
CertificateReadyStatus,
strings.NewReader(metadataReady+test.expectedReady),
"certmanager_certificate_ready_status",
); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
})
}
}