diff --git a/assets/grafana/celeborn-dashboard.json b/assets/grafana/celeborn-dashboard.json index f5603e9a2..3ad72ada9 100644 --- a/assets/grafana/celeborn-dashboard.json +++ b/assets/grafana/celeborn-dashboard.json @@ -525,6 +525,94 @@ ], "title": "metrics_RunningApplicationCount_Value", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Celeborn worker high workload status.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 148 + }, + "id": 97, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "metrics_IsHighWorkload_Value{instance=~\"${instance}\"}", + "legendFormat": "${baseLegend}", + "refId": "A" + } + ], + "title": "metrics_IsHighWorkload_Value", + "type": "timeseries" } ], "title": "Overall", diff --git a/docs/monitoring.md b/docs/monitoring.md index a455e3dcc..44a52e5b8 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -254,6 +254,7 @@ These metrics 
are exposed by Celeborn worker. | UserProduceSpeed | The speed of user production for congestion control. | | WorkerConsumeSpeed | The speed of worker consumption for congestion control. | | IsDecommissioningWorker | 1 means worker decommissioning, 0 means not decommissioning. | + | IsHighWorkload | 1 means the worker is under high workload, 0 means it is not. | | UnreleasedShuffleCount | Unreleased shuffle count when worker is decommissioning. | | UnreleasedPartitionLocationCount | Unreleased partition location count when worker is shutting down. | | MemoryStorageFileCount | The count of files in Memory Storage of a worker. | diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala index 93208751d..5107dfd5f 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala @@ -440,6 +440,13 @@ private[celeborn] class Worker( workerSource.addGauge(WorkerSource.PAUSE_PUSH_DATA_AND_REPLICATE_COUNT) { () => memoryManager.getPausePushDataAndReplicateCounter } + workerSource.addGauge(WorkerSource.IS_HIGH_WORKLOAD) { () => + if (highWorkload) { + 1 + } else { + 0 + } + } workerSource.addGauge(WorkerSource.ACTIVE_SLOTS_COUNT) { () => workerInfo.usedSlots() } diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala index 1b94406c8..0d6328340 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala @@ -242,6 +242,8 @@ object WorkerSource { val MEMORY_STORAGE_FILE_COUNT = "MemoryStorageFileCount" + val IS_HIGH_WORKLOAD = "IsHighWorkload" + // credit val ACTIVE_CREDIT_STREAM_COUNT = "ActiveCreditStreamCount" val 
ACTIVE_MAP_PARTITION_COUNT = "ActiveMapPartitionCount"