diff --git a/assets/grafana/celeborn-dashboard.json b/assets/grafana/celeborn-dashboard.json index 2962deadf..8e98e6402 100644 --- a/assets/grafana/celeborn-dashboard.json +++ b/assets/grafana/celeborn-dashboard.json @@ -3236,6 +3236,102 @@ ], "title": "metrics_ PartitionFileSizeBytes_Mean", "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 70 + }, + "id": 238, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "metrics_UnreleasedPartitionLocationCount_Value{role=\"Worker\", instance=~\"${instance}\"}", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "metrics_UnreleasedPartitionLocationCount_Value", + "type": "timeseries" } ], "title": "Worker", diff --git a/docs/monitoring.md b/docs/monitoring.md index 377d00aa1..95cefcf92 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -240,6 +240,7 @@ These metrics are exposed by Celeborn worker. | WorkerConsumeSpeed | The speed of worker consumption for congestion control. | | IsDecommissioningWorker | 1 means worker decommissioning, 0 means not decommissioning. | | UnreleasedShuffleCount | Unreleased shuffle count when worker is decommissioning. | + | UnreleasedPartitionLocationCount | Unreleased partition location counit when worker is shutting down. | | MemoryStorageFileCount | The count of files in Memory Storage of a worker. | | MemoryFileStorageSize | The total amount of memory used by Memory Storage. | | EvictedFileCount | The count of files evicted from Memory Storage to Disk | diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala index edfb21d21..502740cd7 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala @@ -455,6 +455,15 @@ private[celeborn] class Worker( 0 } } + // Unreleased partition location count when worker is restarting + workerSource.addGauge(WorkerSource.UNRELEASED_PARTITION_LOCATION_COUNT) { () => + if (shutdown.get()) { + partitionLocationInfo.primaryPartitionLocations.size() + + partitionLocationInfo.replicaPartitionLocations.size() + } else { + 0 + } + } workerSource.addGauge(WorkerSource.CLEAN_TASK_QUEUE_SIZE) { () => cleanTaskQueue.size() } diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala index b8e72dd80..c4a82225c 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala @@ -231,6 +231,9 @@ object WorkerSource { val IS_DECOMMISSIONING_WORKER = "IsDecommissioningWorker" val UNRELEASED_SHUFFLE_COUNT = "UnreleasedShuffleCount" + // graceful + val UNRELEASED_PARTITION_LOCATION_COUNT = "UnreleasedPartitionLocationCount" + // clean val CLEAN_TASK_QUEUE_SIZE = "CleanTaskQueueSize" val CLEAN_EXPIRED_SHUFFLE_KEYS_TIME = "CleanExpiredShuffleKeysTime"