[CELEBORN-2118] Introduce IsHighWorkload metric to monitor worker overload status

### What changes were proposed in this pull request?

Introduce `IsHighWorkload` metric to monitor worker overload status.

### Why are the changes needed?

There is currently no metric to monitor worker overload status.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

[Grafana test](https://xy2953396112.grafana.net/public-dashboards/22ab1750ef874a1bb39b5879b81a24cf).

Closes #3435 from xy2953396112/CELEBORN-2118.

Authored-by: xxx <953396112@qq.com>
Signed-off-by: SteNicholas <programgeek@163.com>
This commit is contained in:
xxx 2025-08-25 20:46:17 +08:00 committed by SteNicholas
parent 1d6299717f
commit a9490d6e24
4 changed files with 98 additions and 0 deletions

View File

@ -525,6 +525,94 @@
],
"title": "metrics_RunningApplicationCount_Value",
"type": "timeseries"
},
{
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "description": "Celeborn worker high workload status: 1 means the worker is under high workload, 0 means it is not.",
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "palette-classic"
      },
      "custom": {
        "axisCenteredZero": false,
        "axisColorMode": "text",
        "axisLabel": "",
        "axisPlacement": "auto",
        "barAlignment": 0,
        "drawStyle": "line",
        "fillOpacity": 0,
        "gradientMode": "none",
        "hideFrom": {
          "legend": false,
          "tooltip": false,
          "viz": false
        },
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "pointSize": 5,
        "scaleDistribution": {
          "type": "linear"
        },
        "showPoints": "auto",
        "spanNulls": false,
        "stacking": {
          "group": "A",
          "mode": "none"
        },
        "thresholdsStyle": {
          "mode": "off"
        }
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {
            "color": "green"
          }
        ]
      },
      "unit": "none"
    },
    "overrides": []
  },
  "gridPos": {
    "h": 9,
    "w": 12,
    "x": 12,
    "y": 148
  },
  "id": 97,
  "options": {
    "legend": {
      "calcs": [],
      "displayMode": "list",
      "placement": "bottom",
      "showLegend": true
    },
    "tooltip": {
      "maxHeight": 600,
      "mode": "single",
      "sort": "none"
    }
  },
  "targets": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "expr": "metrics_IsHighWorkload_Value{instance=~\"${instance}\"}",
      "legendFormat": "${baseLegend}",
      "refId": "A"
    }
  ],
  "title": "metrics_IsHighWorkload_Value",
  "type": "timeseries"
}
],
"title": "Overall",

View File

@ -254,6 +254,7 @@ These metrics are exposed by Celeborn worker.
| UserProduceSpeed | The speed of user production for congestion control. |
| WorkerConsumeSpeed | The speed of worker consumption for congestion control. |
| IsDecommissioningWorker | 1 means worker decommissioning, 0 means not decommissioning. |
| IsHighWorkload | 1 means worker high workload, 0 means not high workload. |
| UnreleasedShuffleCount | Unreleased shuffle count when worker is decommissioning. |
| UnreleasedPartitionLocationCount | Unreleased partition location count when worker is shutting down. |
| MemoryStorageFileCount | The count of files in Memory Storage of a worker. |

View File

@ -440,6 +440,13 @@ private[celeborn] class Worker(
workerSource.addGauge(WorkerSource.PAUSE_PUSH_DATA_AND_REPLICATE_COUNT) { () =>
memoryManager.getPausePushDataAndReplicateCounter
}
// Report the high-workload flag as a numeric gauge: 1 when set, 0 otherwise.
workerSource.addGauge(WorkerSource.IS_HIGH_WORKLOAD) { () =>
  if (highWorkload) 1 else 0
}
workerSource.addGauge(WorkerSource.ACTIVE_SLOTS_COUNT) { () =>
workerInfo.usedSlots()
}

View File

@ -242,6 +242,8 @@ object WorkerSource {
val MEMORY_STORAGE_FILE_COUNT = "MemoryStorageFileCount"
// Gauge name for the worker high-workload flag (1 = high workload, 0 = not).
val IS_HIGH_WORKLOAD = "IsHighWorkload"
// credit
val ACTIVE_CREDIT_STREAM_COUNT = "ActiveCreditStreamCount"
val ACTIVE_MAP_PARTITION_COUNT = "ActiveMapPartitionCount"