[CELEBORN-2118] Introduce IsHighWorkload metric to monitor worker overload status
### What changes were proposed in this pull request? Introduce `IsHighWorkload` metric to monitor worker overload status. ### Why are the changes needed? There is currently no metric to monitor worker overload status. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? [Grafana test](https://xy2953396112.grafana.net/public-dashboards/22ab1750ef874a1bb39b5879b81a24cf). Closes #3435 from xy2953396112/CELEBORN-2118. Authored-by: xxx <953396112@qq.com> Signed-off-by: SteNicholas <programgeek@163.com>
This commit is contained in:
parent
1d6299717f
commit
a9490d6e24
@ -525,6 +525,94 @@
|
||||
],
|
||||
"title": "metrics_RunningApplicationCount_Value",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Celeborn worker high workload status.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 148
|
||||
},
|
||||
"id": 97,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"maxHeight": 600,
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"expr": "metrics_IsHighWorkload_Value{instance=~\"${instance}\"}",
|
||||
"legendFormat": "${baseLegend}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "metrics_IsHighWorkload_Value",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"title": "Overall",
|
||||
|
||||
@ -254,6 +254,7 @@ These metrics are exposed by Celeborn worker.
|
||||
| UserProduceSpeed | The speed of user production for congestion control. |
|
||||
| WorkerConsumeSpeed | The speed of worker consumption for congestion control. |
|
||||
| IsDecommissioningWorker | 1 means worker decommissioning, 0 means not decommissioning. |
|
||||
| IsHighWorkload | 1 means the worker is under high workload, 0 means it is not. |
|
||||
| UnreleasedShuffleCount | Unreleased shuffle count when worker is decommissioning. |
|
||||
| UnreleasedPartitionLocationCount | Unreleased partition location count when worker is shutting down. |
|
||||
| MemoryStorageFileCount | The count of files in Memory Storage of a worker. |
|
||||
|
||||
@ -440,6 +440,13 @@ private[celeborn] class Worker(
|
||||
workerSource.addGauge(WorkerSource.PAUSE_PUSH_DATA_AND_REPLICATE_COUNT) { () =>
|
||||
memoryManager.getPausePushDataAndReplicateCounter
|
||||
}
|
||||
workerSource.addGauge(WorkerSource.IS_HIGH_WORKLOAD) { () =>
|
||||
if (highWorkload) {
|
||||
1
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
workerSource.addGauge(WorkerSource.ACTIVE_SLOTS_COUNT) { () =>
|
||||
workerInfo.usedSlots()
|
||||
}
|
||||
|
||||
@ -242,6 +242,8 @@ object WorkerSource {
|
||||
|
||||
val MEMORY_STORAGE_FILE_COUNT = "MemoryStorageFileCount"
|
||||
|
||||
val IS_HIGH_WORKLOAD = "IsHighWorkload"
|
||||
|
||||
// credit
|
||||
val ACTIVE_CREDIT_STREAM_COUNT = "ActiveCreditStreamCount"
|
||||
val ACTIVE_MAP_PARTITION_COUNT = "ActiveMapPartitionCount"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user