[KYUUBI #7072] Expose metrics of engine startup permit state

### Why are the changes needed?

The metrics `kyuubi_operation_state_LaunchEngine_*` cannot reflect the state of the semaphore once the maximum engine startup limit is configured through `kyuubi.server.limit.engine.startup`. This change adds metrics that expose the relevant permit state.

### How was this patch tested?

### Was this patch authored or co-authored using generative AI tooling?

Closes #7072 from LennonChin/engine_startup_metrics.

Closes #7072

d6bf3696a [Lennon Chin] Expose metrics of engine startup permit status

Authored-by: Lennon Chin <i@coderap.com>
Signed-off-by: Cheng Pan <chengpan@apache.org>
This commit is contained in:
Lennon Chin 2025-05-29 13:27:42 +08:00 committed by Cheng Pan
parent bcaff5a3f1
commit cad5a392f3
No known key found for this signature in database
GPG Key ID: 8001952629BCC75D
4 changed files with 157 additions and 24 deletions

View File

@ -65,6 +65,9 @@ These metrics include:
| `kyuubi.engine.timeout` | | counter | 1.2.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> cumulative timeout engines</div> |
| `kyuubi.engine.failed` | `${user}` | counter | 1.2.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> cumulative explicitly failed engine count for a `${user}`</div> |
| `kyuubi.engine.failed` | `${errorType}` | counter | 1.2.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> cumulative explicitly failed engine count for a particular `${errorType}`, e.g. `ClassNotFoundException`</div> |
| `kyuubi.engine.startup.permit.limit.total` | | meter | 1.11.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> permit limit on the number of engines starting up concurrently </div> |
| `kyuubi.engine.startup.permit.available` | | gauge | 1.11.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> number of permits currently available for starting up engines concurrently </div> |
| `kyuubi.engine.startup.permit.waiting` | | gauge | 1.11.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> number of engine startups waiting to acquire a permit </div> |
| `kyuubi.backend_service.open_session` | | timer | 1.5.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> kyuubi backend service `openSession` method execution time and rate </div> |
| `kyuubi.backend_service.close_session` | | timer | 1.5.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> kyuubi backend service `closeSession` method execution time and rate </div> |
| `kyuubi.backend_service.get_info` | | timer | 1.5.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> kyuubi backend service `getInfo` method execution time and rate </div> |

View File

@ -823,6 +823,123 @@
"x": 16,
"y": 7
},
"id": 104,
"maxPerRow": 2,
"options": {
"alertThreshold": true,
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"repeatDirection": "h",
"targets": [
{
"datasource": "${DS_PROMETHEUS}",
"editorMode": "code",
"expr": " kyuubi_engine_startup_permit_limit_total{$baseFilter,instance=~\"$instance\"}",
"hide": false,
"legendFormat": "${baseLegend}-limit",
"range": true,
"refId": "A"
},
{
"datasource": "${DS_PROMETHEUS}",
"editorMode": "code",
"expr": " kyuubi_engine_startup_permit_waiting{$baseFilter,instance=~\"$instance\"}",
"hide": false,
"legendFormat": "${baseLegend}-waiting",
"range": true,
"refId": "B"
},
{
"datasource": "${DS_PROMETHEUS}",
"editorMode": "code",
"expr": " kyuubi_engine_startup_permit_available{$baseFilter,instance=~\"$instance\"}",
"hide": false,
"legendFormat": "${baseLegend}-available",
"range": true,
"refId": "C"
}
],
"title": "Engine startup permit",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"links": [],
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 13
},
"id": 75,
"maxPerRow": 2,
"options": {
@ -918,7 +1035,7 @@
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"x": 8,
"y": 13
},
"id": 77,
@ -1025,7 +1142,7 @@
"gridPos": {
"h": 7,
"w": 8,
"x": 8,
"x": 16,
"y": 13
},
"id": 79,
@ -1130,10 +1247,10 @@
"overrides": []
},
"gridPos": {
"h": 7,
"h": 6,
"w": 8,
"x": 16,
"y": 13
"x": 0,
"y": 20
},
"id": 80,
"maxPerRow": 2,
@ -1236,7 +1353,7 @@
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"x": 8,
"y": 20
},
"id": 34,
@ -1335,7 +1452,7 @@
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"x": 16,
"y": 20
},
"id": 71,
@ -1430,8 +1547,8 @@
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 20
"x": 0,
"y": 26
},
"id": 76,
"maxPerRow": 2,
@ -1478,7 +1595,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 26
"y": 32
},
"id": 88,
"panels": [],
@ -1549,7 +1666,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 27
"y": 33
},
"id": 89,
"maxPerRow": 2,
@ -1646,7 +1763,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 27
"y": 33
},
"id": 92,
"maxPerRow": 2,
@ -1743,7 +1860,7 @@
"h": 7,
"w": 12,
"x": 0,
"y": 35
"y": 41
},
"id": 90,
"maxPerRow": 2,
@ -1849,7 +1966,7 @@
"h": 7,
"w": 12,
"x": 12,
"y": 35
"y": 41
},
"id": 91,
"maxPerRow": 2,
@ -1899,7 +2016,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 42
"y": 48
},
"id": 93,
"panels": [],
@ -1970,7 +2087,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 43
"y": 49
},
"id": 94,
"maxPerRow": 2,
@ -2076,7 +2193,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 43
"y": 49
},
"id": 99,
"maxPerRow": 2,
@ -2173,7 +2290,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 51
"y": 57
},
"id": 98,
"maxPerRow": 2,
@ -2271,7 +2388,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 51
"y": 57
},
"id": 97,
"maxPerRow": 2,
@ -2321,7 +2438,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 59
"y": 65
},
"id": 68,
"panels": [],
@ -2392,7 +2509,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 60
"y": 66
},
"id": 100,
"options": {
@ -2511,7 +2628,7 @@
"h": 8,
"w": 12,
"x": 12,
"y": 60
"y": 66
},
"id": 101,
"options": {
@ -2601,7 +2718,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 68
"y": 74
},
"id": 49,
"options": {
@ -2653,7 +2770,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 68
"y": 82
},
"id": 60,
"panels": [

View File

@ -60,6 +60,11 @@ object MetricsConstants {
final val ENGINE_TIMEOUT: String = ENGINE + "timeout"
final val ENGINE_TOTAL: String = ENGINE + "total"
final private val ENGINE_STARTUP_PERMIT: String = ENGINE + "startup.permit."
final val ENGINE_STARTUP_PERMIT_LIMIT: String = ENGINE_STARTUP_PERMIT + "limit"
final val ENGINE_STARTUP_PERMIT_AVAILABLE: String = ENGINE_STARTUP_PERMIT + "available"
final val ENGINE_STARTUP_PERMIT_WAITING: String = ENGINE_STARTUP_PERMIT + "waiting"
final private val OPERATION = KYUUBI + "operation."
final val OPERATION_OPEN: String = OPERATION + "opened"
final val OPERATION_FAIL: String = OPERATION + "failed"

View File

@ -300,6 +300,14 @@ class KyuubiSessionManager private (name: String) extends SessionManager(name) {
ms.registerGauge(EXEC_POOL_ALIVE, getExecPoolSize, 0)
ms.registerGauge(EXEC_POOL_ACTIVE, getActiveCount, 0)
ms.registerGauge(EXEC_POOL_WORK_QUEUE_SIZE, getWorkQueueSize, 0)
this.engineStartupProcessSemaphore.foreach { semaphore =>
ms.markMeter(ENGINE_STARTUP_PERMIT_LIMIT, semaphore.availablePermits)
ms.registerGauge(
ENGINE_STARTUP_PERMIT_AVAILABLE,
semaphore.availablePermits,
semaphore.availablePermits)
ms.registerGauge(ENGINE_STARTUP_PERMIT_WAITING, semaphore.getQueueLength, 0)
}
}
super.start()
startEngineAliveChecker()