From cb6e2202ae80232cdfbaddb7f960654dd3004c31 Mon Sep 17 00:00:00 2001 From: mingji Date: Fri, 5 Jul 2024 09:55:02 +0800 Subject: [PATCH] [CELEBORN-1491] introduce flusher working queue size metric ### What changes were proposed in this pull request? Add metrics about flusher working queue size. ### Why are the changes needed? To show if there is an accumulation of flush tasks. ### Does this PR introduce _any_ user-facing change? NO. ### How was this patch tested? GA. Closes #2598 from FMX/b1491. Authored-by: mingji Signed-off-by: Shuang --- assets/grafana/celeborn-dashboard.json | 760 ++++++------------ .../service/deploy/worker/WorkerSource.scala | 1 + .../deploy/worker/storage/Flusher.scala | 14 +- 3 files changed, 272 insertions(+), 503 deletions(-) diff --git a/assets/grafana/celeborn-dashboard.json b/assets/grafana/celeborn-dashboard.json index 70b26ce3f..57c85bc76 100644 --- a/assets/grafana/celeborn-dashboard.json +++ b/assets/grafana/celeborn-dashboard.json @@ -2,7 +2,7 @@ "__inputs": [ { "name": "DS_PROMETHEUS", - "label": "Prometheus", + "label": "prometheus", "description": "", "type": "datasource", "pluginId": "prometheus", @@ -15,7 +15,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "10.0.3" + "version": "11.0.0" }, { "type": "datasource", @@ -74,13 +74,14 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "The count of active workers.", + "description": "The count of registered shuffle.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -94,6 +95,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -133,7 +135,7 @@ "x": 0, "y": 1 }, - "id": 2, + "id": 94, "options": { "legend": { "calcs": [], @@ -142,6 +144,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -152,12 +155,12 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "expr": "metrics_WorkerCount_Value", + "expr": "metrics_RegisteredShuffleCount_Value", "legendFormat": "${baseLegend}", "refId": "A" } ], - "title": "metrics_WorkerCount_Value", + "title": "metrics_RegisteredShuffleCount_Value", "type": "timeseries" }, { @@ -165,13 +168,14 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "The count of registered shuffle.", + "description": "The count of active workers.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -185,6 +189,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -224,7 +229,7 @@ "x": 12, "y": 1 }, - "id": 94, + "id": 2, "options": { "legend": { "calcs": [], @@ -233,6 +238,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -243,12 +249,12 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "expr": "metrics_RegisteredShuffleCount_Value", + "expr": "metrics_WorkerCount_Value", "legendFormat": "${baseLegend}", "refId": "A" } ], - "title": "metrics_RegisteredShuffleCount_Value", + "title": "metrics_WorkerCount_Value", "type": "timeseries" }, { @@ -263,6 +269,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -274,6 +283,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -323,6 +333,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -353,6 +364,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -364,6 +378,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -413,6 +428,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -443,6 +459,7 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -456,6 +473,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -477,7 +495,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -503,6 +522,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -547,6 +567,7 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -560,6 +581,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -609,6 +631,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -641,6 +664,7 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -654,6 +678,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -703,6 +728,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -728,13 +754,14 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "The active shuffle size of workers.", + "description": "The count of workers in shutdown list.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -748,6 +775,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -777,8 +805,7 @@ "value": 80 } ] - }, - "unit": "bytes" + } }, "overrides": [] }, @@ -788,7 +815,7 @@ "x": 0, "y": 10 }, - "id": 122, + "id": 189, "options": { "legend": { "calcs": [], @@ -797,6 +824,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -807,14 +835,13 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "editorMode": "code", - "expr": "metrics_ActiveShuffleSize_Value{role=\"Master\"}", + "expr": "metrics_ShutdownWorkerCount_Value", "legendFormat": "${baseLegend}", "range": true, "refId": "A" } ], - "title": "metrics_ActiveShuffleSize_Value", + "title": "metrics_ShutdownWorkerCount_Value", "type": "timeseries" }, { @@ -829,6 +856,7 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -842,6 +870,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -890,6 +919,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -915,12 +945,14 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "The active shuffle size of workers.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -934,6 +966,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -964,7 +997,7 @@ } ] }, - "unit": "ms" + "unit": "bytes" }, "overrides": [] }, @@ -974,7 +1007,7 @@ "x": 0, "y": 18 }, - "id": 100, + "id": 122, "options": { "legend": { "calcs": [], @@ -983,6 +1016,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -993,12 +1027,14 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "expr": "metrics_OfferSlotsTime_Mean", + "editorMode": "code", + "expr": "metrics_ActiveShuffleSize_Value{role=\"Master\"}", "legendFormat": "${baseLegend}", + "range": true, "refId": "A" } ], - "title": "metrics_OfferSlotsTime_Mean", + "title": "metrics_ActiveShuffleSize_Value", "type": "timeseries" }, { @@ -1012,6 +1048,7 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -1025,6 +1062,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1074,6 +1112,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -1097,13 +1136,13 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "The count of workers in lost list.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -1117,6 +1156,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1138,15 +1178,15 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", "value": 80 } ] - } + }, + "unit": "ms" }, "overrides": [] }, @@ -1156,7 +1196,7 @@ "x": 0, "y": 26 }, - "id": 36, + "id": 100, "options": { "legend": { "calcs": [], @@ -1165,6 +1205,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -1175,13 +1216,12 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "expr": "metrics_LostWorkerCount_Value", + "expr": "metrics_OfferSlotsTime_Mean", "legendFormat": "${baseLegend}", - "range": true, "refId": "A" } ], - "title": "metrics_LostWorkerCount_Value", + "title": "metrics_OfferSlotsTime_Mean", "type": "timeseries" }, { @@ -1196,6 +1236,7 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -1209,6 +1250,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1230,8 +1272,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1257,6 +1298,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -1282,13 +1324,14 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "The count of workers in shutdown list.", + "description": "The count of workers in lost list.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -1302,6 +1345,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1323,8 +1367,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1350,6 +1393,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -1360,103 +1404,13 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "expr": "metrics_ShutdownWorkerCount_Value", + "expr": "metrics_LostWorkerCount_Value", "legendFormat": "${baseLegend}", "range": true, "refId": "A" } ], - "title": "metrics_ShutdownWorkerCount_Value", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "The count of workers in decommission list.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 34 - }, - "id": 189, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "metrics_DecommissionWorkerCount_Value", - "legendFormat": "${baseLegend}", - "range": true, - "refId": "A" - } - ], - "title": "metrics_DecommissionWorkerCount_Value", + "title": "metrics_LostWorkerCount_Value", "type": "timeseries" } ], @@ -1469,7 +1423,7 @@ "h": 1, "w": 24, "x": 0, - "y": 42 + "y": 2 }, "id": 28, "panels": [ @@ -1484,6 +1438,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1495,6 +1452,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1542,6 +1500,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -1571,6 +1530,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1582,6 +1544,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1630,6 +1593,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -1659,6 +1623,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1670,6 +1637,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1718,6 +1686,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -1728,8 +1697,10 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "editorMode": "code", "expr": "metrics_ReserveSlotsTime_Max", "legendFormat": "${baseLegend}", + "range": true, "refId": "A" } ], @@ -1747,6 +1718,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1758,6 +1732,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1805,6 +1780,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -1835,6 +1811,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1846,6 +1825,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1893,6 +1873,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -1923,6 +1904,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -1934,6 +1918,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1982,6 +1967,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -2012,6 +1998,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2023,6 +2012,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -2071,6 +2061,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -2102,6 +2093,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2113,6 +2107,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -2161,6 +2156,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -2194,6 +2190,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2205,6 +2204,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -2252,6 +2252,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -2284,6 +2285,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2295,6 +2299,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -2342,6 +2347,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -2372,6 +2378,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2383,6 +2392,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -2421,94 +2431,6 @@ "x": 12, "y": 46 }, - "id": 190, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "metrics_IsDecommissioningWorker_Value", - "legendFormat": "${baseLegend}", - "range": true, - "refId": "A" - } - ], - "title": "metrics_IsDecommissioningWorker_Value", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 54 - }, "id": 48, "options": { "legend": { @@ -2518,6 +2440,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -2548,6 +2471,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -2559,6 +2485,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -2594,10 +2521,10 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, + "x": 0, "y": 54 }, - "id": 49, + "id": 193, "options": { "legend": { "calcs": [], @@ -2606,6 +2533,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -2616,189 +2544,19 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "expr": "metrics_CleanTaskQueueSize_Value", - "legendFormat": "${baseLegend}", + "disableTextWrap": false, + "editorMode": "builder", + "expr": "metrics_FlushWorkingQueueSize_Value", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", "range": true, - "refId": "A" + "refId": "A", + "useBackend": false } ], - "title": "metrics_CleanTaskQueueSize_Value", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 62 - }, - "id": 51, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "metrics_CleanExpiredShuffleKeysTime_Mean", - "legendFormat": "${baseLegend}", - "refId": "A" - } - ], - "title": "metrics_CleanExpiredShuffleKeysTime_Mean", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 62 - }, - "id": 52, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "metrics_CleanExpiredShuffleKeysTime_Max", - "legendFormat": "${baseLegend}", - "refId": "A" - } - ], - "title": "metrics_CleanExpiredShuffleKeysTime_Max", + "title": "metrics_FlushWorkingQueueSize_Value", "type": "timeseries" } ], @@ -2811,7 +2569,7 @@ "h": 1, "w": 24, "x": 0, - "y": 71 + "y": 3 }, "id": 134, "panels": [ @@ -2876,7 +2634,7 @@ "h": 9, "w": 12, "x": 0, - "y": 72 + "y": 145 }, "id": 68, "options": { @@ -2966,7 +2724,7 @@ "h": 9, "w": 12, "x": 12, - "y": 72 + "y": 145 }, "id": 70, "options": { @@ -3056,7 +2814,7 @@ "h": 9, "w": 12, "x": 0, - "y": 81 + "y": 154 }, "id": 72, "options": { @@ -3146,7 +2904,7 @@ "h": 9, "w": 12, "x": 12, - "y": 81 + "y": 154 }, "id": 74, "options": { @@ -3235,7 +2993,7 @@ "h": 8, "w": 12, "x": 0, - "y": 90 + "y": 163 }, "id": 83, "options": { @@ -3326,7 +3084,7 @@ "h": 8, "w": 12, "x": 12, - "y": 90 + "y": 163 }, "id": 76, "options": { @@ -3417,7 +3175,7 @@ "h": 8, "w": 12, "x": 0, - "y": 98 + "y": 171 }, "id": 128, "options": { @@ -3508,7 +3266,7 @@ "h": 8, "w": 12, "x": 12, - "y": 98 + "y": 171 }, "id": 129, "options": { @@ -3599,7 +3357,7 @@ "h": 8, "w": 12, "x": 0, - "y": 106 + "y": 179 }, "id": 130, "options": { @@ -3690,7 +3448,7 @@ "h": 8, "w": 12, "x": 12, - "y": 106 + "y": 179 }, "id": 132, "options": { @@ -3781,7 +3539,7 @@ "h": 8, "w": 12, "x": 0, - "y": 179 + "y": 187 }, "id": 131, "options": { @@ -3872,7 +3630,7 @@ "h": 8, "w": 12, "x": 12, - "y": 114 + "y": 187 }, "id": 133, "options": { @@ -3963,7 +3721,7 @@ "h": 8, "w": 12, "x": 0, - "y": 122 + "y": 166 }, "id": 79, "options": { @@ -4004,7 +3762,7 @@ "h": 1, "w": 24, "x": 0, - "y": 72 + "y": 4 }, "id": 12, "panels": [ @@ -4069,7 +3827,7 @@ "h": 8, "w": 12, "x": 0, - "y": 188 + "y": 196 }, "id": 66, "options": { @@ -4159,7 +3917,7 @@ "h": 8, "w": 12, "x": 12, - "y": 188 + "y": 196 }, "id": 96, "options": { @@ -4249,7 +4007,7 @@ "h": 8, "w": 12, "x": 0, - "y": 196 + "y": 204 }, "id": 17, "options": { @@ -4339,7 +4097,7 @@ "h": 8, "w": 12, "x": 12, - "y": 196 + "y": 204 }, "id": 18, "options": { @@ -4428,7 +4186,7 @@ "h": 8, "w": 12, "x": 0, - "y": 204 + "y": 212 }, "id": 81, "options": { @@ -4519,7 +4277,7 @@ "h": 8, "w": 12, "x": 12, - "y": 204 + "y": 212 }, "id": 77, "options": { @@ -4610,7 +4368,7 @@ "h": 8, "w": 12, "x": 0, - "y": 212 + "y": 220 }, "id": 82, "options": { @@ -4701,7 +4459,7 @@ "h": 8, "w": 12, "x": 12, - "y": 212 + "y": 220 }, "id": 75, "options": { @@ -4792,7 +4550,7 @@ "h": 8, "w": 12, "x": 0, - "y": 220 + "y": 228 }, "id": 73, "options": { @@ -4833,7 +4591,7 @@ "h": 1, "w": 24, "x": 0, - "y": 73 + "y": 5 }, "id": 10, "panels": [ @@ -4898,7 +4656,7 @@ "h": 8, "w": 12, "x": 0, - "y": 155 + "y": 163 }, "id": 78, "options": { @@ -4988,7 +4746,7 @@ "h": 8, "w": 12, "x": 12, - "y": 155 + "y": 163 }, "id": 80, "options": { @@ -5078,7 +4836,7 @@ "h": 9, "w": 12, "x": 0, - "y": 163 + "y": 171 }, "id": 4, "options": { @@ -5168,7 +4926,7 @@ "h": 9, "w": 12, "x": 12, - "y": 163 + "y": 171 }, "id": 6, "options": { @@ -5258,7 +5016,7 @@ "h": 8, "w": 12, "x": 0, - "y": 172 + "y": 180 }, "id": 56, "options": { @@ -5348,7 +5106,7 @@ "h": 8, "w": 12, "x": 12, - "y": 172 + "y": 180 }, "id": 58, "options": { @@ -5387,7 +5145,7 @@ "h": 1, "w": 24, "x": 0, - "y": 74 + "y": 6 }, "id": 8, "panels": [ @@ -5402,6 +5160,7 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -5415,6 +5174,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -5436,8 +5196,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -5464,6 +5223,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -5474,6 +5234,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "editorMode": "code", "expr": "metrics_NettyMemory_Value", "legendFormat": "${baseLegend}", "range": true, @@ -5494,6 +5255,7 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -5507,6 +5269,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -5528,8 +5291,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -5547,7 +5309,7 @@ "x": 12, "y": 7 }, - "id": 185, + "id": 190, "options": { "legend": { "calcs": [], @@ -5556,6 +5318,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -5587,6 +5350,7 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -5600,6 +5364,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -5621,8 +5386,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -5640,7 +5404,7 @@ "x": 0, "y": 15 }, - "id": 186, + "id": 191, "options": { "legend": { "calcs": [], @@ -5649,6 +5413,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -5680,6 +5445,7 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -5693,6 +5459,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -5714,8 +5481,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -5741,6 +5507,7 @@ "showLegend": true }, "tooltip": { + "maxHeight": 600, "mode": "single", "sort": "none" } @@ -5806,8 +5573,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -5898,8 +5664,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -5990,8 +5755,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -6083,8 +5847,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -6176,8 +5939,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -6269,8 +6031,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -6362,8 +6123,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -6786,7 +6546,7 @@ "h": 1, "w": 24, "x": 0, - "y": 75 + "y": 7 }, "id": 50, "panels": [ @@ -6851,7 +6611,7 @@ "h": 8, "w": 12, "x": 0, - "y": 230 + "y": 238 }, "id": 44, "options": { @@ -6941,7 +6701,7 @@ "h": 8, "w": 12, "x": 12, - "y": 230 + "y": 238 }, "id": 46, "options": { @@ -7030,9 +6790,9 @@ "h": 8, "w": 12, "x": 0, - "y": 238 + "y": 246 }, - "id": 48, + "id": 192, "options": { "legend": { "calcs": [], @@ -7120,7 +6880,7 @@ "h": 8, "w": 12, "x": 12, - "y": 238 + "y": 246 }, "id": 180, "options": { @@ -7211,7 +6971,7 @@ "h": 8, "w": 12, "x": 0, - "y": 246 + "y": 254 }, "id": 88, "options": { @@ -7301,7 +7061,7 @@ "h": 8, "w": 12, "x": 12, - "y": 246 + "y": 254 }, "id": 135, "options": { @@ -7342,7 +7102,7 @@ "h": 1, "w": 24, "x": 0, - "y": 76 + "y": 8 }, "id": 157, "panels": [ @@ -7408,7 +7168,7 @@ "h": 8, "w": 12, "x": 0, - "y": 255 + "y": 263 }, "id": 159, "options": { @@ -7501,7 +7261,7 @@ "h": 8, "w": 12, "x": 12, - "y": 255 + "y": 263 }, "id": 160, "options": { @@ -7594,7 +7354,7 @@ "h": 8, "w": 12, "x": 0, - "y": 263 + "y": 271 }, "id": 161, "options": { @@ -7635,7 +7395,7 @@ "h": 1, "w": 24, "x": 0, - "y": 77 + "y": 9 }, "id": 137, "panels": [ @@ -7700,7 +7460,7 @@ "h": 9, "w": 12, "x": 0, - "y": 272 + "y": 280 }, "id": 139, "options": { @@ -7792,7 +7552,7 @@ "h": 9, "w": 12, "x": 12, - "y": 272 + "y": 280 }, "id": 141, "options": { @@ -7884,7 +7644,7 @@ "h": 9, "w": 12, "x": 0, - "y": 281 + "y": 289 }, "id": 142, "options": { @@ -7976,7 +7736,7 @@ "h": 9, "w": 12, "x": 12, - "y": 281 + "y": 289 }, "id": 143, "options": { @@ -8068,7 +7828,7 @@ "h": 9, "w": 12, "x": 0, - "y": 290 + "y": 298 }, "id": 144, "options": { @@ -8160,7 +7920,7 @@ "h": 9, "w": 12, "x": 12, - "y": 290 + "y": 298 }, "id": 145, "options": { @@ -8252,7 +8012,7 @@ "h": 9, "w": 12, "x": 0, - "y": 299 + "y": 307 }, "id": 146, "options": { @@ -8344,7 +8104,7 @@ "h": 9, "w": 12, "x": 12, - "y": 299 + "y": 307 }, "id": 147, "options": { @@ -8436,7 +8196,7 @@ "h": 9, "w": 12, "x": 0, - "y": 308 + "y": 316 }, "id": 148, "options": { @@ -8528,7 +8288,7 @@ "h": 9, "w": 12, "x": 12, - "y": 308 + "y": 316 }, "id": 149, "options": { @@ -8620,7 +8380,7 @@ "h": 9, "w": 12, "x": 0, - "y": 317 + "y": 325 }, "id": 150, "options": { @@ -8712,7 +8472,7 @@ "h": 9, "w": 12, "x": 12, - "y": 317 + "y": 325 }, "id": 151, "options": { @@ -8803,7 +8563,7 @@ "h": 8, "w": 12, "x": 0, - "y": 326 + "y": 334 }, "id": 153, "options": { @@ -8894,7 +8654,7 @@ "h": 8, "w": 12, "x": 12, - "y": 326 + "y": 334 }, "id": 154, "options": { @@ -8985,7 +8745,7 @@ "h": 8, "w": 12, "x": 0, - "y": 334 + "y": 342 }, "id": 155, "options": { @@ -9026,7 +8786,7 @@ "h": 1, "w": 24, "x": 0, - "y": 78 + "y": 10 }, "id": 110, "panels": [ @@ -9090,7 +8850,7 @@ "h": 8, "w": 12, "x": 0, - "y": 343 + "y": 351 }, "id": 112, "options": { @@ -9181,7 +8941,7 @@ "h": 8, "w": 12, "x": 12, - "y": 343 + "y": 351 }, "id": 116, "options": { @@ -9222,7 +8982,7 @@ "h": 1, "w": 24, "x": 0, - "y": 79 + "y": 11 }, "id": 123, "panels": [ @@ -9287,7 +9047,7 @@ "h": 8, "w": 12, "x": 0, - "y": 352 + "y": 360 }, "id": 125, "options": { @@ -9380,7 +9140,7 @@ "h": 8, "w": 12, "x": 12, - "y": 352 + "y": 360 }, "id": 126, "options": { @@ -9473,7 +9233,7 @@ "h": 8, "w": 12, "x": 0, - "y": 360 + "y": 368 }, "id": 163, "options": { @@ -9566,7 +9326,7 @@ "h": 8, "w": 12, "x": 12, - "y": 360 + "y": 368 }, "id": 162, "options": { @@ -9659,7 +9419,7 @@ "h": 8, "w": 12, "x": 0, - "y": 368 + "y": 376 }, "id": 127, "options": { @@ -9700,7 +9460,7 @@ "h": 1, "w": 24, "x": 0, - "y": 80 + "y": 12 }, "id": 172, "panels": [ @@ -9765,7 +9525,7 @@ "h": 9, "w": 12, "x": 0, - "y": 377 + "y": 385 }, "id": 174, "options": { @@ -9858,7 +9618,7 @@ "h": 9, "w": 12, "x": 12, - "y": 377 + "y": 385 }, "id": 176, "options": { @@ -9950,7 +9710,7 @@ "h": 9, "w": 12, "x": 0, - "y": 386 + "y": 394 }, "id": 175, "options": { @@ -10043,7 +9803,7 @@ "h": 9, "w": 12, "x": 12, - "y": 386 + "y": 394 }, "id": 177, "options": { @@ -10081,14 +9841,13 @@ ], "refresh": "5s", "revision": 1, - "schemaVersion": 38, - "style": "dark", + "schemaVersion": 39, "tags": [], "templating": { "list": [ { "current": { - "selected": true, + "selected": false, "text": "__auto", "value": "__auto" }, @@ -10116,10 +9875,11 @@ "from": "now-30m", "to": "now" }, + "timeRangeUpdatedDuringEditOrView": false, "timepicker": {}, "timezone": "", "title": "Celeborn", "uid": "U_qgru_7z", - "version": 1, + "version": 2, "weekStart": "" } \ No newline at end of file diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala index f4b152b08..5358fab02 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala @@ -168,6 +168,7 @@ object WorkerSource { val TAKE_BUFFER_TIME = "TakeBufferTime" val FLUSH_DATA_TIME = "FlushDataTime" val COMMIT_FILES_TIME = "CommitFilesTime" + val FLUSH_WORKING_QUEUE_SIZE = "FlushWorkingQueueSize" // slots val SLOTS_ALLOCATED = "SlotsAllocated" diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/Flusher.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/Flusher.scala index e5d93fa32..cfc94e962 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/Flusher.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/Flusher.scala @@ -33,6 +33,7 @@ import org.apache.celeborn.common.metrics.source.{AbstractSource, ThreadPoolSour import org.apache.celeborn.common.protocol.StorageInfo import org.apache.celeborn.common.util.{ThreadUtils, Utils} import org.apache.celeborn.service.deploy.worker.WorkerSource +import org.apache.celeborn.service.deploy.worker.WorkerSource.FLUSH_WORKING_QUEUE_SIZE import org.apache.celeborn.service.deploy.worker.congestcontrol.CongestionController import org.apache.celeborn.service.deploy.worker.memory.MemoryManager @@ -41,7 +42,8 @@ abstract private[worker] class Flusher( val threadCount: Int, val allocator: PooledByteBufAllocator, val maxComponents: Int, - flushTimeMetric: TimeWindow) extends Logging { + flushTimeMetric: TimeWindow, + mountPoint: String) extends Logging { protected lazy val flusherId: Int = System.identityHashCode(this) protected val workingQueues = new Array[LinkedBlockingQueue[FlushTask]](threadCount) protected val bufferQueue = new LinkedBlockingQueue[CompositeByteBuf]() @@ -95,6 +97,10 @@ abstract private[worker] class Flusher( } } }) + workerSource.addGauge(FLUSH_WORKING_QUEUE_SIZE, Map("mountpoint" -> s"$mountPoint-$index")) { + () => + workingQueues(index).size() + } } ThreadPoolSource.registerSource(s"$this", workers) } @@ -147,7 +153,8 @@ private[worker] class LocalFlusher( threadCount, allocator, maxComponents, - timeWindow) + timeWindow, + mountPoint) with DeviceObserver with Logging { deviceMonitor.registerFlusher(this) @@ -182,7 +189,8 @@ final private[worker] class HdfsFlusher( hdfsFlusherThreads, allocator, maxComponents, - null) with Logging { + null, + "HDFS") with Logging { override def processIOException(e: IOException, deviceErrorType: DiskStatus): Unit = { logError(s"$this write failed, reason $deviceErrorType ,exception: $e")