[CELEBORN-1007] Improve JVM metrics naming and add ThreadStates metrics
### What changes were proposed in this pull request?
Since we use Codahale metrics to expose JVM metrics, metric names without a prefix are unclear, and it's not easy to build a Grafana template for these metrics because the collector name or pool name is embedded in the metric name rather than exposed as a label.
This PR adds JVM metric prefixes, removes the pool info from the metric name, and exposes the pool name as a label where needed.
It also adds ThreadStates metrics.
### Why are the changes needed?
Make JVM metrics easier to understand and easier to build dashboard templates for.
### Does this PR introduce _any_ user-facing change?
Yes. The naming of JVM metrics is changed, and thread state metrics are additionally exposed.
Examples of the changes:
For GarbageCollectorMetricSet, G1-Old-Generation.time -> jvm.gc.time{name="G1-Old-Generation"}
For MemoryUsageGaugeSet, total.init -> jvm.memory.total.init ; pools.Metaspace.usage -> jvm.memory.pools.usage{name="Metaspace"}
For BufferPoolMetricSet, direct.count -> jvm.direct.count
For ThreadStatesGaugeSet, add jvm.thread.count.
For G1, the jvm metrics exposed now:
metrics_jvm_gc_time_Value{name="G1-Old-Generation",role="Worker"} 0 1695731141588
metrics_jvm_gc_count_Value{name="G1-Young-Generation",role="Worker"} 2 1695731141588
metrics_jvm_gc_time_Value{name="G1-Young-Generation",role="Worker"} 74 1695731141588
metrics_jvm_gc_count_Value{name="G1-Old-Generation",role="Worker"} 0 1695731141588
metrics_jvm_heap_committed_Value{role="Worker"} 2109734912 1695731141588
metrics_jvm_non_heap_used_Value{role="Worker"} 47700056 1695731141588
metrics_jvm_heap_used_Value{role="Worker"} 82801184 1695731141588
metrics_jvm_total_committed_Value{role="Worker"} 2160263168 1695731141588
metrics_jvm_total_init_Value{role="Worker"} 2112290816 1695731141588
metrics_jvm_non_heap_max_Value{role="Worker"} -1 1695731141588
metrics_jvm_heap_usage_Value{role="Worker"} 0.009639326483011246 1695731141588
metrics_jvm_total_used_Value{role="Worker"} 130502480 1695731141589
metrics_jvm_heap_init_Value{role="Worker"} 2109734912 1695731141589
metrics_jvm_non_heap_committed_Value{role="Worker"} 50528256 1695731141589
metrics_jvm_non_heap_init_Value{role="Worker"} 2555904 1695731141589
metrics_jvm_non_heap_usage_Value{role="Worker"} -4.7701296E7 1695731141589
metrics_jvm_heap_max_Value{role="Worker"} 8589934592 1695731141589
metrics_jvm_total_max_Value{role="Worker"} 8589934591 1695731141589
metrics_jvm_memory_pool_used_Value{name="Code-Cache",role="Worker"} 10314368 1695731141588
metrics_jvm_memory_pool_committed_Value{name="Code-Cache",role="Worker"} 10944512 1695731141588
metrics_jvm_memory_pool_init_Value{name="G1-Eden-Space",role="Worker"} 111149056 1695731141588
metrics_jvm_memory_pool_max_Value{name="G1-Old-Gen",role="Worker"} 8589934592 1695731141588
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Survivor-Space",role="Worker"} 14680064 1695731141588
metrics_jvm_memory_pool_used_Value{name="Compressed-Class-Space",role="Worker"} 4440192 1695731141588
metrics_jvm_memory_pool_usage_Value{name="Metaspace",role="Worker"} 0.9449504192610433 1695731141588
metrics_jvm_memory_pool_max_Value{name="Metaspace",role="Worker"} -1 1695731141588
metrics_jvm_memory_pool_init_Value{name="G1-Survivor-Space",role="Worker"} 0 1695731141588
metrics_jvm_memory_pool_committed_Value{name="G1-Old-Gen",role="Worker"} 1998585856 1695731141588
metrics_jvm_memory_pool_committed_Value{name="G1-Survivor-Space",role="Worker"} 14680064 1695731141588
metrics_jvm_memory_pool_committed_Value{name="G1-Eden-Space",role="Worker"} 96468992 1695731141588
metrics_jvm_memory_pool_max_Value{name="G1-Survivor-Space",role="Worker"} -1 1695731141588
metrics_jvm_memory_pool_usage_Value{name="Compressed-Class-Space",role="Worker"} 0.004135251045227051 1695731141588
metrics_jvm_memory_pool_usage_Value{name="G1-Survivor-Space",role="Worker"} 1.0 1695731141588
metrics_jvm_memory_pool_max_Value{name="Code-Cache",role="Worker"} 251658240 1695731141588
metrics_jvm_memory_pool_init_Value{name="Compressed-Class-Space",role="Worker"} 0 1695731141589
metrics_jvm_memory_pool_usage_Value{name="G1-Eden-Space",role="Worker"} 0.34782608695652173 1695731141589
metrics_jvm_memory_pool_init_Value{name="Metaspace",role="Worker"} 0 1695731141589
metrics_jvm_memory_pool_max_Value{name="G1-Eden-Space",role="Worker"} -1 1695731141589
metrics_jvm_memory_pool_usage_Value{name="Code-Cache",role="Worker"} 0.04098917643229167 1695731141589
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Eden-Space",role="Worker"} 0 1695731141589
metrics_jvm_memory_pool_init_Value{name="Code-Cache",role="Worker"} 2555904 1695731141589
metrics_jvm_memory_pool_used_Value{name="G1-Survivor-Space",role="Worker"} 14680064 1695731141589
metrics_jvm_memory_pool_committed_Value{name="Compressed-Class-Space",role="Worker"} 4718592 1695731141589
metrics_jvm_memory_pool_used_Value{name="G1-Eden-Space",role="Worker"} 33554432 1695731141589
metrics_jvm_memory_pool_used_Value{name="G1-Old-Gen",role="Worker"} 34566688 1695731141589
metrics_jvm_memory_pool_usage_Value{name="G1-Old-Gen",role="Worker"} 0.004024092108011246 1695731141589
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Old-Gen",role="Worker"} 0 1695731141589
metrics_jvm_memory_pool_committed_Value{name="Metaspace",role="Worker"} 34865152 1695731141589
metrics_jvm_memory_pool_init_Value{name="G1-Old-Gen",role="Worker"} 1998585856 1695731141589
metrics_jvm_memory_pool_used_Value{name="Metaspace",role="Worker"} 32945840 1695731141589
metrics_jvm_memory_pool_max_Value{name="Compressed-Class-Space",role="Worker"} 1073741824 1695731141589
metrics_jvm_direct_count_Value{role="Worker"} 8 1695731141589
metrics_jvm_direct_capacity_Value{role="Worker"} 1036 1695731141589
metrics_jvm_direct_used_Value{role="Worker"} 1037 1695731141589
metrics_jvm_mapped_used_Value{role="Worker"} 0 1695731141589
metrics_jvm_mapped_capacity_Value{role="Worker"} 0 1695731141589
metrics_jvm_mapped_count_Value{role="Worker"} 0 1695731141589
metrics_jvm_thread_timed_waiting_count_Value{role="Worker"} 23 1695731141589
metrics_jvm_thread_deadlock_count_Value{role="Worker"} 0 1695731141589
metrics_jvm_thread_count_Value{role="Worker"} 78 1695731141589
metrics_jvm_thread_waiting_count_Value{role="Worker"} 45 1695731141589
metrics_jvm_thread_daemon_count_Value{role="Worker"} 75 1695731141589
metrics_jvm_thread_new_count_Value{role="Worker"} 0 1695731141589
metrics_jvm_thread_blocked_count_Value{role="Worker"} 0 1695731141590
metrics_jvm_thread_deadlocks_Value{role="Worker"} [] 1695731141590
metrics_jvm_thread_runnable_count_Value{role="Worker"} 10 1695731141590
metrics_jvm_thread_terminated_count_Value{role="Worker"} 0 1695731141590
### How was this patch tested?
Unit tests and cluster tests with G1, PS-Scavenge/PS-MarkSweep and ParNew/CMS.
Closes #1939 from onebox-li/improve-jvm-metrics.
Authored-by: onebox-li <lyh-36@163.com>
Signed-off-by: zky.zhoukeyong <zky.zhoukeyong@alibaba-inc.com>
This commit is contained in:
parent
3e515c5d2e
commit
b4dfc09dcf
@ -20,26 +20,86 @@ package org.apache.celeborn.common.metrics.source
|
||||
import java.lang.management.ManagementFactory
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable
|
||||
|
||||
import com.codahale.metrics.Gauge
|
||||
import com.codahale.metrics.jvm.{BufferPoolMetricSet, GarbageCollectorMetricSet, MemoryUsageGaugeSet}
|
||||
import com.codahale.metrics.{Gauge, MetricRegistry}
|
||||
import com.codahale.metrics.jvm.{BufferPoolMetricSet, GarbageCollectorMetricSet, MemoryUsageGaugeSet, ThreadStatesGaugeSet}
|
||||
|
||||
import org.apache.celeborn.common.CelebornConf
|
||||
|
||||
class JVMSource(conf: CelebornConf, role: String) extends AbstractSource(conf, role) {
|
||||
override val sourceName = "JVM"
|
||||
|
||||
// all of metrics of GCMetricSet and BufferPoolMetricSet are Gauge
|
||||
Seq(
|
||||
new GarbageCollectorMetricSet(),
|
||||
new MemoryUsageGaugeSet(),
|
||||
new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer))
|
||||
.map { x =>
|
||||
x.getMetrics.asScala.map {
|
||||
case (name: String, metric: Gauge[_]) => addGauge(name, metric)
|
||||
case (name, metric) => new IllegalArgumentException(s"Unknown metric type: $name: $metric")
|
||||
import JVMSource._
|
||||
|
||||
// Names of the JVM's garbage collectors, with every match of WHITESPACE replaced by "-".
private val gcNames =
  for (gcBean <- ManagementFactory.getGarbageCollectorMXBeans.asScala)
    yield WHITESPACE.matcher(gcBean.getName).replaceAll("-")

// Names of the JVM's memory pools, with every match of WHITESPACE replaced by "-".
private val poolNames =
  for (poolBean <- ManagementFactory.getMemoryPoolMXBeans.asScala)
    yield WHITESPACE.matcher(poolBean.getName).replaceAll("-")
|
||||
|
||||
/**
 * Add a JVM metric prefix, strip the collector/pool name out of the metric name and, when a
 * name was stripped, return it as a `name` label.
 *
 * The first entry of `targets` that occurs in `metricName` wins. When `replacement` is
 * non-empty the matched target is replaced by it, e.g. with prefix "jvm" and replacement "gc",
 * "G1-Old-Generation.time" becomes "jvm.gc.time". When `replacement` is empty the target is
 * removed together with the "." that follows it, e.g. with prefix "jvm.memory",
 * "pools.G1-Eden-Space.init" becomes "jvm.memory.pools.init". If no target occurs in
 * `metricName`, the name is only prefixed and no labels are returned.
 *
 * @param metricName metric name from MetricSet
 * @param targets keywords (collector or pool names) to look for in the metric name
 * @param prefix prefix for new metric name
 * @param replacement replacement for the matched target; empty to drop the target entirely
 * @return the new metric name without the target, and the labels if a target matched
 */
def handleJVMMetricName(
    metricName: String,
    targets: mutable.Buffer[String],
    prefix: String,
    replacement: String): (String, Map[String, String]) = {
  // `find` stops at the first matching target, so no non-local `return` out of a loop
  // closure is needed.
  targets.find(target => metricName.contains(target)) match {
    case Some(target) =>
      // With an empty replacement the separator after the target has to go as well,
      // otherwise a doubled "." would be left behind in the name.
      val replaceTarget = if (replacement.isEmpty) target + "." else target
      val newName = MetricRegistry.name(prefix, metricName.replace(replaceTarget, replacement))
      (newName, Map("name" -> target))
    case None =>
      (MetricRegistry.name(prefix, metricName), Map.empty[String, String])
  }
}
|
||||
|
||||
// All metrics in the MetricSets registered here are expected to be gauges.
// Register the garbage collector gauges under the JVM prefix; the collector name is moved out
// of the metric name (replaced by "gc") and passed to addGauge as a label.
new GarbageCollectorMetricSet().getMetrics.asScala.foreach {
  case (name: String, metric: Gauge[_]) =>
    val (newName, labels) = handleJVMMetricName(name, gcNames, JVM_METRIC_PREFIX, "gc")
    addGauge(newName, labels, metric)
  case (name, metric) =>
    // Throw rather than merely construct the exception, so that an unexpected metric type
    // cannot pass silently.
    throw new IllegalArgumentException(s"Unknown metric type: $name: $metric")
}
|
||||
|
||||
// Register the memory usage gauges under the JVM memory prefix; for per-pool metrics the pool
// name is removed from the metric name and passed to addGauge as a label.
new MemoryUsageGaugeSet().getMetrics.asScala.foreach {
  case (name: String, metric: Gauge[_]) =>
    val (newName, labels) = handleJVMMetricName(name, poolNames, JVM_METRIC_MEMORY_PREFIX, "")
    addGauge(newName, labels, metric)
  case (name, metric) =>
    // Throw rather than merely construct the exception, so that an unexpected metric type
    // cannot pass silently.
    throw new IllegalArgumentException(s"Unknown metric type: $name: $metric")
}
|
||||
|
||||
// Register the buffer pool gauges of the platform MBean server under the JVM prefix; the
// metric names are only prefixed and no labels are attached.
new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer).getMetrics.asScala.foreach {
  case (name: String, metric: Gauge[_]) =>
    addGauge(MetricRegistry.name(JVM_METRIC_PREFIX, name), metric)
  case (name, metric) =>
    // Throw rather than merely construct the exception, so that an unexpected metric type
    // cannot pass silently.
    throw new IllegalArgumentException(s"Unknown metric type: $name: $metric")
}
|
||||
|
||||
// Register the thread state gauges under the JVM thread prefix; the metric names are only
// prefixed and no labels are attached.
new ThreadStatesGaugeSet().getMetrics.asScala.foreach {
  case (name: String, metric: Gauge[_]) =>
    addGauge(MetricRegistry.name(JVM_METRIC_THREAD_PREFIX, name), metric)
  case (name, metric) =>
    // Throw rather than merely construct the exception, so that an unexpected metric type
    // cannot pass silently.
    throw new IllegalArgumentException(s"Unknown metric type: $name: $metric")
}
|
||||
|
||||
// start cleaner
|
||||
startCleaner()
|
||||
}
|
||||
|
||||
/** Constants used when naming JVM metrics. */
object JVMSource {
  // Root prefix; the memory and thread prefixes below are derived from it.
  private val JVM_METRIC_PREFIX = "jvm"
  private val JVM_METRIC_MEMORY_PREFIX = s"$JVM_METRIC_PREFIX.memory"
  private val JVM_METRIC_THREAD_PREFIX = s"$JVM_METRIC_PREFIX.thread"

  // Matches one or more consecutive whitespace characters.
  private val WHITESPACE = java.util.regex.Pattern.compile("\\s+")
}
|
||||
|
||||
@ -0,0 +1,55 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.celeborn.common.metrics.source
|
||||
|
||||
import org.apache.celeborn.CelebornFunSuite
|
||||
import org.apache.celeborn.common.CelebornConf
|
||||
|
||||
/** Checks how `JVMSource.handleJVMMetricName` rewrites GC and memory metric names. */
class JVMSourceSuite extends CelebornFunSuite {

  // Sample G1 collector and pool names in their dash-separated form.
  val gcNames = Seq("G1-Young-Generation", "G1-Old-Generation").toBuffer
  val poolNames = Seq("G1-Eden-Space", "G1-Survivor-Space", "G1-Old-Gen").toBuffer

  val JVM_METRIC_PREFIX = "jvm"
  val JVM_MEMORY_PREFIX = "jvm.memory"

  test("Test handleJVMMetricName") {
    val source = new JVMSource(new CelebornConf(), "test")

    // GC metrics: the collector name is replaced by "gc" and returned as the "name" label.
    val (oldGenName, oldGenLabels) =
      source.handleJVMMetricName("G1-Old-Generation.time", gcNames, JVM_METRIC_PREFIX, "gc")
    val (youngGenName, youngGenLabels) =
      source.handleJVMMetricName("G1-Young-Generation.count", gcNames, JVM_METRIC_PREFIX, "gc")
    assert(oldGenName == "jvm.gc.time")
    assert(oldGenLabels == Map("name" -> "G1-Old-Generation"))
    assert(youngGenName == "jvm.gc.count")
    assert(youngGenLabels == Map("name" -> "G1-Young-Generation"))

    // Memory metrics: a name without a pool is only prefixed; a per-pool name loses the pool
    // segment, which is returned as the "name" label.
    val (totalName, totalLabels) =
      source.handleJVMMetricName("total.init", poolNames, JVM_MEMORY_PREFIX, "")
    val (edenName, edenLabels) =
      source.handleJVMMetricName("pools.G1-Eden-Space.init", poolNames, JVM_MEMORY_PREFIX, "")
    assert(totalName == "jvm.memory.total.init")
    assert(totalLabels == Map.empty[String, String])
    assert(edenName == "jvm.memory.pools.init")
    assert(edenLabels == Map("name" -> "G1-Eden-Space"))
  }
}
|
||||
Loading…
Reference in New Issue
Block a user