From ee7448073731c7cf00487646710fc947a290c5e2 Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Tue, 8 Jun 2021 19:16:18 +0800 Subject: [PATCH] [KYUUBI #667] Better error msg if yarn queue has no resource during submitting app ### _Why are the changes needed?_ Give more readable error msg if the queue has no resource after submitting app timeout. before ``` Failed to detect the root cause, please check $engineLog at server side if necessary ``` after ``` The last status of Spark App is ACCEPTED, please check your cluster resource ``` ### _How was this patch tested?_ Add new test. Closes #667 from ulysses-you/submit-app-better-error-msg. Closes #667 c7b8185 [ulysses-you] improve 317a120 [ulysses-you] fix b64daac [ulysses-you] client c907cd6 [ulysses-you] test 7ae84ec [ulysses-you] dump last log ea35609 [ulysses-you] init Authored-by: ulysses-you Signed-off-by: Kent Yao --- .../apache/kyuubi/engine/ProcBuilder.scala | 9 ++- .../kyuubi/engine/EngineSubmitSuite.scala | 67 +++++++++++++++++++ .../kyuubi/server/MiniYarnService.scala | 12 +++- 3 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 kyuubi-main/src/test/scala/org/apache/kyuubi/engine/EngineSubmitSuite.scala diff --git a/kyuubi-main/src/main/scala/org/apache/kyuubi/engine/ProcBuilder.scala b/kyuubi-main/src/main/scala/org/apache/kyuubi/engine/ProcBuilder.scala index b71cffc8a..95bdfa7a5 100644 --- a/kyuubi-main/src/main/scala/org/apache/kyuubi/engine/ProcBuilder.scala +++ b/kyuubi-main/src/main/scala/org/apache/kyuubi/engine/ProcBuilder.scala @@ -63,6 +63,7 @@ trait ProcBuilder { } @volatile private var error: Throwable = UNCAUGHT_ERROR + @volatile private var lastRowOfLog: String = "unknown" // Visible for test private[kyuubi] var logCaptureThread: Thread = _ @@ -123,6 +124,8 @@ trait ProcBuilder { } error = KyuubiSQLException(sb.toString() + s"\n See more: $engineLog") + } else if (line != null) { + lastRowOfLog = line } line = reader.readLine() } @@ -150,8 +153,9 @@ trait ProcBuilder { 
Thread.sleep(1000) } error match { - case UNCAUGHT_ERROR => KyuubiSQLException( - s"Failed to detect the root cause, please check $engineLog at server side if necessary") + case UNCAUGHT_ERROR => + KyuubiSQLException(s"Failed to detect the root cause, please check $engineLog at server " + + s"side if necessary. The last line log is: $lastRowOfLog") case other => other } } @@ -161,5 +165,4 @@ object ProcBuilder extends Logging { private val PROC_BUILD_LOGGER = new NamedThreadFactory("process-logger-capture", daemon = true) private val UNCAUGHT_ERROR = new RuntimeException("Uncaught error") - } diff --git a/kyuubi-main/src/test/scala/org/apache/kyuubi/engine/EngineSubmitSuite.scala b/kyuubi-main/src/test/scala/org/apache/kyuubi/engine/EngineSubmitSuite.scala new file mode 100644 index 000000000..2da8bd394 --- /dev/null +++ b/kyuubi-main/src/test/scala/org/apache/kyuubi/engine/EngineSubmitSuite.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.kyuubi.engine + +import java.util.concurrent.CountDownLatch + +import org.apache.kyuubi.{KyuubiSQLException, WithKyuubiServerWithMiniYarnService} +import org.apache.kyuubi.config.KyuubiConf +import org.apache.kyuubi.operation.JDBCTestUtils + +class EngineSubmitSuite extends WithKyuubiServerWithMiniYarnService with JDBCTestUtils { + override protected def jdbcUrl: String = getJdbcUrl + override protected val kyuubiServerConf: KyuubiConf = KyuubiConf() + override protected val connectionConf: Map[String, String] = { + Map("spark.yarn.queue" -> "two_cores_queue", + "spark.master" -> "yarn", + "spark.submit.deployMode" -> "client", + "spark.executor.instances" -> "1", + "spark.driver.cores" -> "1", + "spark.executor.cores" -> "1", + KyuubiConf.ENGINE_SHARE_LEVEL.key -> "connection", + KyuubiConf.ENGINE_INIT_TIMEOUT.key -> "60000") + } + + test("submit spark app timeout with accepted status") { + @volatile var appIsRunning = false + val lock = new CountDownLatch(1) + new Thread(() => { + while (!appIsRunning) { Thread.sleep(100) } + try { + withJdbcStatement() { statement => + val exception = intercept[KyuubiSQLException] { + statement.execute("select 1") + } + + assert(exception.getMessage.contains("Failed to detect the root cause")) + assert(exception.getMessage.contains("The last line log")) + assert(exception.getMessage.contains("state: ACCEPTED")) + } + } finally { + lock.countDown() + } + }).start() + + withJdbcStatement() { statement => + appIsRunning = true + statement.execute("select 1") + // hold resource so that the queue has no resource for other app + lock.await() + } + } +} diff --git a/kyuubi-main/src/test/scala/org/apache/kyuubi/server/MiniYarnService.scala b/kyuubi-main/src/test/scala/org/apache/kyuubi/server/MiniYarnService.scala index 83f0bb7d6..1d4f6405f 100644 --- a/kyuubi-main/src/test/scala/org/apache/kyuubi/server/MiniYarnService.scala +++ b/kyuubi-main/src/test/scala/org/apache/kyuubi/server/MiniYarnService.scala @@ 
-47,15 +47,23 @@ class MiniYarnService(name: String) extends AbstractService(name) { "100.0") // capacity-scheduler.xml is missing in hadoop-client-minicluster so this is a workaround - yarnConfig.set("yarn.scheduler.capacity.root.queues", "default") + yarnConfig.set("yarn.scheduler.capacity.root.queues", "default,two_cores_queue") + yarnConfig.setInt("yarn.scheduler.capacity.root.default.capacity", 100) yarnConfig.setFloat("yarn.scheduler.capacity.root.default.user-limit-factor", 1) yarnConfig.setInt("yarn.scheduler.capacity.root.default.maximum-capacity", 100) yarnConfig.set("yarn.scheduler.capacity.root.default.state", "RUNNING") yarnConfig.set("yarn.scheduler.capacity.root.default.acl_submit_applications", "*") yarnConfig.set("yarn.scheduler.capacity.root.default.acl_administer_queue", "*") - yarnConfig.setInt("yarn.scheduler.capacity.node-locality-delay", -1) + yarnConfig.setInt("yarn.scheduler.capacity.root.two_cores_queue.maximum-capacity", 100) + yarnConfig.setInt("yarn.scheduler.capacity.root.two_cores_queue.maximum-applications", 2) + yarnConfig.setInt("yarn.scheduler.capacity.root.two_cores_queue.maximum-allocation-vcores", 2) + yarnConfig.setFloat("yarn.scheduler.capacity.root.two_cores_queue.user-limit-factor", 1) + yarnConfig.set("yarn.scheduler.capacity.root.two_cores_queue.acl_submit_applications", "*") + yarnConfig.set("yarn.scheduler.capacity.root.two_cores_queue.acl_administer_queue", "*") + + yarnConfig.setInt("yarn.scheduler.capacity.node-locality-delay", -1) // Set bind host to localhost to avoid java.net.BindException yarnConfig.set("yarn.resourcemanager.bind-host", "localhost")