[KYUUBI #667] Better error msg if yarn queue has no resource during submitting app
<!-- Thanks for sending a pull request! Here are some tips for you: 1. If this is your first time, please read our contributor guidelines: https://kyuubi.readthedocs.io/en/latest/community/contributions.html 2. If the PR is related to an issue in https://github.com/NetEase/kyuubi/issues, add '[KYUUBI #XXXX]' in your PR title, e.g., '[KYUUBI #XXXX] Your PR title ...'. 3. If the PR is unfinished, add '[WIP]' in your PR title, e.g., '[WIP][KYUUBI #XXXX] Your PR title ...'. --> ### _Why are the changes needed?_ <!-- Please clarify why the changes are needed. For instance, 1. If you add a feature, you can talk about the use case of it. 2. If you fix a bug, you can clarify why it is a bug. --> Give a more readable error message when submitting the app times out because the queue has no resources. before ``` Failed to detect the root cause, please check $engineLog at server side if necessary ``` after ``` The last status of Spark App is ACCEPTED, please check your cluster resource ``` ### _How was this patch tested?_ Add new test. Closes #667 from ulysses-you/submit-app-better-error-msg. Closes #667 c7b8185 [ulysses-you] improve 317a120 [ulysses-you] fix b64daac [ulysses-you] client c907cd6 [ulysses-you] test 7ae84ec [ulysses-you] dump last log ea35609 [ulysses-you] init Authored-by: ulysses-you <ulyssesyou18@gmail.com> Signed-off-by: Kent Yao <yao@apache.org>
This commit is contained in:
parent
e57a4346e2
commit
ee74480737
@ -63,6 +63,7 @@ trait ProcBuilder {
|
||||
}
|
||||
|
||||
@volatile private var error: Throwable = UNCAUGHT_ERROR
|
||||
@volatile private var lastRowOfLog: String = "unknown"
|
||||
// Visible for test
|
||||
private[kyuubi] var logCaptureThread: Thread = _
|
||||
|
||||
@ -123,6 +124,8 @@ trait ProcBuilder {
|
||||
}
|
||||
|
||||
error = KyuubiSQLException(sb.toString() + s"\n See more: $engineLog")
|
||||
} else if (line != null) {
|
||||
lastRowOfLog = line
|
||||
}
|
||||
line = reader.readLine()
|
||||
}
|
||||
@ -150,8 +153,9 @@ trait ProcBuilder {
|
||||
Thread.sleep(1000)
|
||||
}
|
||||
error match {
|
||||
case UNCAUGHT_ERROR => KyuubiSQLException(
|
||||
s"Failed to detect the root cause, please check $engineLog at server side if necessary")
|
||||
case UNCAUGHT_ERROR =>
|
||||
KyuubiSQLException(s"Failed to detect the root cause, please check $engineLog at server " +
|
||||
s"side if necessary. The last line log is: $lastRowOfLog")
|
||||
case other => other
|
||||
}
|
||||
}
|
||||
@ -161,5 +165,4 @@ object ProcBuilder extends Logging {
|
||||
private val PROC_BUILD_LOGGER = new NamedThreadFactory("process-logger-capture", daemon = true)
|
||||
|
||||
private val UNCAUGHT_ERROR = new RuntimeException("Uncaught error")
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,67 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.kyuubi.engine
|
||||
|
||||
import java.util.concurrent.CountDownLatch
|
||||
|
||||
import org.apache.kyuubi.{KyuubiSQLException, WithKyuubiServerWithMiniYarnService}
|
||||
import org.apache.kyuubi.config.KyuubiConf
|
||||
import org.apache.kyuubi.operation.JDBCTestUtils
|
||||
|
||||
/**
 * Checks the error reported to a client whose engine submission times out.
 *
 * Every connection is configured to submit to the YARN queue `two_cores_queue` with a
 * connection-level engine and a 60 second engine init timeout. The test keeps one connection
 * open while a second one is attempted, and asserts on the message of the resulting exception.
 */
class EngineSubmitSuite extends WithKyuubiServerWithMiniYarnService with JDBCTestUtils {
  override protected def jdbcUrl: String = getJdbcUrl
  override protected val kyuubiServerConf: KyuubiConf = KyuubiConf()
  override protected val connectionConf: Map[String, String] = {
    Map("spark.yarn.queue" -> "two_cores_queue",
      "spark.master" -> "yarn",
      "spark.submit.deployMode" -> "client",
      "spark.executor.instances" -> "1",
      "spark.driver.cores" -> "1",
      "spark.executor.cores" -> "1",
      KyuubiConf.ENGINE_SHARE_LEVEL.key -> "connection",
      KyuubiConf.ENGINE_INIT_TIMEOUT.key -> "60000")
  }

  test("submit spark app timeout with accepted status") {
    @volatile var appIsRunning = false
    // Failure raised on the worker thread. An exception thrown on a thread other than the one
    // running the test does not fail the test by itself, so it is recorded here and rethrown
    // on the test thread below.
    @volatile var workerFailure: Option[Throwable] = None
    val lock = new CountDownLatch(1)

    val worker = new Thread(() => {
      try {
        while (!appIsRunning) { Thread.sleep(100) }
        withJdbcStatement() { statement =>
          val exception = intercept[KyuubiSQLException] {
            statement.execute("select 1")
          }

          assert(exception.getMessage.contains("Failed to detect the root cause"))
          assert(exception.getMessage.contains("The last line log"))
          assert(exception.getMessage.contains("state: ACCEPTED"))
        }
      } catch {
        case scala.util.control.NonFatal(e) => workerFailure = Some(e)
      } finally {
        lock.countDown()
      }
    })
    // Daemon, so that a worker still polling `appIsRunning` (e.g. because the first connection
    // below could not be opened) never keeps the JVM alive.
    worker.setDaemon(true)
    worker.start()

    withJdbcStatement() { statement =>
      appIsRunning = true
      statement.execute("select 1")
      // hold resource so that the queue has no resource for other app
      lock.await()
    }

    // Surface assertion failures (or unexpected errors) from the worker as a test failure.
    workerFailure.foreach(e => throw e)
  }
}
|
||||
@ -47,15 +47,23 @@ class MiniYarnService(name: String) extends AbstractService(name) {
|
||||
"100.0")
|
||||
|
||||
// capacity-scheduler.xml is missing in hadoop-client-minicluster so this is a workaround
|
||||
yarnConfig.set("yarn.scheduler.capacity.root.queues", "default")
|
||||
yarnConfig.set("yarn.scheduler.capacity.root.queues", "default,two_cores_queue")
|
||||
|
||||
yarnConfig.setInt("yarn.scheduler.capacity.root.default.capacity", 100)
|
||||
yarnConfig.setFloat("yarn.scheduler.capacity.root.default.user-limit-factor", 1)
|
||||
yarnConfig.setInt("yarn.scheduler.capacity.root.default.maximum-capacity", 100)
|
||||
yarnConfig.set("yarn.scheduler.capacity.root.default.state", "RUNNING")
|
||||
yarnConfig.set("yarn.scheduler.capacity.root.default.acl_submit_applications", "*")
|
||||
yarnConfig.set("yarn.scheduler.capacity.root.default.acl_administer_queue", "*")
|
||||
yarnConfig.setInt("yarn.scheduler.capacity.node-locality-delay", -1)
|
||||
|
||||
yarnConfig.setInt("yarn.scheduler.capacity.root.two_cores_queue.maximum-capacity", 100)
|
||||
yarnConfig.setInt("yarn.scheduler.capacity.root.two_cores_queue.maximum-applications", 2)
|
||||
yarnConfig.setInt("yarn.scheduler.capacity.root.two_cores_queue.maximum-allocation-vcores", 2)
|
||||
yarnConfig.setFloat("yarn.scheduler.capacity.root.two_cores_queue.user-limit-factor", 1)
|
||||
yarnConfig.set("yarn.scheduler.capacity.root.two_cores_queue.acl_administer_queue", "*")
|
||||
yarnConfig.set("yarn.scheduler.capacity.root.two_cores_queue.acl_administer_queue", "*")
|
||||
|
||||
yarnConfig.setInt("yarn.scheduler.capacity.node-locality-delay", -1)
|
||||
// Set bind host to localhost to avoid java.net.BindException
|
||||
yarnConfig.set("yarn.resourcemanager.bind-host", "localhost")
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user