[KYUUBI #667] Better error msg if yarn queue has no resource during submitting app

<!--
Thanks for sending a pull request!

Here are some tips for you:
  1. If this is your first time, please read our contributor guidelines: https://kyuubi.readthedocs.io/en/latest/community/contributions.html
  2. If the PR is related to an issue in https://github.com/NetEase/kyuubi/issues, add '[KYUUBI #XXXX]' in your PR title, e.g., '[KYUUBI #XXXX] Your PR title ...'.
  3. If the PR is unfinished, add '[WIP]' in your PR title, e.g., '[WIP][KYUUBI #XXXX] Your PR title ...'.
-->

### _Why are the changes needed?_
<!--
Please clarify why the changes are needed. For instance,
  1. If you add a feature, you can talk about the use case of it.
  2. If you fix a bug, you can clarify why it is a bug.
-->
Give a more readable error message when submitting the app times out because the YARN queue has no resources.

before
```
Failed to detect the root cause, please check $engineLog at server side if necessary
```

after
```
The last status of Spark App is ACCEPTED, please check your cluster resource
```

### _How was this patch tested?_
Add new test.

Closes #667 from ulysses-you/submit-app-better-error-msg.

Closes #667

c7b8185 [ulysses-you] improve
317a120 [ulysses-you] fix
b64daac [ulysses-you] client
c907cd6 [ulysses-you] test
7ae84ec [ulysses-you] dump last log
ea35609 [ulysses-you] init

Authored-by: ulysses-you <ulyssesyou18@gmail.com>
Signed-off-by: Kent Yao <yao@apache.org>
This commit is contained in:
ulysses-you 2021-06-08 19:16:18 +08:00 committed by Kent Yao
parent e57a4346e2
commit ee74480737
No known key found for this signature in database
GPG Key ID: F7051850A0AF904D
3 changed files with 83 additions and 5 deletions

View File

@ -63,6 +63,7 @@ trait ProcBuilder {
}
@volatile private var error: Throwable = UNCAUGHT_ERROR
@volatile private var lastRowOfLog: String = "unknown"
// Visible for test
private[kyuubi] var logCaptureThread: Thread = _
@ -123,6 +124,8 @@ trait ProcBuilder {
}
error = KyuubiSQLException(sb.toString() + s"\n See more: $engineLog")
} else if (line != null) {
lastRowOfLog = line
}
line = reader.readLine()
}
@ -150,8 +153,9 @@ trait ProcBuilder {
Thread.sleep(1000)
}
error match {
case UNCAUGHT_ERROR => KyuubiSQLException(
s"Failed to detect the root cause, please check $engineLog at server side if necessary")
case UNCAUGHT_ERROR =>
KyuubiSQLException(s"Failed to detect the root cause, please check $engineLog at server " +
s"side if necessary. The last line log is: $lastRowOfLog")
case other => other
}
}
@ -161,5 +165,4 @@ object ProcBuilder extends Logging {
private val PROC_BUILD_LOGGER = new NamedThreadFactory("process-logger-capture", daemon = true)
private val UNCAUGHT_ERROR = new RuntimeException("Uncaught error")
}

View File

@ -0,0 +1,67 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kyuubi.engine
import java.util.concurrent.CountDownLatch
import org.apache.kyuubi.{KyuubiSQLException, WithKyuubiServerWithMiniYarnService}
import org.apache.kyuubi.config.KyuubiConf
import org.apache.kyuubi.operation.JDBCTestUtils
/**
 * Checks the error message reported when a Spark engine cannot be started in time because the
 * target YARN queue has no resources left.
 *
 * Every connection uses the `two_cores_queue` queue with the `connection` share level and a
 * 60 second engine init timeout (see `connectionConf`).
 */
class EngineSubmitSuite extends WithKyuubiServerWithMiniYarnService with JDBCTestUtils {

  override protected def jdbcUrl: String = getJdbcUrl

  override protected val kyuubiServerConf: KyuubiConf = KyuubiConf()

  override protected val connectionConf: Map[String, String] = {
    Map("spark.yarn.queue" -> "two_cores_queue",
      "spark.master" -> "yarn",
      "spark.submit.deployMode" -> "client",
      "spark.executor.instances" -> "1",
      "spark.driver.cores" -> "1",
      "spark.executor.cores" -> "1",
      KyuubiConf.ENGINE_SHARE_LEVEL.key -> "connection",
      KyuubiConf.ENGINE_INIT_TIMEOUT.key -> "60000")
  }

  test("submit spark app timeout with accepted status") {
    @volatile var appIsRunning = false
    // Failure raised on the background thread, if any. An exception thrown on a thread other
    // than the test thread is not seen by the test framework, so it has to be carried back and
    // rethrown explicitly; otherwise this test would pass even when the assertions fail.
    @volatile var backgroundFailure: Option[Throwable] = None
    val lock = new CountDownLatch(1)

    new Thread(() => {
      try {
        // wait until the first connection is established and is holding the queue
        while (!appIsRunning) { Thread.sleep(100) }
        // NOTE(review): if the engine launch failure is reported while the connection is being
        // opened rather than on `execute`, the intercept below is never reached -- confirm.
        withJdbcStatement() { statement =>
          val exception = intercept[KyuubiSQLException] {
            statement.execute("select 1")
          }
          assert(exception.getMessage.contains("Failed to detect the root cause"))
          assert(exception.getMessage.contains("The last line log"))
          assert(exception.getMessage.contains("state: ACCEPTED"))
        }
      } catch {
        case scala.util.control.NonFatal(e) => backgroundFailure = Some(e)
      } finally {
        // always release the main thread, whether the checks passed or not
        lock.countDown()
      }
    }).start()

    withJdbcStatement() { statement =>
      appIsRunning = true
      statement.execute("select 1")
      // hold resource so that the queue has no resource for other app
      lock.await()
    }

    // surface whatever went wrong on the background thread as a failure of this test
    backgroundFailure.foreach(e => throw e)
  }
}

View File

@ -47,15 +47,23 @@ class MiniYarnService(name: String) extends AbstractService(name) {
"100.0")
// capacity-scheduler.xml is missing in hadoop-client-minicluster so this is a workaround
yarnConfig.set("yarn.scheduler.capacity.root.queues", "default")
yarnConfig.set("yarn.scheduler.capacity.root.queues", "default,two_cores_queue")
yarnConfig.setInt("yarn.scheduler.capacity.root.default.capacity", 100)
yarnConfig.setFloat("yarn.scheduler.capacity.root.default.user-limit-factor", 1)
yarnConfig.setInt("yarn.scheduler.capacity.root.default.maximum-capacity", 100)
yarnConfig.set("yarn.scheduler.capacity.root.default.state", "RUNNING")
yarnConfig.set("yarn.scheduler.capacity.root.default.acl_submit_applications", "*")
yarnConfig.set("yarn.scheduler.capacity.root.default.acl_administer_queue", "*")
yarnConfig.setInt("yarn.scheduler.capacity.node-locality-delay", -1)
yarnConfig.setInt("yarn.scheduler.capacity.root.two_cores_queue.maximum-capacity", 100)
yarnConfig.setInt("yarn.scheduler.capacity.root.two_cores_queue.maximum-applications", 2)
yarnConfig.setInt("yarn.scheduler.capacity.root.two_cores_queue.maximum-allocation-vcores", 2)
yarnConfig.setFloat("yarn.scheduler.capacity.root.two_cores_queue.user-limit-factor", 1)
yarnConfig.set("yarn.scheduler.capacity.root.two_cores_queue.acl_administer_queue", "*")
yarnConfig.set("yarn.scheduler.capacity.root.two_cores_queue.acl_administer_queue", "*")
yarnConfig.setInt("yarn.scheduler.capacity.node-locality-delay", -1)
// Set bind host to localhost to avoid java.net.BindException
yarnConfig.set("yarn.resourcemanager.bind-host", "localhost")