From d46fb077ab8cbfc3bb9ad9f90e68ddd06a9bd60e Mon Sep 17 00:00:00 2001 From: "Kent Yao (from Travis CI)" Date: Tue, 22 Dec 2020 03:11:08 +0000 Subject: [PATCH] Deploy yaooqinn/kyuubi to github.com/yaooqinn/kyuubi.git:gh-pages --- README.md | 114 +++--------------- docs/quick_start/quick_start_with_jupyter.md | 8 ++ .../kyuubi/engine/spark/SparkSQLEngine.scala | 28 +---- .../operation/SparkSQLOperationManager.scala | 2 +- .../session/SparkSQLSessionManager.scala | 27 ++++- .../service/KinitAuxiliaryService.scala | 3 +- .../kyuubi/session/SessionManager.scala | 1 + .../kyuubi/util/KyuubiHadoopUtils.scala | 5 + .../kyuubi/ha/client/ServiceDiscovery.scala | 6 +- .../kyuubi/session/KyuubiSessionImpl.scala | 4 +- 10 files changed, 67 insertions(+), 131 deletions(-) create mode 100644 docs/quick_start/quick_start_with_jupyter.md diff --git a/README.md b/README.md index 28b2e068d..fc4bfdcad 100644 --- a/README.md +++ b/README.md @@ -7,105 +7,25 @@ [![HitCount](http://hits.dwyl.io/yaooqinn/kyuubi.svg)](http://hits.dwyl.io/yaooqinn/kyuubi) [![DepShield Badge](https://depshield.sonatype.org/badges/yaooqinn/kyuubi/depshield.svg)](https://depshield.github.io) [![Documentation Status](https://readthedocs.org/projects/kyuubi/badge/?version=latest)](https://kyuubi.readthedocs.io/en/latest/?badge=latest) -[![Gitter](https://badges.gitter.im/kyuubi-on-spark/Lobby.svg)](https://gitter.im/kyuubi-on-spark/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) - +Kyuubi is a unified multi-tenant JDBC interface for large-scale data processing and analytics, built on top of [Apache Spark](http://spark.apache.org). +The project took its name from a character of a popular Japanese manga - `Naruto`. +The character is named `Kyuubi Kitsune/Kurama`, which is a nine-tailed fox in mythology. +`Kyuubi` spread the power and spirit of fire, which is used here to represent the powerful [Apache Spark](http://spark.apache.org). 
+Its nine tails stand for end-to-end multi-tenancy support of this project. -Kyuubi is an enhanced edition of the [Apache Spark](http://spark.apache.org)'s primordial - [Thrift JDBC/ODBC Server](http://spark.apache.org/docs/latest/sql-programming-guide.html#running-the-thrift-jdbcodbc-server). It is mainly designed for directly running SQL towards a cluster with all components including HDFS, YARN, Hive MetaStore, and itself secured. Kyuubi is a Spark SQL thrift service with end-to-end multi tenant guaranteed. Please go to [Kyuubi Architecture](https://yaooqinn.github.io/kyuubi/docs/architecture.html) to learn more if you are interested. +Ready? [Getting Started](https://kyuubi.readthedocs.io/en/latest/quick_start/quick_start.html) with Kyuubi. -Basically, the Thrift JDBC/ODBC Server as a similar ad-hoc SQL query service of [Apache Hive](https://hive.apache.org)'s [HiveServer2](https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Overview) for Spark SQL, acts as a distributed query engine using its JDBC/ODBC or command-line interface. -In this mode, end-users or applications can interact with Spark SQL directly to run SQL queries, without the need to write any code. We can make pretty business reports with massive data using some BI tools which supported JDBC/ODBC connections, such as [Tableau](https://www.tableau.com), [NetEase YouData](https://youdata.163.com) and so on. Profiting from Apache Spark's capability, we can archive much more performance improvement than Apache Hive as a SQL on Hadoop service. +## Contributing -But unfortunately, due to the limitations of Spark's own architecture,to be used as an enterprise-class product, there are a number of problems compared with HiveServer2,such as multi-tenant isolation, authentication/authorization, high concurrency, high availability, and so on. And the Apache Spark community's support for this module has been in a state of prolonged stagnation. +All bits of help are welcome. 
You can make various types of contributions to Kyuubi, including the following but not limited to, -Kyuubi has enhanced the Thrift JDBC/ODBC Server in some ways for solving these existing problems, as shown in the following table. - -Features|Spark Thrift Server|Kyuubi|Comments - ---|---|---|--- - multiple SparkContext | ✘ | ✔ | [User tagged SparkContext](https://yaooqinn.github.io/kyuubi/docs/architecture.html#1.2.2) - lazy SparkContext| ✘ | ✔ |[Session level SparkContext](https://yaooqinn.github.io/kyuubi/docs/architecture.html#1.2.1) - SparkContext cache| ✘ | ✔ | [SparkContext Cache Management](https://yaooqinn.github.io/kyuubi/docs/architecture.html#1.2.2) - dynamic queue | ✘ | ✔ | Kyuubi identifies `spark.yarn.queue` in the connection string.| - session level configurations|`spark.sql.*`| ✔ |[Dynamic Resource Requesting](https://yaooqinn.github.io/kyuubi/docs/architecture.html#1.2.1) - authentication| ✔ | ✔ |[Authentication/Security Guide](https://yaooqinn.github.io/kyuubi/docs/authentication.html) | - authorization| ✘ | ✔ |[Kyuubi ACL Management Guide](https://yaooqinn.github.io/kyuubi/docs/authorization.html)| - impersonation| ✘ | ✔ |Kyuubi fully supports `hive.server2.proxy.user` and `hive.server2.doAs`| - multi tenancy| ✘ | ✔ |Based on the above features,Kyuubi is able to run as a multi-tenant server on a LCE supported Yarn cluster.| - operation log| ✘ | ✔ |Kyuubi redirect sql operation log to local file which has an interface for the client to fetch.| - high availability| ✘ | ✔ |[ZooKeeper Dynamic Service Discovery](https://yaooqinn.github.io/kyuubi/docs/architecture.html#1.4) | - containerization| ✘ | ✔ | [Kyuubi Containerization Guide](https://yaooqinn.github.io/kyuubi/docs/containerization.html)| - type mapping| ✘ | ✔ |Kyuubi support Spark result/schema to be directly converted to Thrift result/schemas bypassing Hive format results| - -## Getting Started - -### Packaging - -Please refer to the [Building 
Kyuubi](https://yaooqinn.github.io/kyuubi/docs/building.html) in the online documentation for an overview on how to build Kyuubi. - -### Start Kyuubi - -We can start Kyuubi with the built-in startup script `bin/start-kyuubi.sh`. -First of all, export `SPARK_HOME` in `$KYUUBI_HOME/bin/kyuubi-env.sh` - -```bash -export SPARK_HOME=/the/path/to/a/runable/spark/binary/dir -``` - -And then the last, start Kyuubi with `bin/start-kyuubi.sh` -```bash -$ bin/start-kyuubi.sh \ - --master yarn \ - --deploy-mode client \ - --driver-memory 10g \ - --conf spark.kyuubi.frontend.bind.port=10009 -``` - -### Run Spark SQL on Kyuubi - -Now you can use [beeline](https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients), [Tableau](https://www.tableau.com/zh-cn) or Thrift API based programs to connect to Kyuubi server. - -### Stop Kyuubi - -```bash -bin/stop-kyuubi.sh -``` - -## Multi Tenancy Support - -### Prerequisites - -Kyuubi may work well with different deployments such as non-secured Yarn, Standalone, Mesos or even local mode, but it is mainly designed for a secured HDFS/Yarn Cluster on which Kyuubi will play well with multi tenant and secure features. - -Suppose that you already have a secured HDFS cluster for deploying Spark, Hive or other applications. - -#### Configure Yarn - -- YARN Secure Containers - + To configure the NodeManager to use the [LinuxExecutorCantainer](https://hadoop.apache.org/docs/r2.7.2/hadoop-yarn/hadoop-yarn-site/SecureContainer.html) - + Queues(Optional), please refer to [Capacity Scheduler](https://hadoop.apache.org/docs/r2.7.2/hadoop-yarn/hadoop-yarn-site/CapacityScheduler.html) or [Fair Scheduler](https://hadoop.apache.org/docs/r2.7.2/hadoop-yarn/hadoop-yarn-site/FairScheduler.html) to see more. 
- -#### Spark on Yarn -- Setup for [Spark On Yarn](http://spark.apache.org/docs/latest/running-on-yarn.html) Ensure that `HADOOP_CONF_DIR` or `YARN_CONF_DIR` points to the directory which contains the (client side) configuration files for the Hadoop cluster. - -#### Configure Hive - -- Configuration of Hive is done by placing your `hive-site.xml`, `core-site.xml` and `hdfs-site.xml` files in `$SPARK_HOME/conf`. - -## Configuration - -Please refer to the [Configuration Guide](https://yaooqinn.github.io/kyuubi/docs/configurations.html) in the online documentation for an overview on how to configure Kyuubi. - -## Authentication - -Please refer to the [Authentication/Security Guide](https://yaooqinn.github.io/kyuubi/docs/authentication.html) in the online documentation for an overview on how to enable security for Kyuubi. - -## Additional Documentations -[Building Kyuubi](https://yaooqinn.github.io/kyuubi/docs/building.html) -[Kyuubi Deployment Guide](https://yaooqinn.github.io/kyuubi/docs/deploy.html) -[Kyuubi Containerization Guide](https://yaooqinn.github.io/kyuubi/docs/containerization.html) -[High Availability Guide](https://yaooqinn.github.io/kyuubi/docs/high_availability_guide.html) -[Configuration Guide](https://yaooqinn.github.io/kyuubi/docs/configurations.html) -[Authentication/Security Guide](https://yaooqinn.github.io/kyuubi/docs/authentication.html) -[Kyuubi ACL Management Guide](https://yaooqinn.github.io/kyuubi/docs/authorization.html) -[Kyuubi Architecture](https://yaooqinn.github.io/kyuubi/docs/architecture.html) +- Help new users in chat channel or share your success stories w/ us - [![Gitter](https://badges.gitter.im/kyuubi-on-spark/Lobby.svg)](https://gitter.im/kyuubi-on-spark/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) +- Improve Documentation - [![Documentation Status](https://readthedocs.org/projects/kyuubi/badge/?version=latest)](https://kyuubi.readthedocs.io/en/latest/?badge=latest) +- Test releases - [![GitHub 
release](https://img.shields.io/github/release/yaooqinn/kyuubi.svg)](https://github.com/yaooqinn/kyuubi/releases) +- Improve test coverage - [![codecov](https://codecov.io/gh/yaooqinn/kyuubi/branch/master/graph/badge.svg)](https://codecov.io/gh/yaooqinn/kyuubi) +- Report bugs and better help developers to reproduce +- Review changes +- Make a pull request +- Promote to others +- Click the star button if you like this project diff --git a/docs/quick_start/quick_start_with_jupyter.md b/docs/quick_start/quick_start_with_jupyter.md new file mode 100644 index 000000000..1588a2d6a --- /dev/null +++ b/docs/quick_start/quick_start_with_jupyter.md @@ -0,0 +1,8 @@ +
+ +![](../imgs/kyuubi_logo_simple.png) + +
+ +# Getting Started With Hive Jupyter Lab + diff --git a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/SparkSQLEngine.scala b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/SparkSQLEngine.scala index 6e0082d9b..0bb583a30 100644 --- a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/SparkSQLEngine.scala +++ b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/SparkSQLEngine.scala @@ -18,52 +18,26 @@ package org.apache.kyuubi.engine.spark import java.time.Instant -import java.util.concurrent.TimeUnit import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.apache.kyuubi.{Logging, Utils} import org.apache.kyuubi.config.KyuubiConf -import org.apache.kyuubi.engine.spark.session.SparkSQLSessionManager import org.apache.kyuubi.ha.HighAvailabilityConf._ import org.apache.kyuubi.ha.client.{RetryPolicies, ServiceDiscovery} import org.apache.kyuubi.service.Serverable -import org.apache.kyuubi.util.{SignalRegister, ThreadUtils} +import org.apache.kyuubi.util.SignalRegister private[spark] final class SparkSQLEngine(name: String, spark: SparkSession) extends Serverable(name) { def this(spark: SparkSession) = this(classOf[SparkSQLEngine].getSimpleName, spark) - private val timeoutChecker = - ThreadUtils.newDaemonSingleThreadScheduledExecutor(s"$name-timeout-checker") - override private[kyuubi] val backendService = new SparkSQLBackendService(spark) override protected def stopServer(): Unit = { spark.stop() - timeoutChecker.shutdown() - timeoutChecker.awaitTermination(10, TimeUnit.SECONDS) - } - - override def start(): Unit = { - val interval = conf.get(KyuubiConf.ENGINE_CHECK_INTERVAL) - val idleTimeout = conf.get(KyuubiConf.ENGINE_IDLE_TIMEOUT) - - val checkTask = new Runnable { - override def run(): Unit = { - val current = System.currentTimeMillis - val sessionManager = 
backendService.sessionManager.asInstanceOf[SparkSQLSessionManager] - if (sessionManager.getOpenSessionCount <= 0 && - (current - sessionManager.latestLogoutTime) > idleTimeout) { - info(s"Idled for more than $idleTimeout, terminating") - sys.exit(0) - } - } - } - timeoutChecker.scheduleWithFixedDelay(checkTask, interval, interval, TimeUnit.MILLISECONDS) - super.start() } } diff --git a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/SparkSQLOperationManager.scala b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/SparkSQLOperationManager.scala index 0d0ae06cf..7169b71e0 100644 --- a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/SparkSQLOperationManager.scala +++ b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/operation/SparkSQLOperationManager.scala @@ -43,7 +43,7 @@ class SparkSQLOperationManager private (name: String) extends OperationManager(n sessionToSpark.put(sessionHandle, spark) } - def removeSparkSession(sessionHandle: SessionHandle): Unit = { + def removeSparkSession(sessionHandle: SessionHandle): SparkSession = { sessionToSpark.remove(sessionHandle) } diff --git a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/session/SparkSQLSessionManager.scala b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/session/SparkSQLSessionManager.scala index ccc55e462..a77691079 100644 --- a/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/session/SparkSQLSessionManager.scala +++ b/externals/kyuubi-spark-sql-engine/src/main/scala/org/apache/kyuubi/engine/spark/session/SparkSQLSessionManager.scala @@ -17,10 +17,13 @@ package org.apache.kyuubi.engine.spark.session +import java.util.concurrent.TimeUnit + import org.apache.hive.service.rpc.thrift.TProtocolVersion import org.apache.spark.sql.SparkSession import 
org.apache.kyuubi.KyuubiSQLException +import org.apache.kyuubi.config.KyuubiConf import org.apache.kyuubi.engine.spark.operation.SparkSQLOperationManager import org.apache.kyuubi.session._ @@ -39,7 +42,7 @@ class SparkSQLSessionManager private (name: String, spark: SparkSession) val operationManager = new SparkSQLOperationManager() - @volatile private var _latestLogoutTime: Long = Long.MaxValue + @volatile private var _latestLogoutTime: Long = System.currentTimeMillis() def latestLogoutTime: Long = _latestLogoutTime override def openSession( @@ -87,4 +90,26 @@ class SparkSQLSessionManager private (name: String, spark: SparkSession) } override protected def isServer: Boolean = false + + override def start(): Unit = { + startTimeoutChecker() + super.start() + } + + private def startTimeoutChecker(): Unit = { + val interval = conf.get(KyuubiConf.ENGINE_CHECK_INTERVAL) + val idleTimeout = conf.get(KyuubiConf.ENGINE_IDLE_TIMEOUT) + val checkTask = new Runnable { + override def run(): Unit = { + while (getOpenSessionCount > 0 || + System.currentTimeMillis - latestLogoutTime < idleTimeout) { + TimeUnit.MILLISECONDS.sleep(interval) + } + info(s"Idled for more than $idleTimeout ms, terminating") + sys.exit(0) + } + } + submitBackgroundOperation(checkTask) + } + } diff --git a/kyuubi-common/src/main/scala/org/apache/kyuubi/service/KinitAuxiliaryService.scala b/kyuubi-common/src/main/scala/org/apache/kyuubi/service/KinitAuxiliaryService.scala index 27112159d..bc0446b5f 100644 --- a/kyuubi-common/src/main/scala/org/apache/kyuubi/service/KinitAuxiliaryService.scala +++ b/kyuubi-common/src/main/scala/org/apache/kyuubi/service/KinitAuxiliaryService.scala @@ -22,7 +22,7 @@ import java.util.concurrent.TimeUnit import org.apache.hadoop.security.UserGroupInformation import org.apache.kyuubi.config.KyuubiConf -import org.apache.kyuubi.util.ThreadUtils +import org.apache.kyuubi.util.{KyuubiHadoopUtils, ThreadUtils} class KinitAuxiliaryService() extends 
AbstractService("KinitAuxiliaryService") { @@ -38,6 +38,7 @@ class KinitAuxiliaryService() extends AbstractService("KinitAuxiliaryService") { if (UserGroupInformation.isSecurityEnabled) { val keytab = conf.get(KyuubiConf.SERVER_KEYTAB) val principal = conf.get(KyuubiConf.SERVER_PRINCIPAL) + .map(KyuubiHadoopUtils.getServerPrincipal) kinitInterval = conf.get(KyuubiConf.KINIT_INTERVAL) kinitMaxAttempts = conf.get(KyuubiConf.KINIT_MAX_ATTEMPTS) diff --git a/kyuubi-common/src/main/scala/org/apache/kyuubi/session/SessionManager.scala b/kyuubi-common/src/main/scala/org/apache/kyuubi/session/SessionManager.scala index 8f6dea6e6..23dc3dbe2 100644 --- a/kyuubi-common/src/main/scala/org/apache/kyuubi/session/SessionManager.scala +++ b/kyuubi-common/src/main/scala/org/apache/kyuubi/session/SessionManager.scala @@ -65,6 +65,7 @@ abstract class SessionManager(name: String) extends CompositeService(name) { if (session == null) { throw KyuubiSQLException(s"Invalid $sessionHandle") } + info(s"$sessionHandle is closed, current opening sessions $getOpenSessionCount") session.close() } diff --git a/kyuubi-common/src/main/scala/org/apache/kyuubi/util/KyuubiHadoopUtils.scala b/kyuubi-common/src/main/scala/org/apache/kyuubi/util/KyuubiHadoopUtils.scala index c103eb9ed..e6a009b93 100644 --- a/kyuubi-common/src/main/scala/org/apache/kyuubi/util/KyuubiHadoopUtils.scala +++ b/kyuubi-common/src/main/scala/org/apache/kyuubi/util/KyuubiHadoopUtils.scala @@ -18,6 +18,7 @@ package org.apache.kyuubi.util import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.security.SecurityUtil import org.apache.kyuubi.config.KyuubiConf @@ -28,4 +29,8 @@ object KyuubiHadoopUtils { conf.getAll.foreach { case (k, v) => hadoopConf.set(k, v) } hadoopConf } + + def getServerPrincipal(principal: String): String = { + SecurityUtil.getServerPrincipal(principal, "0.0.0.0") + } } diff --git a/kyuubi-ha/src/main/scala/org/apache/kyuubi/ha/client/ServiceDiscovery.scala 
b/kyuubi-ha/src/main/scala/org/apache/kyuubi/ha/client/ServiceDiscovery.scala index 8d146158e..34948cb4c 100644 --- a/kyuubi-ha/src/main/scala/org/apache/kyuubi/ha/client/ServiceDiscovery.scala +++ b/kyuubi-ha/src/main/scala/org/apache/kyuubi/ha/client/ServiceDiscovery.scala @@ -28,7 +28,7 @@ import org.apache.curator.framework.recipes.nodes.PersistentEphemeralNode import org.apache.curator.framework.state.{ConnectionState, ConnectionStateListener} import org.apache.curator.framework.state.ConnectionState.{CONNECTED, LOST, RECONNECTED} import org.apache.curator.retry._ -import org.apache.hadoop.security.{SecurityUtil, UserGroupInformation} +import org.apache.hadoop.security.UserGroupInformation import org.apache.hadoop.security.token.delegation.ZKDelegationTokenSecretManager.JaasConfiguration import org.apache.zookeeper.{KeeperException, WatchedEvent, Watcher} import org.apache.zookeeper.CreateMode.PERSISTENT @@ -39,7 +39,7 @@ import org.apache.kyuubi.config.KyuubiConf import org.apache.kyuubi.ha.HighAvailabilityConf._ import org.apache.kyuubi.ha.client.ServiceDiscovery._ import org.apache.kyuubi.service.{AbstractService, Serverable} -import org.apache.kyuubi.util.ThreadUtils +import org.apache.kyuubi.util.{KyuubiHadoopUtils, ThreadUtils} /** * A service for service discovery @@ -234,7 +234,7 @@ object ServiceDiscovery { } System.setProperty("zookeeper.sasl.clientconfig", "KyuubiZooKeeperClient") var principal = maybePrincipal.get - principal = SecurityUtil.getServerPrincipal(principal, "0.0.0.0") + principal = KyuubiHadoopUtils.getServerPrincipal(principal) val jaasConf = new JaasConfiguration("KyuubiZooKeeperClient", principal, keyTabFile.get) Configuration.setConfiguration(jaasConf) } diff --git a/kyuubi-main/src/main/scala/org/apache/kyuubi/session/KyuubiSessionImpl.scala b/kyuubi-main/src/main/scala/org/apache/kyuubi/session/KyuubiSessionImpl.scala index 8cdbbfa88..5df94b109 100644 --- 
a/kyuubi-main/src/main/scala/org/apache/kyuubi/session/KyuubiSessionImpl.scala +++ b/kyuubi-main/src/main/scala/org/apache/kyuubi/session/KyuubiSessionImpl.scala @@ -68,7 +68,9 @@ class KyuubiSessionImpl( private def getServerHost: Option[(String, Int)] = { try { val hosts = zkClient.getChildren.forPath(zkPath) - hosts.asScala.headOption.map { p => + // TODO: use last one because to avoid touching some maybe-crashed engines + // We need a big improvement here. + hosts.asScala.lastOption.map { p => val path = ZKPaths.makePath(null, zkNamespace, p) val hostPort = new String(zkClient.getData.forPath(path), StandardCharsets.UTF_8) val strings = hostPort.split(":")