[KYUUBI #3828] [PySpark] Support Python style check with spotless in CI style workflow and reformat tool

### _Why are the changes needed?_

to close #3828.

Add Python code style checking support:
1. reuse Spotless maven plugin for Python style check
2. add style check to CI style workflow
3. add Python linting support to `dev/reformat`, which first checks whether `black` is installed in PATH and skips Python linting otherwise

### _How was this patch tested?_
- [ ] Add some test cases that check the changes thoroughly including negative and positive cases if possible

- [ ] Add screenshots for manual tests if appropriate

- [x] [Run test](https://kyuubi.apache.org/docs/latest/develop_tools/testing.html#running-tests) locally before making a pull request

Closes #3823 from bowenliang123/spotless-python.

Closes #3828

4a4de885 [liangbowen] simplify empty tags
0bb9ec7c [liangbowen] simplify empty tag in pom
9dd39531 [liangbowen] lint python code with black via spotless
f85020fa [liangbowen] typo
4c93bce0 [liangbowen] install python 3.9 first
23fc4b96 [liangbowen] ci install black version from added `spotless.python.black.version` property
73f746b0 [Bowen Liang] Update dev/reformat
46667a00 [liangbowen] update style.yml
9c20b434 [liangbowen] update style.yml
21017e5e [liangbowen] update style.yml
8272c0bc [liangbowen] add python style to style checking for CI
e102726c [liangbowen] add profile spotless in dev/reformat if black found in path
062e9bf2 [liangbowen] add python scan for spotless. add new profile `spotless-python` for python file path.

Lead-authored-by: liangbowen <liangbowen@gf.com.cn>
Co-authored-by: Bowen Liang <bowen.liang.123@gmail.com>
Signed-off-by: Cheng Pan <chengpan@apache.org>
This commit is contained in:
liangbowen 2022-11-22 19:44:02 +08:00 committed by Cheng Pan
parent d0f3029908
commit 00f52a2cb2
No known key found for this signature in database
GPG Key ID: 8001952629BCC75D
5 changed files with 90 additions and 46 deletions

View File

@ -45,6 +45,11 @@ jobs:
java-version: 8
cache: 'maven'
check-latest: false
- name: Setup Python 3
uses: actions/setup-python@v4
with:
python-version: '3.9'
cache: 'pip'
- name: Check kyuubi modules avaliable
id: modules-check
run: build/mvn dependency:resolve -DincludeGroupIds="org.apache.kyuubi" -DincludeScope="compile" -DexcludeTransitive=true ${{ matrix.profiles }}
@ -72,8 +77,11 @@ jobs:
cat $log;
fi
done
- name: JavaStyle with maven
run: build/mvn spotless:check ${{ matrix.profiles }}
- name: Spotless style check
run: |
SPOTLESS_BLACK_VERSION=$(build/mvn help:evaluate -Dexpression=spotless.python.black.version -q -DforceStdout)
pip install black==$SPOTLESS_BLACK_VERSION
build/mvn spotless:check ${{ matrix.profiles }} -Pspotless-python
- name: setup npm
uses: actions/setup-node@v3
with:

View File

@ -22,4 +22,12 @@ KYUUBI_HOME="$(cd "`dirname "$0"`/.."; pwd)"
PROFILES="-Pflink-provided,hive-provided,spark-provided,spark-block-cleaner,spark-3.3,spark-3.2,spark-3.1,tpcds"
# python style checks rely on `black` in path
if ! command -v black &> /dev/null
then
echo "Skip Python lint since 'black' is not available."
else
PROFILES="${PROFILES},spotless-python"
fi
${KYUUBI_HOME}/build/mvn spotless:apply $PROFILES

View File

@ -26,7 +26,7 @@ import traceback
from glob import glob
if sys.version_info[0] < 3:
sys.exit('Python < 3 is unsupported.')
sys.exit("Python < 3 is unsupported.")
spark_home = os.environ.get("SPARK_HOME", "")
os.environ["PYSPARK_PYTHON"] = os.environ.get("PYSPARK_PYTHON", sys.executable)
@ -68,7 +68,7 @@ global_dict = {}
class NormalNode(object):
def __init__(self, code):
self.code = compile(code, '<stdin>', 'exec', ast.PyCF_ONLY_AST, 1)
self.code = compile(code, "<stdin>", "exec", ast.PyCF_ONLY_AST, 1)
def execute(self):
to_run_exec, to_run_single = self.code.body[:-1], self.code.body[-1:]
@ -76,12 +76,12 @@ class NormalNode(object):
try:
for node in to_run_exec:
mod = Module([node], [])
code = compile(mod, '<stdin>', 'exec')
code = compile(mod, "<stdin>", "exec")
exec(code, global_dict)
for node in to_run_single:
mod = ast.Interactive([node])
code = compile(mod, '<stdin>', 'single')
code = compile(mod, "<stdin>", "single")
exec(code, global_dict)
except Exception:
# We don't need to log the exception because we're just executing user
@ -115,10 +115,10 @@ def parse_code_into_nodes(code):
except SyntaxError:
normal = []
chunks = []
for i, line in enumerate(code.rstrip().split('\n')):
if line.startswith('%'):
for i, line in enumerate(code.rstrip().split("\n")):
if line.startswith("%"):
if normal:
chunks.append('\n'.join(normal))
chunks.append("\n".join(normal))
normal = []
chunks.append(line)
@ -126,7 +126,7 @@ def parse_code_into_nodes(code):
normal.append(line)
if normal:
chunks.append('\n'.join(normal))
chunks.append("\n".join(normal))
# Convert the chunks into AST nodes. Let exceptions propagate.
for chunk in chunks:
@ -141,46 +141,55 @@ def parse_code_into_nodes(code):
def execute_reply(status, content):
msg = {
'msg_type': 'execute_reply',
'content': dict(
"msg_type": "execute_reply",
"content": dict(
content,
status=status,
)
),
}
return json.dumps(msg)
def execute_reply_ok(data):
return execute_reply("ok", {
"data": data,
})
return execute_reply(
"ok",
{
"data": data,
},
)
def execute_reply_error(exc_type, exc_value, tb):
formatted_tb = traceback.format_exception(exc_type, exc_value, tb, chain=False)
for i in range(len(formatted_tb)):
if TOP_FRAME_REGEX.match(formatted_tb[i]):
formatted_tb = formatted_tb[:1] + formatted_tb[i + 1:]
formatted_tb = formatted_tb[:1] + formatted_tb[i + 1 :]
break
return execute_reply('error', {
'ename': str(exc_type.__name__),
'evalue': str(exc_value),
'traceback': formatted_tb,
})
return execute_reply(
"error",
{
"ename": str(exc_type.__name__),
"evalue": str(exc_value),
"traceback": formatted_tb,
},
)
def execute_reply_internal_error(message, exc_info=None):
return execute_reply('error', {
'ename': 'InternalError',
'evalue': message,
'traceback': [],
})
return execute_reply(
"error",
{
"ename": "InternalError",
"evalue": message,
"traceback": [],
},
)
def execute_request(content):
try:
code = content['code']
code = content["code"]
except KeyError:
return execute_reply_internal_error(
'Malformed message: content object missing "code"', sys.exc_info()
@ -208,7 +217,7 @@ def execute_request(content):
clearOutputs()
output = result.pop('text/plain', '')
output = result.pop("text/plain", "")
if stdout:
output += stdout
@ -220,14 +229,14 @@ def execute_request(content):
# Only add the output if it exists, or if there are no other mimetypes in the result.
if output or not result:
result['text/plain'] = output.rstrip()
result["text/plain"] = output.rstrip()
return execute_reply_ok(result)
# get or create spark session
spark_session = kyuubi_util.get_spark_session()
global_dict['spark'] = spark_session
global_dict["spark"] = spark_session
def main():
@ -248,9 +257,9 @@ def main():
while True:
line = sys_stdin.readline()
if line == '':
if line == "":
break
elif line == '\n':
elif line == "\n":
continue
try:
@ -258,7 +267,7 @@ def main():
except ValueError:
continue
if content['cmd'] == 'exit_worker':
if content["cmd"] == "exit_worker":
break
result = execute_request(content)
@ -272,5 +281,5 @@ def main():
sys.stderr = sys_stderr
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@ -34,18 +34,16 @@ def connect_to_exist_gateway() -> "JavaGateway":
if os.environ.get("PYSPARK_PIN_THREAD", "true").lower() == "true":
gateway = ClientServer(
java_parameters=JavaParameters(
port=gateway_port,
auth_token=gateway_secret,
auto_convert=True),
python_parameters=PythonParameters(
port=0,
eager_load=False))
port=gateway_port, auth_token=gateway_secret, auto_convert=True
),
python_parameters=PythonParameters(port=0, eager_load=False),
)
else:
gateway = JavaGateway(
gateway_parameters=GatewayParameters(
port=gateway_port,
auth_token=gateway_secret,
auto_convert=True))
port=gateway_port, auth_token=gateway_secret, auto_convert=True
)
)
# gateway.proc = proc
# Import the classes used by PySpark
@ -67,12 +65,14 @@ def _get_exist_spark_context(self, jconf):
"""
Initialize SparkContext in function to allow subclass specific initialization
"""
return self._jvm.JavaSparkContext(self._jvm.org.apache.spark.SparkContext.getOrCreate(jconf))
return self._jvm.JavaSparkContext(
self._jvm.org.apache.spark.SparkContext.getOrCreate(jconf)
)
def get_spark_session() -> "SparkSession":
SparkContext._initialize_context = _get_exist_spark_context
gateway = connect_to_exist_gateway()
SparkContext._ensure_initialized(gateway=gateway)
spark = SparkSession.builder.master('local').appName('test').getOrCreate()
spark = SparkSession.builder.master("local").appName("test").getOrCreate()
return spark

19
pom.xml
View File

@ -223,6 +223,10 @@
<!-- Package to use when relocating shaded classes. -->
<kyuubi.shade.packageName>org.apache.kyuubi.shade</kyuubi.shade.packageName>
<!-- Needed for Spotless style check-->
<spotless.python.includes/>
<spotless.python.black.version>22.3.0</spotless.python.black.version>
<distMgmtReleaseId>apache.releases.https</distMgmtReleaseId>
<distMgmtReleaseName>Apache Release Distribution Repository</distMgmtReleaseName>
<distMgmtReleaseUrl>https://repository.apache.org/service/local/staging/deploy/maven2</distMgmtReleaseUrl>
@ -2035,6 +2039,14 @@
<file>${maven.multiModuleProjectDirectory}/.scalafmt.conf</file>
</scalafmt>
</scala>
<python>
<includes>
<include>${spotless.python.includes}</include>
</includes>
<black>
<version>${spotless.python.black.version}</version>
</black>
</python>
</configuration>
<executions>
<execution>
@ -2300,6 +2312,13 @@
</modules>
</profile>
<profile>
<id>spotless-python</id>
<properties>
<spotless.python.includes>src/**/*.py</spotless.python.includes>
</properties>
</profile>
<profile>
<id>apache-release</id>
<build>