Skip to content

Commit d588ec1

Browse files
committed
Upgrade Spark to 3.3.1, reduce image file size
1 parent 21c7d05 commit d588ec1

5 files changed

Lines changed: 33 additions & 18 deletions

File tree

infra/docker/Dockerfile

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,35 @@
1-
FROM openjdk:11-jdk-slim
1+
FROM amazoncorretto:11 as corretto-jdk
2+
RUN $JAVA_HOME/bin/jlink \
3+
--verbose \
4+
--add-modules ALL-MODULE-PATH \
5+
--strip-debug \
6+
--no-man-pages \
7+
--no-header-files \
8+
--compress=2 \
9+
--output /opt/jre
10+
11+
FROM debian:stable-slim
212
LABEL maintainer="Luis Belloch <docker@luisbelloch.es>"
13+
ENV JAVA_HOME=/opt/jre
14+
ENV PATH="${JAVA_HOME}/bin:${PATH}"
15+
COPY --from=corretto-jdk /opt/jre $JAVA_HOME
316

417
ENV DEBIAN_FRONTEND=noninteractive
518
RUN apt-get update && \
6-
apt-get install -y --no-install-recommends python3-software-properties python3-numpy curl && \
19+
apt-get install -y --no-install-recommends ca-certificates procps python3-software-properties python3-numpy curl && \
720
rm -rf /var/lib/apt/lists/*
821

9-
ARG SPARK_VERSION=3.1.2
22+
ARG SPARK_VERSION=3.3.1
1023
ENV SPARK_HOME=/opt/spark
11-
RUN mkdir -p /opt/spark && curl -s https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.2.tgz | tar -xz -C "${SPARK_HOME}" --strip-components=1
12-
ENV PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
24+
RUN mkdir -p /opt/spark && curl -s https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz | tar -xz -C "${SPARK_HOME}" --strip-components=1
25+
ENV PATH="${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${PATH}"
1326

14-
RUN cp "${SPARK_HOME}/conf/log4j.properties.template" "${SPARK_HOME}/conf/log4j.properties" && \
15-
sed -ibak 's/rootCategory=INFO/rootCategory=ERROR/g' "${SPARK_HOME}/conf/log4j.properties"
27+
RUN cp "${SPARK_HOME}/conf/log4j2.properties.template" "${SPARK_HOME}/conf/log4j2.properties" && \
28+
sed -ibak 's/rootLogger.level = info/rootLogger.level = error/g' "${SPARK_HOME}/conf/log4j2.properties"
1629

1730
ENV SPARK_NO_DAEMONIZE=true
18-
ENV PYSPARK_PYTHON=python3
31+
ENV PYSPARK_PYTHON=/usr/bin/python3
32+
ENV PYSPARK_DRIVER_PYTHON=/usr/bin/python3
1933
EXPOSE 4040 7077 8080
2034

2135
CMD ["pyspark"]
22-

infra/docker/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
SPARK_VERSION:=3.1.2
2-
COURSE_VERSION:=2021.10
1+
SPARK_VERSION:=3.3.1
2+
COURSE_VERSION:=2022.12
33
IMAGE_NAME:=luisbelloch/spark
44

55
.PHONY: help

local_setup.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22
set -euo pipefail
3-
SPARK_URL=${SPARK_URL:-https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz}
3+
SPARK_URL=${SPARK_URL:-https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz}
44
SPARK_PKG=${SPARK_URL##*/}
55
SPARK_HOME=${SPARK_HOME:-$(pwd)/.spark}
66

@@ -27,8 +27,8 @@ mkdir -p "${SPARK_HOME}"
2727
curl -s "${SPARK_URL}" | tar -xz -C "${SPARK_HOME}" --strip-components=1
2828

2929
stderr "${c_step}[2] Reducing log level${c_norm}"
30-
cp "${SPARK_HOME}"/conf/log4j.properties.template "${SPARK_HOME}"/conf/log4j.properties
31-
sed -ibak 's/rootCategory=INFO/rootCategory=ERROR/g' "${SPARK_HOME}"/conf/log4j.properties
30+
cp "${SPARK_HOME}"/conf/log4j2.properties.template "${SPARK_HOME}"/conf/log4j2.properties
31+
sed -ibak 's/rootLogger.level = info/rootLogger.level = error/g' "${SPARK_HOME}/conf/log4j2.properties"
3232

3333
stderr "${c_step}[3] Testing setup${c_norm}"
3434
echo 'sc.parallelize(1 to 100).count()' | "${SPARK_HOME}"/bin/spark-shell

playbook.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
- hosts: all
33
vars:
44
spark_home: /opt/spark
5-
spark_pkg_name: spark-3.1.2-bin-hadoop3.2
6-
spark_pkg_url: https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
5+
spark_pkg_name: spark-3.3.1-bin-hadoop3
6+
spark_pkg_url: https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
77

88
tasks:
99
- name: Update all packages to the latest version

spark/spark

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#!/bin/bash
22
set -euo pipefail
33

4+
# Alternative: SPARK_DOCKER_IMAGE=apache/spark-py ./spark-submit script.py
5+
readonly SPARK_DOCKER_IMAGE=${SPARK_DOCKER_IMAGE:-luisbelloch/spark}
46
readonly SPARK_SUBMIT=/opt/spark/bin/spark-submit
57
readonly DATA_DIR=/tmp/bigdataupv/data
68
readonly WORK_DIR=/tmp/bigdataupv/scripts
@@ -16,7 +18,7 @@ abs_path() {
1618
}
1719

1820
get_data_volume() {
19-
# Probe for source folder first, if it doesn't
21+
# Probe for source folder first, if it doesn't
2022
# exist, then it'll try the current folder
2123
if [[ -d "${0}" ]]; then
2224
echo "-v $(abs_path $0):"${DATA_DIR}""
@@ -37,5 +39,5 @@ docker run --rm -ti \
3739
-w "${WORK_DIR}" \
3840
-v "${source_folder}":"${WORK_DIR}" \
3941
$data_volume \
40-
luisbelloch/spark "${SPARK_SUBMIT}" "${WORK_DIR}"/$1 ${@:2}
42+
${SPARK_DOCKER_IMAGE} "${SPARK_SUBMIT}" "${WORK_DIR}"/$1 ${@:2}
4143

0 commit comments

Comments
 (0)