-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathDockerfile-dc
More file actions
177 lines (142 loc) · 6.26 KB
/
Dockerfile-dc
File metadata and controls
177 lines (142 loc) · 6.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# The base image is chosen by the caller:
#   docker build --build-arg BASE_CONTAINER=<image> ...
# The default below is only a placeholder name.
ARG BASE_CONTAINER=define_as_build_arg
FROM ${BASE_CONTAINER}

LABEL MAINTAINER="CSEL Ops <admin@cs.colorado.edu>"

# hadolint DL4006: run shell-form RUN steps under bash with pipefail so that a
# failure on the left-hand side of a pipe fails the whole step.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# The installation steps that follow run as root.
USER root
# Spark dependencies
# Default values can be overridden at build time
# (ARGs are in lower case to distinguish them from ENV)
ARG spark_version="3.3.0"
ARG hadoop_version="3"
ARG scala_version="2.13"
# SHA-512 of the Spark tarball selected by the three versions above; override it
# together with them. Declared exactly once: a repeated ARG of the same name
# silently replaces the earlier default, so only the last value is ever used.
ARG spark_checksum="4c09dac70e22bf1d5b7b2cabc1dd92aba13237f52a5b682c67982266fc7a0f5e0f964edff9bc76adbd8cb444eb1a00fdc59516147f99e4e2ce068420ff4881f0"
# NOTE(review): the JDK install step and JAVA_HOME in this file name Java 17
# directly, so overriding this ARG currently has no effect.
ARG openjdk_version="17"

# Persist the chosen versions into the image environment for later steps.
ENV APACHE_SPARK_VERSION="${spark_version}" \
    HADOOP_VERSION="${hadoop_version}"
# Install a headless OpenJDK 17 plus the Java CA certificates and select that
# JDK as the system `java`. The package index is removed in the same layer.
# (Interactive alternative, kept for reference: update-alternatives --config java)
# NOTE(review): the JVM path below is the amd64 one.
RUN apt-get update --yes && \
    apt-get install --yes --no-install-recommends \
        ca-certificates-java \
        openjdk-17-jdk-headless && \
    update-alternatives --set java /usr/lib/jvm/java-17-openjdk-amd64/bin/java && \
    rm -rf /var/lib/apt/lists/*
# Download Spark from archive.apache.org, verify its SHA-512 against
# ${spark_checksum}, unpack it under /usr/local and delete the tarball, all in
# one layer.
# The distribution name gets a "-scala<version>" suffix only when scala_version
# is non-empty.
WORKDIR /tmp
RUN spark_dist="spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}${scala_version:+-scala${scala_version}}" && \
    wget -qO "spark.tgz" "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/${spark_dist}.tgz" && \
    echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
    tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
    rm "spark.tgz"
WORKDIR /usr/local

# Configure Spark
# SPARK_HOME is set in its own ENV instruction so that the PATH entry in the
# next instruction can expand it.
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
    PATH="${PATH}:${SPARK_HOME}/bin"

# Point ${SPARK_HOME} at the unpacked distribution. The directory name carries a
# "-scala<version>" suffix only when scala_version is non-empty, so the link is
# created once, here, with the right target (no intermediate link that would
# dangle for Scala builds and have to be removed again).
# `rm -f ... ;` is deliberately not chained with &&: a stale link is removed if
# present, and its absence is not an error.
# The last two commands link spark-config.sh into the before-notebook.d hook
# directory so that it is sourced automatically (it sets PYTHONPATH).
RUN rm -f "${SPARK_HOME}" ; \
    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}${scala_version:+-scala${scala_version}}" "${SPARK_HOME}" && \
    mkdir -p /usr/local/bin/before-notebook.d && \
    ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh
# System-wide IPython kernel configuration: copy the config file from the build
# context into /etc/ipython and normalise permissions on that directory.
COPY Dockerfile-dc-ipython-kernel-config.py /etc/ipython/ipython_kernel_config.py
RUN fix-permissions /etc/ipython/
# Switching to the notebook user is disabled here; the following steps keep
# running as the current user.
#USER ${NB_UID}
# Install pyarrow into the conda environment, drop the package caches in the
# same layer, then re-apply permissions on the conda tree and the user's home.
RUN mamba install -q -y pyarrow && \
    mamba clean -afy && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

WORKDIR "${HOME}"
##
## Now the all-spark-notebook
##
USER root

# RSpark config: R looks for user libraries inside the Spark distribution.
# key=value form is used because the space-separated `ENV key value` form is
# deprecated (build check LegacyKeyValueFormat); the expansion is quoted so a
# path containing spaces cannot be split into several arguments.
ENV R_LIBS_USER="${SPARK_HOME}/R/lib"
RUN fix-permissions "${R_LIBS_USER}"
# R pre-requisites: fonts plus the C and Fortran compilers.
# The package index is refreshed and removed within this single layer.
RUN apt-get update && \
    apt-get install --yes --no-install-recommends \
        fonts-dejavu \
        gcc \
        gfortran && \
    rm -rf /var/lib/apt/lists/*
# R itself and the R packages used by the notebooks, including IRKernel, which
# gets installed globally. Caches are dropped and permissions on the conda tree
# and the user's home are re-applied in the same layer.
RUN mamba install -q -y \
        r-base \
        r-ggplot2 \
        r-irkernel \
        r-rcurl \
        r-sparklyr && \
    mamba clean -afy && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"
# Spylon-kernel: install the 0.4 series, register the kernel under the
# environment prefix (--sys-prefix), and remove the ~/.local directory from the
# user's home before permissions are re-applied.
RUN mamba install -q -y "spylon-kernel=0.4*" && \
    mamba clean -afy && \
    python -m spylon_kernel install --sys-prefix && \
    rm -rf "/home/${NB_USER}/.local" && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"
##
## Install hadoop
##
ENV APACHE_HADOOP_VERSION=3.3.4
# SHA-512 of hadoop-${APACHE_HADOOP_VERSION}.tar.gz; override together with the version.
ARG hadoop_checksum="ca5e12625679ca95b8fd7bb7babc2a8dcb2605979b901df9ad137178718821097b67555115fafc6dbf6bb32b61864ccb6786dbc555e589694a22bf69147780b4"

# Download, verify, unpack and delete the tarball in ONE layer: a file removed
# in a later RUN still occupies space in the layer that created it.
# The tarball is fetched from archive.apache.org, like the Spark download in
# this file, so a pinned release stays downloadable; the SHA-512 check guards
# the content regardless of where it came from.
# After unpacking:
#   - /usr/local/hadoop becomes a version-independent alias,
#   - the native libraries are linked into /usr/lib and the linker cache is
#     rebuilt (`ldconfig -p` prints the resulting cache to the build log),
#   - permissions on the installation tree are re-applied.
RUN wget -qO /tmp/hadoop.tar.gz "https://archive.apache.org/dist/hadoop/common/hadoop-${APACHE_HADOOP_VERSION}/hadoop-${APACHE_HADOOP_VERSION}.tar.gz" && \
    echo "${hadoop_checksum} */tmp/hadoop.tar.gz" | sha512sum -c - && \
    tar xzf /tmp/hadoop.tar.gz -C /usr/local --owner "${NB_UID}" --group "${NB_GID}" --no-same-owner && \
    rm /tmp/hadoop.tar.gz && \
    ln -s "/usr/local/hadoop-${APACHE_HADOOP_VERSION}" "/usr/local/hadoop" && \
    ln -s /usr/local/hadoop/lib/native/* /usr/lib && \
    ldconfig && \
    ldconfig -p && \
    fix-permissions "/usr/local/hadoop-${APACHE_HADOOP_VERSION}"
##
## Configure HADOOP
##
# HADOOP_HOME is set in its own ENV instruction so that the instruction below
# can expand it.
ENV HADOOP_HOME=/usr/local/hadoop-${APACHE_HADOOP_VERSION}

# Every variable in a single ENV instruction is expanded against the environment
# as it was BEFORE that instruction. PATH is therefore assigned exactly once
# here: a second `PATH=$PATH:...` entry would not extend the first one but
# replace it, dropping ${HADOOP_HOME}/sbin from PATH.
# NOTE(review): HADOOP_OPTS is not set earlier in this file, so unless the base
# image defines it, $HADOOP_OPTS expands to the empty string.
# NOTE(review): JAVA_HOME is the amd64 OpenJDK 17 path.
ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
    HADOOP_INSTALL=$HADOOP_HOME \
    HADOOP_MAPRED_HOME=$HADOOP_HOME \
    HADOOP_COMMON_HOME=$HADOOP_HOME \
    HADOOP_HDFS_HOME=$HADOOP_HOME \
    YARN_HOME=$HADOOP_HOME \
    HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native \
    HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$HADOOP_HOME/lib/native" \
    JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 \
    PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
# Format the HDFS NameNode at build time, then re-apply permissions on the
# Hadoop installation tree (HADOOP_HOME is that versioned directory).
RUN hdfs namenode -format && \
    fix-permissions "${HADOOP_HOME}"
##
## Install Google cloud utilities -- see https://cloud.google.com/sdk/docs/quickstart
##
# Order matters: earlier steps delete /var/lib/apt/lists, so the index must be
# refreshed BEFORE the helper packages are installed, and again after the
# Google repository has been added.
# The build is already running as root, so no sudo is needed.
# The repository key is de-armoured straight into the keyring referenced by
# signed-by (apt-key is deprecated); --yes lets gpg overwrite an existing file.
# The package index is removed at the end of the layer that created it.
RUN apt-get update && \
    apt-get install -y \
        apt-transport-https \
        ca-certificates \
        curl \
        gnupg && \
    curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | \
        gpg --dearmor --yes -o /usr/share/keyrings/cloud.google.gpg && \
    echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
        > /etc/apt/sources.list.d/google-cloud-sdk.list && \
    apt-get update && \
    apt-get install -y google-cloud-sdk kubectl && \
    rm -rf /var/lib/apt/lists/*

# gRPC / protobuf for Python; --no-cache-dir keeps pip's download cache out of
# the layer.
RUN pip install --no-cache-dir grpcio grpcio-tools

#RUN apt-get -y install rabbitmq-server

# gRPC / protobuf development packages and the sqlite3 CLI. The step refreshes
# the package index itself instead of relying on lists left by an earlier layer.
RUN apt-get update && \
    apt-get install -y \
        libgrpc++-dev \
        libgrpc-dev \
        libprotobuf-dev \
        protobuf-compiler \
        protobuf-compiler-grpc \
        sqlite3 && \
    rm -rf /var/lib/apt/lists/*
##
## sql magic
##
USER root
# Database drivers and the ipython-sql extension from conda-forge.
# As in the other mamba steps of this file, caches are dropped and permissions
# on the conda tree and the user's home are re-applied in the same layer,
# because the install runs as root.
RUN mamba install -y -c conda-forge \
        conda-forge::psycopg2 \
        conda-forge::mysqlclient \
        conda-forge::ipython-sql && \
    mamba clean -afy && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

# Finish as the unprivileged notebook user, in that user's home directory.
USER ${NB_UID}
WORKDIR "${HOME}"