Skip to content

Commit e203ece

Browse files
authored
[CHORE] Reduce docker image size by removing pip cache (open-metadata#12708)
* [CHORE] Reduce docker image size by removing pip cache
* [CHORE] Reduce image size for ingestion/operators as well
* [CHORE] Reduce image size for CI
1 parent 191cee2 commit e203ece

4 files changed

Lines changed: 87 additions & 56 deletions

File tree

ingestion/Dockerfile

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
FROM apache/airflow:2.6.3-python3.9
22
USER root
3-
RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
4-
RUN curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
3+
RUN curl -sS https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
4+
RUN curl -sS https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
55
# Install Dependencies (listed in alphabetical order)
6-
RUN apt-get update \
7-
&& apt-get install -y alien \
6+
ENV DEBIAN_FRONTEND=noninteractive
7+
RUN apt-get -qq update \
8+
&& apt-get -qq install -y \
9+
alien \
810
build-essential \
911
default-libmysqlclient-dev \
1012
freetds-bin \
@@ -31,19 +33,18 @@ RUN apt-get update \
3133
unixodbc \
3234
unixodbc-dev \
3335
unzip \
34-
vim \
3536
wget --no-install-recommends \
3637
# Accept MSSQL ODBC License
3738
&& ACCEPT_EULA=Y apt-get install -y msodbcsql18 \
3839
&& rm -rf /var/lib/apt/lists/*
3940

4041
RUN if [[ $(uname -m) == "arm64" || $(uname -m) == "aarch64" ]]; \
4142
then \
42-
wget https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \
43-
unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
43+
wget -q https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \
44+
unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
4445
else \
45-
wget https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \
46-
unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
46+
wget -q https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \
47+
unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
4748
fi
4849

4950
ENV LD_LIBRARY_PATH=/instantclient
@@ -55,25 +56,31 @@ ENV LD_LIBRARY_PATH=/instantclient
5556
# https://security.snyk.io/vuln/SNYK-DEBIAN11-BIND9-3027852
5657
# https://security.snyk.io/vuln/SNYK-DEBIAN11-EXPAT-3023031 we are already installed the latest
5758
RUN echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/backports.list
58-
RUN apt-get update \
59-
&& apt-get install -t bullseye-backports -y \
59+
RUN apt-get -qq update \
60+
&& apt-get -qq install -t bullseye-backports -y \
6061
curl \
6162
libpcre2-8-0 \
6263
postgresql-common \
6364
expat \
64-
bind9
65+
bind9 \
66+
&& rm -rf /var/lib/apt/lists/*
6567

6668
# Required for Starting Ingestion Container in Docker Compose
67-
COPY --chown=airflow:0 ingestion/ingestion_dependency.sh /opt/airflow
69+
COPY --chown=airflow:0 --chmod=775 ingestion/ingestion_dependency.sh /opt/airflow
6870
# Required for Ingesting Sample Data
6971
COPY --chown=airflow:0 ingestion/examples/sample_data /home/airflow/ingestion/examples/sample_data
7072
# Required for Airflow DAGs of Sample Data
7173
COPY --chown=airflow:0 ingestion/examples/airflow/dags /opt/airflow/dags
72-
# Provide Execute Permissions to shell script
73-
RUN chmod +x /opt/airflow/ingestion_dependency.sh
7474
USER airflow
7575
# Argument to provide for Ingestion Dependencies to install. Defaults to all
7676
ARG INGESTION_DEPENDENCY="all"
77+
78+
# Disable pip cache dir
79+
# https://pip.pypa.io/en/stable/topics/caching/#avoiding-caching
80+
ENV PIP_NO_CACHE_DIR=1
81+
# Make pip silent
82+
ENV PIP_QUIET=1
83+
7784
RUN pip install --upgrade pip
7885
RUN pip install "openmetadata-managed-apis~=1.1.0.4" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.9.txt"
7986
RUN pip install "openmetadata-ingestion[${INGESTION_DEPENDENCY}]~=1.1.0.4"

ingestion/Dockerfile.ci

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
FROM apache/airflow:2.6.3-python3.9
22
USER root
3-
RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
4-
RUN curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
3+
RUN curl -sS https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
4+
RUN curl -sS https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
55
# Install Dependencies (listed in alphabetical order)
6-
RUN apt-get update \
7-
&& apt-get install -y alien \
6+
RUN apt-get -qq update \
7+
&& apt-get -qq install -y \
8+
alien \
89
build-essential \
910
default-libmysqlclient-dev \
1011
freetds-bin \
@@ -34,7 +35,7 @@ RUN apt-get update \
3435
vim \
3536
wget --no-install-recommends \
3637
# Accept MSSQL ODBC License
37-
&& ACCEPT_EULA=Y apt-get install -y msodbcsql18 \
38+
&& ACCEPT_EULA=Y apt-get -qq install -y msodbcsql18 \
3839
&& rm -rf /var/lib/apt/lists/*
3940

4041
RUN if [[ $(uname -m) == "arm64" || $(uname -m) == "aarch64" ]]; \
@@ -55,33 +56,41 @@ ENV LD_LIBRARY_PATH=/instantclient
5556
# https://security.snyk.io/vuln/SNYK-DEBIAN11-BIND9-3027852
5657
# https://security.snyk.io/vuln/SNYK-DEBIAN11-EXPAT-3023031 we are already installed the latest
5758
RUN echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/backports.list
58-
RUN apt-get update \
59-
&& apt-get install -t bullseye-backports -y \
59+
RUN apt-get -qq update \
60+
&& apt-get -qq install -t bullseye-backports -y \
6061
curl \
6162
libpcre2-8-0 \
6263
postgresql-common \
6364
expat \
6465
bind9
6566

6667
# Required for Starting Ingestion Container in Docker Compose
67-
COPY --chown=airflow:0 ingestion/ingestion_dependency.sh /opt/airflow
68+
# Provide Execute Permissions to shell script
69+
COPY --chown=airflow:0 --chmod=775 ingestion/ingestion_dependency.sh /opt/airflow
6870
# Required for Ingesting Sample Data
6971
COPY --chown=airflow:0 ingestion /home/airflow/ingestion
7072

7173
COPY --chown=airflow:0 openmetadata-airflow-apis /home/airflow/openmetadata-airflow-apis
7274
# Required for Airflow DAGs of Sample Data
7375
COPY --chown=airflow:0 ingestion/examples/airflow/dags /opt/airflow/dags
74-
# Provide Execute Permissions to shell script
75-
RUN chmod +x /opt/airflow/ingestion_dependency.sh
76+
7677
USER airflow
7778
ARG AIRFLOW_CONSTRAINTS_LOCATION="https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.9.txt"
78-
# Argument to provide for Ingestion Dependencies to install. Defaults to all
79+
80+
# Disable pip cache dir
81+
# https://pip.pypa.io/en/stable/topics/caching/#avoiding-caching
82+
ENV PIP_NO_CACHE_DIR=1
83+
# Make pip silent
84+
ENV PIP_QUIET=1
85+
7986
RUN pip install --upgrade pip
8087

8188
WORKDIR /home/airflow/openmetadata-airflow-apis
8289
RUN pip install "."
8390

8491
WORKDIR /home/airflow/ingestion
92+
93+
# Argument to provide for Ingestion Dependencies to install. Defaults to all
8594
ARG INGESTION_DEPENDENCY="all"
8695
RUN pip install ".[${INGESTION_DEPENDENCY}]"
8796

@@ -105,6 +114,7 @@ RUN pip uninstall psycopg2-binary -y
105114
RUN pip install psycopg2 mysqlclient==2.1.1
106115
# Make required folders for openmetadata-airflow-apis
107116
RUN mkdir -p /opt/airflow/dag_generated_configs
117+
108118
EXPOSE 8080
109119
# This is required as it's responsible to create airflow.cfg file
110120
RUN airflow db init && rm -f /opt/airflow/airflow.db

ingestion/operators/docker/Dockerfile

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
FROM python:3.9-bullseye
22

3-
RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
4-
RUN curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
3+
RUN curl -sS https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
4+
RUN curl -sS https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
55

66
# Install Dependencies (listed in alphabetical order)
7-
RUN apt-get update \
8-
&& apt-get install -y alien \
7+
RUN apt-get -qq update \
8+
&& apt-get -qq install -y \
9+
alien \
910
build-essential \
1011
default-libmysqlclient-dev \
1112
freetds-bin \
@@ -34,24 +35,24 @@ RUN apt-get update \
3435
unzip \
3536
wget --no-install-recommends \
3637
# Accept MSSQL ODBC License
37-
&& ACCEPT_EULA=Y apt-get install -y msodbcsql18 \
38+
&& ACCEPT_EULA=Y apt-get -qq install -y msodbcsql18 \
3839
&& rm -rf /var/lib/apt/lists/*
3940

4041
# Add updated postgres/redshift dependencies based on libq
4142
RUN curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add -
4243
RUN echo "deb https://apt.postgresql.org/pub/repos/apt/ buster-pgdg main" > /etc/apt/sources.list.d/pgdg.list; \
43-
apt-get update; \
44-
apt-get install --no-install-recommends -y libpq-dev postgresql-client postgresql-common postgresql postgresql-contrib; \
45-
apt-get autoremove -yqq --purge; \
46-
apt-get clean && rm -rf /var/lib/apt/lists/*
44+
apt-get -qq update; \
45+
apt-get -qq install --no-install-recommends -y libpq-dev postgresql-client postgresql-common postgresql postgresql-contrib; \
46+
apt-get -qq autoremove -yqq --purge; \
47+
apt-get -qq clean && rm -rf /var/lib/apt/lists/*
4748

4849
RUN if [[ $(uname -m) == "arm64" || $(uname -m) == "aarch64" ]]; \
4950
then \
50-
wget https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \
51-
unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
51+
wget -q https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \
52+
unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
5253
else \
53-
wget https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \
54-
unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
54+
wget -q https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \
55+
unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
5556
fi
5657

5758
ENV LD_LIBRARY_PATH=/instantclient
@@ -63,7 +64,7 @@ ENV LD_LIBRARY_PATH=/instantclient
6364
# https://security.snyk.io/vuln/SNYK-DEBIAN11-BIND9-3027852
6465
# https://security.snyk.io/vuln/SNYK-DEBIAN11-EXPAT-3023031 we are already installed the latest
6566
RUN echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/backports.list
66-
RUN apt-get update \
67+
RUN apt-get -qq update \
6768
&& apt-get install -t bullseye-backports -y \
6869
curl \
6970
libpcre2-8-0 \
@@ -76,6 +77,12 @@ WORKDIR ingestion/
7677
# Required for Airflow DockerOperator, as we need to run the workflows from a `python main.py` command in the container.
7778
COPY ingestion/operators/docker/*.py .
7879

80+
# Disable pip cache dir
81+
# https://pip.pypa.io/en/stable/topics/caching/#avoiding-caching
82+
ENV PIP_NO_CACHE_DIR=1
83+
# Make pip silent
84+
ENV PIP_QUIET=1
85+
7986
RUN pip install --upgrade pip
8087

8188
ARG INGESTION_DEPENDENCY="all"

ingestion/operators/docker/Dockerfile-dev

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
FROM python:3.9-bullseye
22

3-
RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
4-
RUN curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
3+
RUN curl -sS https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
4+
RUN curl -sS https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
55

66
# Install Dependencies (listed in alphabetical order)
7-
RUN apt-get update \
8-
&& apt-get install -y alien \
7+
RUN apt-get -qq update \
8+
&& apt-get -qq install -y \
9+
alien \
910
build-essential \
1011
default-libmysqlclient-dev \
1112
freetds-bin \
@@ -35,24 +36,24 @@ RUN apt-get update \
3536
vim \
3637
wget --no-install-recommends \
3738
# Accept MSSQL ODBC License
38-
&& ACCEPT_EULA=Y apt-get install -y msodbcsql18 \
39+
&& ACCEPT_EULA=Y apt-get -qq install -y msodbcsql18 \
3940
&& rm -rf /var/lib/apt/lists/*
4041

4142
# Add updated postgres/redshift dependencies based on libq
42-
RUN curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add -
43+
RUN curl -sS https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add -
4344
RUN echo "deb https://apt.postgresql.org/pub/repos/apt/ buster-pgdg main" > /etc/apt/sources.list.d/pgdg.list; \
44-
apt-get update; \
45-
apt-get install --no-install-recommends -y libpq-dev postgresql-client postgresql-common postgresql postgresql-contrib; \
46-
apt-get autoremove -yqq --purge; \
47-
apt-get clean && rm -rf /var/lib/apt/lists/*
45+
apt-get -qq update; \
46+
apt-get -qq install --no-install-recommends -y libpq-dev postgresql-client postgresql-common postgresql postgresql-contrib; \
47+
apt-get -qq autoremove -yqq --purge; \
48+
apt-get -qq clean && rm -rf /var/lib/apt/lists/*
4849

4950
RUN if [[ $(uname -m) == "arm64" || $(uname -m) == "aarch64" ]]; \
5051
then \
51-
wget https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \
52-
unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
52+
wget -q https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \
53+
unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
5354
else \
54-
wget https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \
55-
unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
55+
wget -q https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \
56+
unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \
5657
fi
5758

5859
ENV LD_LIBRARY_PATH=/instantclient
@@ -64,8 +65,8 @@ ENV LD_LIBRARY_PATH=/instantclient
6465
# https://security.snyk.io/vuln/SNYK-DEBIAN11-BIND9-3027852
6566
# https://security.snyk.io/vuln/SNYK-DEBIAN11-EXPAT-3023031 we are already installed the latest
6667
RUN echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/backports.list
67-
RUN apt-get update \
68-
&& apt-get install -t bullseye-backports -y \
68+
RUN apt-get -qq update \
69+
&& apt-get -qq install -t bullseye-backports -y \
6970
curl \
7071
libpcre2-8-0 \
7172
postgresql-common \
@@ -77,6 +78,12 @@ WORKDIR ingestion/
7778
# For the dev build, we copy all files
7879
COPY ingestion/ .
7980

81+
# Disable pip cache dir
82+
# https://pip.pypa.io/en/stable/topics/caching/#avoiding-caching
83+
ENV PIP_NO_CACHE_DIR=1
84+
# Make pip silent
85+
ENV PIP_QUIET=1
86+
8087
RUN pip install --upgrade pip setuptools
8188

8289
ARG INGESTION_DEPENDENCY="all"

0 commit comments

Comments
 (0)