
Commit 30f85da

Merge pull request #57 from godatadriven/upgrade-airflow-base-image
Use official docker image as base
2 parents 6634e1b + e6ecf5e commit 30f85da

24 files changed, +115 -111 lines changed

docker/airflow-python/Dockerfile (+14 -56)

@@ -1,59 +1,16 @@
-ARG PYTHON_VERSION=3.6
-FROM python:${PYTHON_VERSION}-slim as airflow-base
+ARG AIRFLOW_VERSION=1.10.12
+ARG PYTHON_VERSION=3.7
+FROM apache/airflow:${AIRFLOW_VERSION}-python${PYTHON_VERSION}
 
-ENV AIRFLOW_BUILD_DEPS="freetds-dev python-dev libkrb5-dev libssl-dev libffi-dev libpq-dev git"
-ENV AIRFLOW_APT_DEPS="libsasl2-dev sasl2-bin libsasl2-2 libsasl2-dev libsasl2-modules freetds-bin build-essential default-libmysqlclient-dev apt-utils curl rsync netcat locales"
+USER root
 
-ENV AIRFLOW_VERSION=1.10.5
-ENV AIRFLOW_HOME /usr/local/airflow
-ENV AIRFLOW_GPL_UNIDECODE=yes
-ENV SLUGIFY_USES_TEXT_UNIDECODE=yes
+ARG AIRFLOW_VERSION=1.10.12
+ENV AIRFLOW_VERSION=${AIRFLOW_VERSION}
+#ENV AIRFLOW_HOME /usr/local/airflow
 ENV WHIRL_SETUP_FOLDER=/etc/airflow/whirl.setup.d
 
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 
-RUN mkdir -p /usr/share/man/man1 \
-    && update-ca-certificates -f \
-    && apt-get update \
-    && apt-get install -y --no-install-recommends --reinstall build-essential \
-    && apt-get install -y --no-install-recommends --allow-unauthenticated \
-    software-properties-common \
-    wget \
-    dnsutils \
-    vim \
-    git \
-    default-libmysqlclient-dev \
-    gcc \
-    ${AIRFLOW_BUILD_DEPS} \
-    ${AIRFLOW_APT_DEPS} \
-    nginx \
-    gosu \
-    sudo \
-    && apt-get clean \
-    && apt-get autoremove -yqq --purge \
-    && rm -rf /var/cache/apk/* /var/lib/apt/lists/* \
-    && (find /usr/share/doc -type f -and -not -name copyright -print0 | xargs -0 -r rm)
-
-FROM airflow-base as main_image
-
-# Optimizing installation of Cassandra driver
-# Speeds up building the image - cassandra driver without CYTHON saves around 10 minutes
-ARG CASS_DRIVER_NO_CYTHON="1"
-# Build cassandra driver on multiple CPUs
-ARG CASS_DRIVER_BUILD_CONCURRENCY="8"
-
-ENV CASS_DRIVER_BUILD_CONCURRENCY=${CASS_DRIVER_BUILD_CONCURRENCY}
-ENV CASS_DRIVER_NO_CYTHON=${CASS_DRIVER_NO_CYTHON}
-
-# By default PIP install run without cache to make image smaller
-ARG PIP_NO_CACHE_DIR="true"
-ENV PIP_NO_CACHE_DIR=${PIP_NO_CACHE_DIR}
-
-RUN pip install --upgrade pip \
-    && pip install apache-airflow[all_dbs,atlas,async,cassandra,celery,cgroups,cloudant,crypto,dask,databricks,datadog,doc,docker,druid,elasticsearch,gcp_api,github_enterprise,google_auth,hdfs,hive,jdbc,jira,kerberos,ldap,mongo,mssql,mysql,oracle,password,pinot,postgres,qds,rabbitmq,redis,salesforce,samba,sendgrid,segment,slack,snowflake,ssh,statsd,vertica,webhdfs,winrm]=="${AIRFLOW_VERSION}" \
-    'pymssql<3.0' \
-    && mkdir -p "${AIRFLOW_HOME}/dags"
-
 RUN mkdir -p "${WHIRL_SETUP_FOLDER}/env.d"
 RUN mkdir -p "${WHIRL_SETUP_FOLDER}/dag.d"
 

@@ -75,18 +32,19 @@ RUN mkdir -p /etc/nginx/ssl \
 COPY nginx-ssl.conf /etc/nginx/conf.d/
 
 # Harden Image
-COPY harden.sh .
-RUN chmod +x harden.sh && \
-    sh harden.sh airflow
+#COPY harden.sh .
+#RUN chmod +x harden.sh && \
+#    sh harden.sh airflow
 
-RUN chown -R airflow.airflow ${AIRFLOW_HOME}
+RUN echo "airflow ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
 USER airflow
 
 EXPOSE 5000
 
-COPY --chown=airflow:airflow entrypoint.sh delete_all_airflow_connections.py /
+COPY --chown=airflow:airflow entrypoint.sh /entrypoint-whirl.sh
 COPY includes /etc/airflow/functions
 COPY pip.conf /home/airflow/.config/pip/pip.conf
 
 ENV PATH="${PATH}:/home/airflow/.local/bin"
-ENTRYPOINT ["/entrypoint.sh"]
+ENTRYPOINT ["/entrypoint-whirl.sh"]
+CMD ["airflow", "--help"]

docker/airflow-python/entrypoint.sh (+33 -24)

@@ -1,18 +1,26 @@
 #!/usr/bin/env bash
-echo "========================================="
-echo "== Initialize Airflow ==================="
-echo "========================================="
-airflow upgradedb
-echo "========================================="
-echo "== Reset Airflow ========================"
-echo "========================================="
-rm -rf ${AIRFLOW_HOME}/*.pid
-rm -rf ${AIRFLOW_HOME}/*.err
-rm -rf ${AIRFLOW_HOME}/*.log
-rm -rf ${AIRFLOW_HOME}/logs/*
-echo "y" | airflow resetdb
-echo "Removing airflows default connections"
-python /delete_all_airflow_connections.py
+# Might be empty
+AIRFLOW_COMMAND="${1}"
+if [[ ${AIRFLOW_COMMAND} == "scheduler" || ${AIRFLOW_COMMAND} == "webserver" ]]; then
+  echo "wait a while for the other systems to be started"
+  sleep 15
+fi
+
+if [[ ${AIRFLOW_COMMAND} == "scheduler" || ${AIRFLOW_COMMAND} == "singlemachine" ]]; then
+  echo "========================================="
+  echo "== Reset Airflow ========================"
+  echo "========================================="
+  rm -rf ${AIRFLOW_HOME}/*.pid
+  rm -rf ${AIRFLOW_HOME}/*.err
+  rm -rf ${AIRFLOW_HOME}/*.log
+  rm -rf ${AIRFLOW_HOME}/logs/*
+  echo "y" | airflow resetdb
+else
+  if [[ ${AIRFLOW_COMMAND} == "webserver" ]]; then
+    echo "wait a bit more to let the scheduler do the database reset."
+    sleep 15
+  fi
+fi
 
 echo "========================================="
 echo "== Setup environment specifics =========="

@@ -38,14 +46,9 @@ for filename in ${WHIRL_SETUP_FOLDER}/dag.d/*.sh; do
   fi
 done
 
-echo "Starting Airflow scheduler..."
-nohup airflow scheduler -D &
+if [[ ${AIRFLOW_COMMAND} == "webserver" || ${AIRFLOW_COMMAND} == "singlemachine" ]]; then
 
-echo "wait a while for the scheduler to be started"
-sleep 15
-
-echo "If needed, unpause dags..."
-if [ "${UNPAUSE_DAG}" = true ]; then
+  if [ "${UNPAUSE_DAG}" = true ]; then
     echo "================================="
     echo "== Enabling all available DAGs =="
     echo "================================="

@@ -57,8 +60,14 @@
       echo "Enabling DAG ${d}"
       airflow unpause "${d}" || true
     done
-end
+  fi
 fi
 
-echo "Starting Airflow webserver..."
-airflow webserver -p 5000
+if [[ ${AIRFLOW_COMMAND} == "singlemachine" ]]; then
+  nohup /entrypoint scheduler -D &
+#  echo "wait a while for the scheduler to be started"
+#  sleep 15
+  /entrypoint webserver -p 5000
+else
+  /entrypoint "${@}"
+fi
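The entrypoint now dispatches on its first argument and hands everything else to the official image's /entrypoint. A rough sketch of the resulting invocations (image tag as in the env compose files; the exact flags are illustrative, not taken from whirl's scripts):

docker run --rm docker-whirl-airflow:py-3.7-local                             # default CMD: airflow --help
docker run --rm docker-whirl-airflow:py-3.7-local scheduler                   # waits, resets the DB, runs the scheduler
docker run --rm -p 5000:5000 docker-whirl-airflow:py-3.7-local singlemachine  # scheduler + webserver in one container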

docker/airflow-python/includes/date_replacement.sh (+1 -1)

@@ -44,7 +44,7 @@ function replace_date() {
   echo $RENAMES
 }
 
-SCHEDULE=$(grep -oP "schedule_interval=\K[^,]*" /usr/local/airflow/dags/*/*.py | head -n1 | sed -e "s/['\"]//g")
+SCHEDULE=$(grep -oP "schedule_interval=\K[^,]*" /opt/airflow/dags/*/*.py | head -n1 | sed -e "s/['\"]//g")
 
 # Change Airflow schedule annotations to crontab schedule values
 if [[ ${SCHEDULE} == *"@"* ]]; then
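Only the DAG location changes here (the official image keeps DAGs under /opt/airflow); the grep/sed pipeline is untouched. A hypothetical one-liner showing what ends up in SCHEDULE for a DAG that declares schedule_interval="@daily":

echo 'dag = DAG("demo", schedule_interval="@daily", start_date=days_ago(1))' \
  | grep -oP "schedule_interval=\K[^,]*" | sed -e "s/['\"]//g"
# prints: @daily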

docker/aws-spark/Dockerfile (+4 -4)

@@ -2,21 +2,21 @@ ARG SPARK_VERSION=latest
 FROM godatadriven/spark:${SPARK_VERSION}
 
 ENV POSTGRES_JDBC_CHECKSUM=7ffa46f8c619377cdebcd17721b6b21ecf6659850179f96fec3d1035cf5a0cdc
-ENV HADOOP_AWS_CHECKSUM=af9f18a0fcef4c564deea6f3ca1eec040b59be3d1cfd7fa557975d25d90e23f6
+ENV HADOOP_AWS_CHECKSUM=acf05db5e92f79b287444c9e6bd71f27f125193c47ef59149460ef02ef73a72c
 ENV AWS_SDK_CHECKSUM=ab74b9bd8baf700bbb8c1270c02d87e570cd237af2464bafa9db87ca1401143a
 
 RUN apt-get update && \
     apt-get install -y curl && \
     apt-get clean
 
-RUN curl -o /usr/spark/jars/aws-java-sdk-1.7.4.jar http://central.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar && \
+RUN curl -o /usr/spark/jars/aws-java-sdk-1.7.4.jar https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar && \
     echo "$AWS_SDK_CHECKSUM /usr/spark/jars/aws-java-sdk-1.7.4.jar" | sha256sum -c -
 
 RUN curl -o /usr/spark/jars/postgresql-42.2.5.jar https://jdbc.postgresql.org/download/postgresql-42.2.5.jar && \
     echo "$POSTGRES_JDBC_CHECKSUM /usr/spark/jars/postgresql-42.2.5.jar" | sha256sum -c -
 
-RUN curl -o /usr/spark/jars/hadoop-aws-2.7.3.jar http://central.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar && \
-    echo "$HADOOP_AWS_CHECKSUM /usr/spark/jars/hadoop-aws-2.7.3.jar" | sha256sum -c -
+RUN curl -o /usr/spark/jars/hadoop-aws-2.7.4.jar https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.4/hadoop-aws-2.7.4.jar && \
+    echo "$HADOOP_AWS_CHECKSUM /usr/spark/jars/hadoop-aws-2.7.4.jar" | sha256sum -c -
 
 RUN echo "spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem" > ${SPARK_HOME}/conf/spark-defaults.conf && \
     echo "spark.hadoop.fs.s3a.connection.ssl.enabled=false" >> ${SPARK_HOME}/conf/spark-defaults.conf && \

envs/airflow-s3-logging/.whirl.env (+1)

@@ -8,4 +8,5 @@ PORT_WEB_UI=8080
 # Airflow variables
 AIRFLOW__CORE__EXPOSE_CONFIG=True
 AIRFLOW__WEBSERVER__EXPOSE_CONFIG=True
+AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False
 AIRFLOW__CORE__LOAD_EXAMPLES=False
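AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False takes over from the deleted delete_all_airflow_connections.py helper: the example connections are simply never created. A hedged way to double-check inside a running container (Airflow 1.10 CLI syntax, sketched from memory):

docker-compose exec airflow airflow connections --list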

envs/airflow-s3-logging/docker-compose.yml (+2 -1)

@@ -3,6 +3,7 @@ version: '3'
 services:
   airflow:
     image: docker-whirl-airflow:py-${PYTHON_VERSION}-local
+    command: ["singlemachine"]
     ports:
       - '5000:5000' # HTTP (Airflow Web UI)
     env_file:

@@ -11,7 +12,7 @@
       - WHIRL_SETUP_FOLDER
       - UNPAUSE_DAG
     volumes:
-      - ${DAG_FOLDER}:/usr/local/airflow/dags/$PROJECTNAME
+      - ${DAG_FOLDER}:/opt/airflow/dags/$PROJECTNAME
       - ${ENVIRONMENT_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/env.d/
       - ${DAG_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/dag.d/
     depends_on:

envs/api-python-s3/.whirl.env (+1)

@@ -8,4 +8,5 @@ PORT_WEB_UI=8080
 # Airflow env vars
 AIRFLOW__CORE__EXPOSE_CONFIG=True
 AIRFLOW__WEBSERVER__EXPOSE_CONFIG=True
+AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False
 AIRFLOW__CORE__LOAD_EXAMPLES=False

envs/api-python-s3/docker-compose.yml (+3 -2)

@@ -3,6 +3,7 @@ version: '3'
 services:
   airflow:
     image: docker-whirl-airflow:py-${PYTHON_VERSION}-local
+    command: ["singlemachine"]
     ports:
       - '5000:5000' # HTTP (Airflow Web UI)
     env_file:

@@ -11,7 +12,7 @@
       - WHIRL_SETUP_FOLDER
       - UNPAUSE_DAG
     volumes:
-      - ${DAG_FOLDER}:/usr/local/airflow/dags/$PROJECTNAME
+      - ${DAG_FOLDER}:/opt/airflow/dags/$PROJECTNAME
       - ${ENVIRONMENT_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/env.d/
       - ${DAG_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/dag.d/
     depends_on:

@@ -21,7 +22,7 @@
       - s3server:${DEMO_BUCKET}.s3server
 
   mockserver:
-    image: jamesdbloom/mockserver:mockserver-5.6.1
+    image: jamesdbloom/mockserver:mockserver-5.11.1
     ports:
       - 1080:1080
       - 1081:1081

envs/external-airflow-db/.whirl.env (+2 -1)

@@ -1,13 +1,14 @@
 # postgres env vars
 POSTGRES_HOST=postgresdb
 POSTGRES_PORT=5432
-POSTGRES_PASSWORD=p@ssw0rd
+POSTGRES_PASSWORD=pAssw0rd
 POSTGRES_USER=airflow
 POSTGRES_DB=airflow
 
 # Airflow variables
 AIRFLOW__CORE__EXPOSE_CONFIG=True
 AIRFLOW__WEBSERVER__EXPOSE_CONFIG=True
+AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False
 AIRFLOW__CORE__LOAD_EXAMPLES=False
 AIRFLOW__CORE__FERNET_KEY=W5gmA+dp84hkZEzpxPw4LTmhbXA1uVxKZsgIfay8wno=
 AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}
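The SQL Alchemy connection string is assembled from the postgres variables above; the password change presumably avoids the '@' in p@ssw0rd breaking the interpolated URI. A hedged way to poke at the metadata DB once the stack is up, assuming the postgres image's default trust auth on the local socket:

docker-compose exec postgresdb psql -U airflow -d airflow -c '\dt'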

envs/external-airflow-db/docker-compose.yml (+19 -3)

@@ -1,8 +1,9 @@
 version: '3'
 
 services:
-  airflow:
+  webserver:
     image: docker-whirl-airflow:py-${PYTHON_VERSION}-local
+    command: ["webserver", "-p", "5000"]
     ports:
       - '5000:5000' # HTTP (Airflow Web UI)
     env_file:

@@ -11,14 +12,29 @@
       - WHIRL_SETUP_FOLDER
       - UNPAUSE_DAG
     volumes:
-      - ${DAG_FOLDER}:/usr/local/airflow/dags/$PROJECTNAME
+      - ${DAG_FOLDER}:/opt/airflow/dags/$PROJECTNAME
+      - ${ENVIRONMENT_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/env.d/
+      - ${DAG_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/dag.d/
+    depends_on:
+      - postgresdb
+
+  scheduler:
+    image: docker-whirl-airflow:py-${PYTHON_VERSION}-local
+    command: ["scheduler"]
+    env_file:
+      - .whirl.env
+    environment:
+      - WHIRL_SETUP_FOLDER
+      - UNPAUSE_DAG
+    volumes:
+      - ${DAG_FOLDER}:/opt/airflow/dags/$PROJECTNAME
       - ${ENVIRONMENT_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/env.d/
       - ${DAG_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/dag.d/
     depends_on:
       - postgresdb
 
   postgresdb:
-    image: postgres:11
+    image: postgres:13
     ports:
       - 5432:5432
     environment:
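This environment now runs the webserver and scheduler as separate services, matching the entrypoint's new per-command behaviour (the scheduler container performs the DB reset, the webserver waits for it). A hedged sketch of bringing it up by hand, outside of whirl's own scripts:

# variables such as PYTHON_VERSION, DAG_FOLDER, PROJECTNAME and ENVIRONMENT_FOLDER are normally exported by whirl
docker-compose up -d postgresdb
docker-compose up -d scheduler webserver
docker-compose logs -f webserver   # UI on http://localhost:5000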

envs/external-smtp-config/.whirl.env (+1)

@@ -1,6 +1,7 @@
 # Airflow variables
 AIRFLOW__CORE__EXPOSE_CONFIG=True
 AIRFLOW__WEBSERVER__EXPOSE_CONFIG=True
+AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False
 AIRFLOW__CORE__LOAD_EXAMPLES=False
 AIRFLOW__SMTP__SMTP_HOST=smtp-server
 AIRFLOW__SMTP__SMTP_STARTTLS=False

envs/external-smtp-config/docker-compose.yml (+2 -1)

@@ -3,6 +3,7 @@ version: '3'
 services:
   airflow:
     image: docker-whirl-airflow:py-${PYTHON_VERSION}-local
+    command: ["singlemachine"]
     ports:
       - '5000:5000' # HTTP (Airflow Web UI)
     env_file:

@@ -11,7 +12,7 @@
       - WHIRL_SETUP_FOLDER
       - UNPAUSE_DAG
     volumes:
-      - ${DAG_FOLDER}:/usr/local/airflow/dags/$PROJECTNAME
+      - ${DAG_FOLDER}:/opt/airflow/dags/$PROJECTNAME
       - ${ENVIRONMENT_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/env.d/
       - ${DAG_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/dag.d/
     depends_on:

envs/local-ssh/.whirl.env (+1)

@@ -1,4 +1,5 @@
 # Airflow env vars
 AIRFLOW__CORE__EXPOSE_CONFIG=True
 AIRFLOW__WEBSERVER__EXPOSE_CONFIG=True
+AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False
 AIRFLOW__CORE__LOAD_EXAMPLES=False

envs/local-ssh/docker-compose.yml (+2 -1)

@@ -3,6 +3,7 @@ version: '3'
 services:
   airflow:
     image: docker-whirl-airflow:py-${PYTHON_VERSION}-local
+    command: ["singlemachine"]
     ports:
       - '5000:5000' # HTTP (Airflow Web UI)
     env_file:

@@ -11,6 +12,6 @@
       - WHIRL_SETUP_FOLDER
       - UNPAUSE_DAG
     volumes:
-      - ${DAG_FOLDER}:/usr/local/airflow/dags/$PROJECTNAME
+      - ${DAG_FOLDER}:/opt/airflow/dags/$PROJECTNAME
       - ${ENVIRONMENT_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/env.d/
       - ${DAG_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/dag.d/

envs/postgres-s3-external-spark/.whirl.env (+2 -1)

@@ -13,9 +13,10 @@ POSTGRES_USER=postgres
 POSTGRES_DB=postgresdb
 
 # Spark variables
-SPARK_VERSION=2.4.1
+SPARK_VERSION=3.0
 
 # Airflow env vars
 AIRFLOW__CORE__EXPOSE_CONFIG=True
 AIRFLOW__WEBSERVER__EXPOSE_CONFIG=True
+AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False
 AIRFLOW__CORE__LOAD_EXAMPLES=False
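SPARK_VERSION feeds the ARG in docker/aws-spark/Dockerfile (FROM godatadriven/spark:${SPARK_VERSION}), so the bump to 3.0 changes the Spark base image for this environment. A hedged sketch of building that image directly, with a made-up local tag:

docker build \
  --build-arg SPARK_VERSION=3.0 \
  -t whirl-aws-spark:3.0-local \
  docker/aws-spark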
