diff --git a/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
index 1b39637340..6596229af5 100644
--- a/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
+++ b/docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
@@ -19,7 +19,6 @@ FROM ubuntu:22.04
 
 ARG shared_workspace=/opt/workspace
 ARG spark_version=3.4.1
-ARG hadoop_version=3
 ARG hadoop_s3_version=3.3.4
 ARG aws_sdk_version=1.12.402
 ARG spark_xml_version=0.16.0
@@ -29,8 +28,7 @@ ARG spark_extension_version=2.11.0
 
 # Set up envs
 ENV SHARED_WORKSPACE=${shared_workspace}
-ENV SPARK_HOME /opt/spark
-RUN mkdir ${SPARK_HOME}
+ENV SPARK_HOME /usr/local/lib/python3.10/dist-packages/pyspark
 ENV SEDONA_HOME /opt/sedona
 RUN mkdir ${SEDONA_HOME}
 
@@ -44,7 +42,7 @@ COPY ./ ${SEDONA_HOME}/
 RUN chmod +x ${SEDONA_HOME}/docker/spark.sh
 RUN chmod +x ${SEDONA_HOME}/docker/sedona.sh
 
-RUN ${SEDONA_HOME}/docker/spark.sh ${spark_version} ${hadoop_version} ${hadoop_s3_version} ${aws_sdk_version} ${spark_xml_version}
+RUN ${SEDONA_HOME}/docker/spark.sh ${spark_version} ${hadoop_s3_version} ${aws_sdk_version} ${spark_xml_version}
 
 # Install Python dependencies
 COPY docker/sedona-spark-jupyterlab/requirements.txt /opt/requirements.txt
diff --git a/docker/spark.sh b/docker/spark.sh
index 8cca154a34..bd935e8a45 100755
--- a/docker/spark.sh
+++ b/docker/spark.sh
@@ -19,10 +19,9 @@ set -e
 
 # Define variables
 spark_version=$1
-hadoop_version=$2
-hadoop_s3_version=$3
-aws_sdk_version=$4
-spark_xml_version=$5
+hadoop_s3_version=$2
+aws_sdk_version=$3
+spark_xml_version=$4
 
 # Set up OS libraries
 apt-get update
@@ -30,9 +29,6 @@ apt-get install -y openjdk-19-jdk-headless curl python3-pip maven
 pip3 install --upgrade pip && pip3 install pipenv
 
 # Download Spark jar and set up PySpark
-curl https://archive.apache.org/dist/spark/spark-"${spark_version}"/spark-"${spark_version}"-bin-hadoop"${hadoop_version}".tgz -o spark.tgz
-tar -xf spark.tgz && mv spark-"${spark_version}"-bin-hadoop"${hadoop_version}"/* "${SPARK_HOME}"/
-rm spark.tgz && rm -rf spark-"${spark_version}"-bin-hadoop"${hadoop_version}"
 pip3 install pyspark=="${spark_version}"
 
 # Add S3 jars
@@ -42,9 +38,6 @@ curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/"${aws_sdk
 
 # Add spark-xml jar
 curl https://repo1.maven.org/maven2/com/databricks/spark-xml_2.12/"${spark_xml_version}"/spark-xml_2.12-"${spark_xml_version}".jar -o "${SPARK_HOME}"/jars/spark-xml_2.12-"${spark_xml_version}".jar
-# Set up master IP address and executor memory
-cp "${SPARK_HOME}"/conf/spark-defaults.conf.template "${SPARK_HOME}"/conf/spark-defaults.conf
-
 # Install required libraries for GeoPandas on Apple chip mac
 apt-get install -y gdal-bin libgdal-dev
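
Note: with this change SPARK_HOME points at the pip-installed PySpark package rather than a standalone Spark distribution, so the S3 and spark-xml jars fetched by spark.sh now land inside that package's own jars/ directory. A minimal smoke test of that assumption inside the built container might look like the sketch below (the image tag sedona-jupyterlab is illustrative, not part of this patch):

    # Confirm SPARK_HOME matches where pip actually installed PySpark,
    # then check that the extra jars ended up under ${SPARK_HOME}/jars.
    # (Image tag is a placeholder; substitute whatever the build produces.)
    docker run --rm sedona-jupyterlab bash -c '
      pip_home=$(python3 -c "import pyspark, os; print(os.path.dirname(pyspark.__file__))")
      echo "SPARK_HOME : ${SPARK_HOME}"
      echo "pip install: ${pip_home}"
      test "${SPARK_HOME}" = "${pip_home}" || exit 1
      ls "${SPARK_HOME}/jars" | grep -E "aws-java-sdk|spark-xml"
    '

The hard-coded python3.10 segment in the new SPARK_HOME works because Ubuntu 22.04 ships Python 3.10 as its default python3; bumping the base image would require updating that path to match.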