diff --git a/.asf.yaml b/.asf.yaml
index 0b72df504c..6851f3e1f5 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -1,10 +1,10 @@
 notifications:
-    commits: commits@sedona.apache.org
-    issues_status: dev@sedona.apache.org
-    issues_comment: issues@sedona.apache.org
-    pullrequests_status: dev@sedona.apache.org
-    pullrequests_comment: issues@sedona.apache.org
-    jira_options: link label worklog
+  commits: commits@sedona.apache.org
+  issues_status: dev@sedona.apache.org
+  issues_comment: issues@sedona.apache.org
+  pullrequests_status: dev@sedona.apache.org
+  pullrequests_comment: issues@sedona.apache.org
+  jira_options: link label worklog
 github:
   description: "A cluster computing framework for processing large-scale geospatial data"
   homepage: https://sedona.apache.org/
diff --git a/.github/linters/.yaml-lint.yml b/.github/linters/.yaml-lint.yml
new file mode 100644
index 0000000000..117e78e6b3
--- /dev/null
+++ b/.github/linters/.yaml-lint.yml
@@ -0,0 +1,11 @@
+---
+# https://yamllint.readthedocs.io/en/stable/
+extends: default
+
+rules:
+  colons: disable
+  comments: disable
+  comments-indentation: disable
+  document-start: disable
+  line-length: disable
+  truthy: false
diff --git a/.github/linters/codespell.txt b/.github/linters/codespell.txt
index b828d8b767..a20350263c 100644
--- a/.github/linters/codespell.txt
+++ b/.github/linters/codespell.txt
@@ -1,7 +1,9 @@
 actualy
 afterall
+atmost
 bu
 celle
+checkin
 eiter
 errorprone
 fpt
diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index 4aa8108d41..b0c2bb413d 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -42,28 +42,28 @@ jobs:
         shell: bash

     steps:
-    - uses: actions/checkout@v4
-    - uses: actions/setup-java@v4
-      with:
-        distribution: 'zulu'
-        java-version: 11
-    - name: Cache Maven packages
-      uses: actions/cache@v3
-      with:
-        path: ~/.m2
-        key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
-        restore-keys: ${{ runner.os }}-m2
-    - name: Setup docker (missing on macOS)
-      if: runner.os == 'macos'
-      run: |
-        brew install docker
-        colima start
-        DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker}
-        mkdir -p $DOCKER_CONFIG/cli-plugins
-        curl -SL https://github.com/docker/buildx/releases/download/v0.14.1/buildx-v0.14.1.darwin-amd64 -o $DOCKER_CONFIG/cli-plugins/docker-buildx
-        chmod +x $DOCKER_CONFIG/cli-plugins/docker-buildx
-    - env:
-        SPARK_VERSION: ${{ matrix.spark }}
-        SEDONA_VERSION: ${{ matrix.sedona }}
-        GEOTOOLS_VERSION: ${{ matrix.geotools }}
-      run: ./docker/sedona-spark-jupyterlab/build.sh ${SPARK_VERSION} ${SEDONA_VERSION} local ${GEOTOOLS_VERSION}
+      - uses: actions/checkout@v4
+      - uses: actions/setup-java@v4
+        with:
+          distribution: 'zulu'
+          java-version: 11
+      - name: Cache Maven packages
+        uses: actions/cache@v3
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+          restore-keys: ${{ runner.os }}-m2
+      - name: Setup docker (missing on macOS)
+        if: runner.os == 'macos'
+        run: |
+          brew install docker
+          colima start
+          DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker}
+          mkdir -p $DOCKER_CONFIG/cli-plugins
+          curl -SL https://github.com/docker/buildx/releases/download/v0.14.1/buildx-v0.14.1.darwin-amd64 -o $DOCKER_CONFIG/cli-plugins/docker-buildx
+          chmod +x $DOCKER_CONFIG/cli-plugins/docker-buildx
+      - env:
+          SPARK_VERSION: ${{ matrix.spark }}
+          SEDONA_VERSION: ${{ matrix.sedona }}
+          GEOTOOLS_VERSION: ${{ matrix.geotools }}
+        run: ./docker/sedona-spark-jupyterlab/build.sh ${SPARK_VERSION} ${SEDONA_VERSION} local ${GEOTOOLS_VERSION}
diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml
index 9163040ef8..b312ef1bb2 100644
--- a/.github/workflows/java.yml
+++ b/.github/workflows/java.yml
@@ -83,39 +83,39 @@ jobs:
             skipTests: ''

     steps:
-    - uses: actions/checkout@v4
-    - uses: actions/setup-java@v4
-      with:
-        distribution: 'zulu'
-        java-version: ${{ matrix.jdk }}
-    - uses: actions/setup-python@v5
-      with:
-        python-version: '3.8'
-    - name: Cache Maven packages
-      uses: actions/cache@v3
-      with:
-        path: ~/.m2
-        key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
-        restore-keys: ${{ runner.os }}-m2
-    - run: sudo apt-get -y install python3-pip python-dev-is-python3
-    - env:
-        SPARK_VERSION: ${{ matrix.spark }}
-      run: (pip install pyspark==${SPARK_VERSION};pip install pandas==1.3.5;pip install shapely==1.8.5;pip install pyarrow==10.0.1)
-    - env:
-        SPARK_VERSION: ${{ matrix.spark }}
-        SCALA_VERSION: ${{ matrix.scala }}
-        SKIP_TESTS: ${{ matrix.skipTests }}
-      run: |
-        SPARK_COMPAT_VERSION="3.0"
-        if [ ${SPARK_VERSION:2:1} -gt "3" ]; then
-          SPARK_COMPAT_VERSION=${SPARK_VERSION:0:3}
-        fi
-        mvn -q clean install -Dspark=${SPARK_COMPAT_VERSION} -Dscala=${SCALA_VERSION:0:4} -Dspark.version=${SPARK_VERSION} ${SKIP_TESTS}
-    - run: mkdir staging
-    - run: cp spark-shaded/target/sedona-*.jar staging
-    - run: |
-        [ -d "flink-shaded/target/" ] && cp flink-shaded/target/sedona-*.jar staging 2>/dev/null || true
-    - uses: actions/upload-artifact@v3
-      with:
-        name: generated-jars ${{ matrix.spark }} ${{ matrix.scala }} ${{ matrix.jdk }}
-        path: staging
+      - uses: actions/checkout@v4
+      - uses: actions/setup-java@v4
+        with:
+          distribution: 'zulu'
+          java-version: ${{ matrix.jdk }}
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+      - name: Cache Maven packages
+        uses: actions/cache@v3
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+          restore-keys: ${{ runner.os }}-m2
+      - run: sudo apt-get -y install python3-pip python-dev-is-python3
+      - env:
+          SPARK_VERSION: ${{ matrix.spark }}
+        run: (pip install pyspark==${SPARK_VERSION};pip install pandas==1.3.5;pip install shapely==1.8.5;pip install pyarrow==10.0.1)
+      - env:
+          SPARK_VERSION: ${{ matrix.spark }}
+          SCALA_VERSION: ${{ matrix.scala }}
+          SKIP_TESTS: ${{ matrix.skipTests }}
+        run: |
+          SPARK_COMPAT_VERSION="3.0"
+          if [ ${SPARK_VERSION:2:1} -gt "3" ]; then
+            SPARK_COMPAT_VERSION=${SPARK_VERSION:0:3}
+          fi
+          mvn -q clean install -Dspark=${SPARK_COMPAT_VERSION} -Dscala=${SCALA_VERSION:0:4} -Dspark.version=${SPARK_VERSION} ${SKIP_TESTS}
+      - run: mkdir staging
+      - run: cp spark-shaded/target/sedona-*.jar staging
+      - run: |
+          [ -d "flink-shaded/target/" ] && cp flink-shaded/target/sedona-*.jar staging 2>/dev/null || true
+      - uses: actions/upload-artifact@v3
+        with:
+          name: generated-jars ${{ matrix.spark }} ${{ matrix.scala }} ${{ matrix.jdk }}
+          path: staging
diff --git a/.github/workflows/python-extension.yml b/.github/workflows/python-extension.yml
index 5536e8929e..f5e31fd8b6 100644
--- a/.github/workflows/python-extension.yml
+++ b/.github/workflows/python-extension.yml
@@ -38,40 +38,40 @@ jobs:
         shell: bash

     steps:
-    - uses: actions/checkout@v4
-    - uses: actions/setup-python@v5
-      with:
-        python-version: ${{ matrix.python }}
-    - name: Install pipenv
-      run: pip install -U pipenv
-    - name: Install dependencies
-      run: |
-        cd python
-        pipenv --python ${{ matrix.python }}
-        pipenv install --dev
-    - name: Build extension
-      run: |
-        cd python
-        pipenv run python setup.py build_ext --inplace
-    - name: Run tests
-      run: |
-        cd python
-        pipenv run pytest tests/utils/test_geomserde_speedup.py
-    - name: Run tests on Shapely 2.0
-      run: |
-        cd python
-        pipenv install shapely~=2.0
-        pipenv run pytest tests/utils/test_geomserde_speedup.py
-    - name: Run tests on Shapley 1.7
-      # Shapely 1.7 only provides wheels for cp36 ~ cp39, so we'll skip running
-      # this test for recent python versions.
-      if: ${{ matrix.python == '3.9' || matrix.python == '3.8' }}
-      run: |
-        cd python
-        pipenv install shapely~=1.7
-        pipenv run pytest tests/utils/test_geomserde_speedup.py
-    - name: Install from sdist
-      run: |
-        cd python
-        pipenv run python setup.py sdist
-        pipenv run python -m pip install dist/*sedona-*.tar.gz
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Install pipenv
+        run: pip install -U pipenv
+      - name: Install dependencies
+        run: |
+          cd python
+          pipenv --python ${{ matrix.python }}
+          pipenv install --dev
+      - name: Build extension
+        run: |
+          cd python
+          pipenv run python setup.py build_ext --inplace
+      - name: Run tests
+        run: |
+          cd python
+          pipenv run pytest tests/utils/test_geomserde_speedup.py
+      - name: Run tests on Shapely 2.0
+        run: |
+          cd python
+          pipenv install shapely~=2.0
+          pipenv run pytest tests/utils/test_geomserde_speedup.py
+      - name: Run tests on Shapley 1.7
+        # Shapely 1.7 only provides wheels for cp36 ~ cp39, so we'll skip running
+        # this test for recent python versions.
+        if: ${{ matrix.python == '3.9' || matrix.python == '3.8' }}
+        run: |
+          cd python
+          pipenv install shapely~=1.7
+          pipenv run pytest tests/utils/test_geomserde_speedup.py
+      - name: Install from sdist
+        run: |
+          cd python
+          pipenv run python setup.py sdist
+          pipenv run python -m pip install dist/*sedona-*.tar.gz
diff --git a/.github/workflows/python-wheel.yml b/.github/workflows/python-wheel.yml
index 00f0ac2e21..03ad49eb63 100644
--- a/.github/workflows/python-wheel.yml
+++ b/.github/workflows/python-wheel.yml
@@ -34,22 +34,22 @@ jobs:
         shell: bash

     steps:
-    - uses: actions/checkout@v4
-    - name: Set up QEMU
-      if: runner.os == 'Linux'
-      uses: docker/setup-qemu-action@v3
-      with:
-        platforms: all
-    - name: Build wheels
-      uses: pypa/cibuildwheel@v2.18.1
-      env:
-        CIBW_SKIP: 'pp* *musl*'
-        CIBW_ARCHS_LINUX: 'x86_64 aarch64'
-        CIBW_ARCHS_WINDOWS: 'AMD64 ARM64'
-        CIBW_ARCHS_MACOS: 'x86_64 arm64'
-      with:
-        package-dir: python
-    - uses: actions/upload-artifact@v3
-      with:
-        name: wheels
-        path: ./wheelhouse/*.whl
+      - uses: actions/checkout@v4
+      - name: Set up QEMU
+        if: runner.os == 'Linux'
+        uses: docker/setup-qemu-action@v3
+        with:
+          platforms: all
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.18.1
+        env:
+          CIBW_SKIP: 'pp* *musl*'
+          CIBW_ARCHS_LINUX: 'x86_64 aarch64'
+          CIBW_ARCHS_WINDOWS: 'AMD64 ARM64'
+          CIBW_ARCHS_MACOS: 'x86_64 arm64'
+        with:
+          package-dir: python
+      - uses: actions/upload-artifact@v3
+        with:
+          name: wheels
+          path: ./wheelhouse/*.whl
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 71aadaf8f6..7a89665cb4 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -95,66 +95,66 @@ jobs:
             hadoop: '2.7'

     steps:
-    - uses: actions/checkout@v4
-    - uses: actions/setup-java@v4
-      with:
-        distribution: 'zulu'
-        java-version: '8'
-    - uses: actions/setup-python@v5
-      with:
-        python-version: ${{ matrix.python }}
-    - name: Cache Maven packages
-      uses: actions/cache@v3
-      with:
-        path: ~/.m2
-        key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
-        restore-keys: ${{ runner.os }}-m2
-    - env:
-        SPARK_VERSION: ${{ matrix.spark }}
-        SCALA_VERSION: ${{ matrix.scala }}
-      run: |
-        SPARK_COMPAT_VERSION="3.0"
-        if [ ${SPARK_VERSION:2:1} -gt "3" ]; then
-          SPARK_COMPAT_VERSION=${SPARK_VERSION:0:3}
+      - uses: actions/checkout@v4
+      - uses: actions/setup-java@v4
+        with:
+          distribution: 'zulu'
+          java-version: '8'
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Cache Maven packages
+        uses: actions/cache@v3
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+          restore-keys: ${{ runner.os }}-m2
+      - env:
+          SPARK_VERSION: ${{ matrix.spark }}
+          SCALA_VERSION: ${{ matrix.scala }}
+        run: |
+          SPARK_COMPAT_VERSION="3.0"
+          if [ ${SPARK_VERSION:2:1} -gt "3" ]; then
+            SPARK_COMPAT_VERSION=${SPARK_VERSION:0:3}
+          fi
+          mvn -q clean install -DskipTests -Dspark=${SPARK_COMPAT_VERSION} -Dscala=${SCALA_VERSION:0:4} -Dgeotools
+      - env:
+          SPARK_VERSION: ${{ matrix.spark }}
+          HADOOP_VERSION: ${{ matrix.hadoop }}
+        run: |
+          wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
+          wget https://repo.osgeo.org/repository/release/javax/media/jai_core/${JAI_CORE_VERSION}/jai_core-${JAI_CORE_VERSION}.jar
+          wget https://repo.osgeo.org/repository/release/javax/media/jai_codec/${JAI_CODEC_VERSION}/jai_codec-${JAI_CODEC_VERSION}.jar
+          wget https://repo.osgeo.org/repository/release/javax/media/jai_imageio/${JAI_IMAGEIO_VERSION}/jai_imageio-${JAI_IMAGEIO_VERSION}.jar
+          tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
+          mv -v jai_core-${JAI_CORE_VERSION}.jar spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/jars/
+          mv -v jai_codec-${JAI_CODEC_VERSION}.jar spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/jars/
+          mv -v jai_imageio-${JAI_IMAGEIO_VERSION}.jar spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/jars/
+      - run: sudo apt-get -y install python3-pip python-dev-is-python3
+      - run: sudo pip3 install -U setuptools
+      - run: sudo pip3 install -U wheel
+      - run: sudo pip3 install -U virtualenvwrapper
+      - run: python3 -m pip install pipenv==2023.9.1
+      - run: cd python; python3 setup.py build_ext --inplace
+      - env:
+          SPARK_VERSION: ${{ matrix.spark }}
+          PYTHON_VERSION: ${{ matrix.python }}
+          SHAPELY_VERSION: ${{ matrix.shapely }}
+        run: |
+          cd python
+          if [ "${SHAPELY_VERSION}" == "1" ]; then
+            echo "Patching Pipfile to use Shapely 1.x"
+            sed -i 's/^shapely.*$/shapely="<2.0.0"/g' Pipfile
           fi
-        mvn -q clean install -DskipTests -Dspark=${SPARK_COMPAT_VERSION} -Dscala=${SCALA_VERSION:0:4} -Dgeotools
-    - env:
-        SPARK_VERSION: ${{ matrix.spark }}
-        HADOOP_VERSION: ${{ matrix.hadoop }}
-      run: |
-        wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
-        wget https://repo.osgeo.org/repository/release/javax/media/jai_core/${JAI_CORE_VERSION}/jai_core-${JAI_CORE_VERSION}.jar
-        wget https://repo.osgeo.org/repository/release/javax/media/jai_codec/${JAI_CODEC_VERSION}/jai_codec-${JAI_CODEC_VERSION}.jar
-        wget https://repo.osgeo.org/repository/release/javax/media/jai_imageio/${JAI_IMAGEIO_VERSION}/jai_imageio-${JAI_IMAGEIO_VERSION}.jar
-        tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
-        mv -v jai_core-${JAI_CORE_VERSION}.jar spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/jars/
-        mv -v jai_codec-${JAI_CODEC_VERSION}.jar spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/jars/
-        mv -v jai_imageio-${JAI_IMAGEIO_VERSION}.jar spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/jars/
-    - run: sudo apt-get -y install python3-pip python-dev-is-python3
-    - run: sudo pip3 install -U setuptools
-    - run: sudo pip3 install -U wheel
-    - run: sudo pip3 install -U virtualenvwrapper
-    - run: python3 -m pip install pipenv==2023.9.1
-    - run: cd python; python3 setup.py build_ext --inplace
-    - env:
-        SPARK_VERSION: ${{ matrix.spark }}
-        PYTHON_VERSION: ${{ matrix.python }}
-        SHAPELY_VERSION: ${{ matrix.shapely }}
-      run: |
-        cd python
-        if [ "${SHAPELY_VERSION}" == "1" ]; then
-          echo "Patching Pipfile to use Shapely 1.x"
-          sed -i 's/^shapely.*$/shapely="<2.0.0"/g' Pipfile
-        fi
-        pipenv --python ${PYTHON_VERSION}
-        pipenv install pyspark==${SPARK_VERSION}
-        pipenv install --dev
-        pipenv graph
-    - env:
-        SPARK_VERSION: ${{ matrix.spark }}
-        HADOOP_VERSION: ${{ matrix.hadoop }}
-      run: find spark-shaded/target -name sedona-*.jar -exec cp {} spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/jars/ \;
-    - env:
-        SPARK_VERSION: ${{ matrix.spark }}
-        HADOOP_VERSION: ${{ matrix.hadoop }}
-      run: (export SPARK_HOME=$PWD/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION};export PYTHONPATH=$SPARK_HOME/python;cd python;pipenv run pytest tests)
+          pipenv --python ${PYTHON_VERSION}
+          pipenv install pyspark==${SPARK_VERSION}
+          pipenv install --dev
+          pipenv graph
+      - env:
+          SPARK_VERSION: ${{ matrix.spark }}
+          HADOOP_VERSION: ${{ matrix.hadoop }}
+        run: find spark-shaded/target -name sedona-*.jar -exec cp {} spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/jars/ \;
+      - env:
+          SPARK_VERSION: ${{ matrix.spark }}
+          HADOOP_VERSION: ${{ matrix.hadoop }}
+        run: (export SPARK_HOME=$PWD/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION};export PYTHONPATH=$SPARK_HOME/python;cd python;pipenv run pytest tests)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index be10a1ba9e..30b28f19de 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
       - id: identity
       - id: check-hooks-apply
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         name: Run codespell
@@ -19,7 +19,7 @@
         args: [--ignore-words=.github/linters/codespell.txt]
         exclude: ^docs/image|^spark/common/src/test/resources|^docs/usecases|^tools/maven/scalafmt
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.4
+    rev: v0.4.10
     hooks:
       - id: ruff
        args: [--config=.github/linters/ruff.toml, --fix]
@@ -58,7 +58,7 @@
         args: [--markdown-linebreak-ext=md]
         exclude: ^docs-overrides/main\.html$|\.Rd$
   - repo: https://github.com/igorshubovych/markdownlint-cli
-    rev: v0.40.0
+    rev: v0.41.0
     hooks:
       - id: markdownlint
         name: Run markdownlint
@@ -67,3 +67,13 @@
         exclude: ^\.github/.*$
         types: [markdown]
         files: \.(md|mdown|markdown)$
+  - repo: https://github.com/adrienverge/yamllint
+    rev: v1.35.1
+    hooks:
+      - id: yamllint
+        name: Run yamllint
+        description: Check YAML files with yamllint
+        args: [--strict, -c=.github/linters/.yaml-lint.yml]
+        exclude: ^mkdocs\.yml$
+        types: [yaml]
+        files: \.ya?ml$
diff --git a/R/_pkgdown.yml b/R/_pkgdown.yml
index 74ce0d0e6a..4280bf60d2 100644
--- a/R/_pkgdown.yml
+++ b/R/_pkgdown.yml
@@ -25,43 +25,43 @@ home:
   [Homepage](https://sedona.apache.org/)

 reference:
-- title: "Reading and Writing Spatial DataFrames"
-  desc: "Functions for reading and writing Spark DataFrames."
-  contents:
-  - starts_with("spark_read")
-  - starts_with("spark_write")
-  - sedona_save_spatial_rdd
-- title: "Reading and Writing Spatial RDDs"
-  desc: "Functions for reading and writing Spatial RDDs."
-  contents:
-  - starts_with("sedona_read")
-  - starts_with("sedona_write")
-- title: "Conversion"
-  desc: "Functions to convert between Spark DataFrames and Spatial RDDs."
-  contents:
-  - contains("sdf_register")
-  - to_spatial_rdd
-- title: "RDD functions"
-- subtitle: "Visualization"
-  contents:
-  - starts_with("sedona_visualization_routines")
-  - starts_with("sedona_render")
-- subtitle: "Joins"
-  contents:
-  - sedona_spatial_join
-  - sedona_spatial_join_count_by_key
-- subtitle: "Query"
-  contents:
-  - sedona_knn_query
-  - sedona_range_query
-- subtitle: "Others"
-  contents:
-  - sedona_apply_spatial_partitioner
-  - sedona_build_index
-  - approx_count
-  - crs_transform
-  - minimum_bounding_box
-  - new_bounding_box
+  - title: "Reading and Writing Spatial DataFrames"
+    desc: "Functions for reading and writing Spark DataFrames."
+    contents:
+      - starts_with("spark_read")
+      - starts_with("spark_write")
+      - sedona_save_spatial_rdd
+  - title: "Reading and Writing Spatial RDDs"
+    desc: "Functions for reading and writing Spatial RDDs."
+    contents:
+      - starts_with("sedona_read")
+      - starts_with("sedona_write")
+  - title: "Conversion"
+    desc: "Functions to convert between Spark DataFrames and Spatial RDDs."
+    contents:
+      - contains("sdf_register")
+      - to_spatial_rdd
+  - title: "RDD functions"
+  - subtitle: "Visualization"
+    contents:
+      - starts_with("sedona_visualization_routines")
+      - starts_with("sedona_render")
+  - subtitle: "Joins"
+    contents:
+      - sedona_spatial_join
+      - sedona_spatial_join_count_by_key
+  - subtitle: "Query"
+    contents:
+      - sedona_knn_query
+      - sedona_range_query
+  - subtitle: "Others"
+    contents:
+      - sedona_apply_spatial_partitioner
+      - sedona_build_index
+      - approx_count
+      - crs_transform
+      - minimum_bounding_box
+      - new_bounding_box
 repo:
   url:
     home: https://github.com/apache/sedona/
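
A quick way to exercise the yamllint integration added above — a minimal sketch, assuming pre-commit and yamllint are installed locally; the hook id and config path are taken from the diff itself:

    # Run only the newly added yamllint hook against every file in the repo
    pre-commit run yamllint --all-files

    # Or invoke yamllint directly with the same config and strict mode
    yamllint --strict -c .github/linters/.yaml-lint.yml .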