diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 1fe07741..79f0dfb8 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -83,7 +83,7 @@ updates: groups: preset: patterns: - - "*requirements.txt" + - "*" package-ecosystem: pip schedule: interval: weekly @@ -95,3 +95,11 @@ updates: package-ecosystem: pip schedule: interval: weekly + - directory: enterprise/redhat/openshift-ai/gaudi/docker + groups: + gaudi-openshift: + patterns: + - "*" + package-ecosystem: pip + schedule: + interval: weekly diff --git a/.github/release/v0.4.0.json b/.github/release/v0.4.0.json new file mode 100644 index 00000000..671769e9 --- /dev/null +++ b/.github/release/v0.4.0.json @@ -0,0 +1,230 @@ +[ + { + "base": "ubuntu:22.04", + "dockerfile": "python/Dockerfile", + "repo": "intel/python", + "tag": "3.10-core" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "python/Dockerfile", + "repo": "intel/python", + "tag": "3.10-full" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "classical-ml/Dockerfile", + "repo": "intel/intel-optimized-ml", + "tag": "2024.6.0-pip-base" + }, + { + "base": "intel/intel-optimized-ml:2024.6.0-pip-base", + "dockerfile": "classical-ml/Dockerfile", + "repo": "intel/intel-optimized-ml", + "tag": "2024.6.0-pip-jupyter" + }, + { + "base": "intel/python:3.10-core", + "dockerfile": "classical-ml/Dockerfile", + "repo": "intel/intel-optimized-ml", + "tag": "2024.6.0-idp-base" + }, + { + "base": "intel/intel-optimized-ml:2024.6.0-idp-base", + "dockerfile": "classical-ml/Dockerfile", + "repo": "intel/intel-optimized-ml", + "tag": "2024.6.0-idp-jupyter" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "tensorflow/Dockerfile", + "repo": "intel/intel-optimized-tensorflow", + "tag": "2.15.0.1-xpu-pip-base" + }, + { + "base": "intel/intel-optimized-tensorflow:2.15.0.1-xpu-pip-base", + "dockerfile": "tensorflow/Dockerfile", + "repo": "intel/intel-optimized-tensorflow", + "tag": "2.15.0.1-xpu-pip-jupyter" + }, + { + "base": 
"intel/python:3.10-core", + "dockerfile": "tensorflow/Dockerfile", + "repo": "intel/intel-optimized-tensorflow", + "tag": "2.15.0.1-xpu-idp-base" + }, + { + "base": "intel/intel-optimized-tensorflow:2.15.0.1-xpu-idp-base", + "dockerfile": "tensorflow/Dockerfile", + "repo": "intel/intel-optimized-tensorflow", + "tag": "2.15.0.1-xpu-idp-jupyter" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "tensorflow/Dockerfile", + "repo": "intel/intel-optimized-tensorflow", + "tag": "2.15.1-pip-base" + }, + { + "base": "intel/intel-optimized-tensorflow:2.15.1-pip-base", + "dockerfile": "tensorflow/Dockerfile", + "repo": "intel/intel-optimized-tensorflow", + "tag": "2.15.1-pip-jupyter" + }, + { + "base": "intel/intel-optimized-tensorflow:2.15.1-pip-base", + "dockerfile": "tensorflow/Dockerfile", + "repo": "intel/intel-optimized-tensorflow", + "tag": "2.15.1-pip-multinode" + }, + { + "base": "intel/python:3.10-core", + "dockerfile": "tensorflow/Dockerfile", + "repo": "intel/intel-optimized-tensorflow", + "tag": "2.15.0-idp-base" + }, + { + "base": "intel/intel-optimized-tensorflow:2.15.0-idp-base", + "dockerfile": "tensorflow/Dockerfile", + "repo": "intel/intel-optimized-tensorflow", + "tag": "2.15.0-idp-jupyter" + }, + { + "base": "intel/intel-optimized-tensorflow:2.15.0-idp-base", + "dockerfile": "tensorflow/Dockerfile", + "repo": "intel/intel-optimized-tensorflow", + "tag": "2.15.0-idp-multinode" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.1.40-xpu-pip-base" + }, + { + "base": "intel/intel-optimized-pytorch:2.1.40-xpu-pip-base", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.1.40-xpu-pip-jupyter" + }, + { + "base": "intel/python:3.10-core", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.1.40-xpu-idp-base" + }, + { + "base": "intel/intel-optimized-pytorch:2.1.40-xpu-idp-base", + "dockerfile": 
"pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.1.40-xpu-idp-jupyter" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.4.0-pip-base" + }, + { + "base": "intel/intel-optimized-pytorch:2.4.0-pip-base", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.4.0-pip-jupyter" + }, + { + "base": "intel/intel-optimized-pytorch:2.4.0-pip-base", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.4.0-pip-multinode" + }, + { + "base": "intel/intel-optimized-pytorch:2.4.0-pip-multinode", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.4.0-pip-hf-4.44.0-genai" + }, + { + "base": "intel/python:3.10-core", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.4.0-idp-base" + }, + { + "base": "intel/intel-optimized-pytorch:2.4.0-idp-base", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.4.0-idp-jupyter" + }, + { + "base": "intel/intel-optimized-pytorch:2.4.0-idp-base", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.4.0-idp-multinode" + }, + { + "base": "intel/intel-optimized-pytorch:2.4.0-idp-multinode", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.4.0-idp-hf-4.44.0-genai" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.4.0-serving-cpu" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "pytorch/Dockerfile", + "repo": "intel/intel-optimized-pytorch", + "tag": "2.4.0-serving-xpu" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "preset/classical-ml/Dockerfile", + "repo": "intel/classical-ml", + "tag": "latest-py3.9" + }, + { + "base": "ubuntu:22.04", + "dockerfile": 
"preset/classical-ml/Dockerfile", + "repo": "intel/classical-ml", + "tag": "latest-py3.10" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "preset/data-analytics/Dockerfile", + "repo": "intel/data-analytics", + "tag": "latest-py3.9" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "preset/data-analytics/Dockerfile", + "repo": "intel/data-analytics", + "tag": "latest-py3.10" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "preset/deep-learning/Dockerfile", + "repo": "intel/deep-learning", + "tag": "latest-py3.9" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "preset/deep-learning/Dockerfile", + "repo": "intel/deep-learning", + "tag": "latest-py3.10" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "preset/inference-optimization/Dockerfile", + "repo": "intel/inference-optimization", + "tag": "latest-py3.9" + }, + { + "base": "ubuntu:22.04", + "dockerfile": "preset/inference-optimization/Dockerfile", + "repo": "intel/inference-optimization", + "tag": "latest-py3.10" + } +] diff --git a/.github/workflows/apptainer-ci.yaml b/.github/workflows/apptainer-ci.yaml index 62083391..330dcc0c 100644 --- a/.github/workflows/apptainer-ci.yaml +++ b/.github/workflows/apptainer-ci.yaml @@ -72,7 +72,7 @@ jobs: - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - uses: eWaterCycle/setup-apptainer@4bb22c52d4f63406c49e94c804632975787312b3 # v2.0.0 with: - apptainer-version: 1.3.3 + apptainer-version: 1.3.4 - name: Apptainer login to registry env: APPTAINER_DOCKER_USERNAME: ${{ secrets.REGISTRY_USER }} diff --git a/.github/workflows/chart-ci.yaml b/.github/workflows/chart-ci.yaml index 24b5404e..6f698aa6 100644 --- a/.github/workflows/chart-ci.yaml +++ b/.github/workflows/chart-ci.yaml @@ -26,7 +26,7 @@ jobs: runs-on: kubectl steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: 
audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 diff --git a/.github/workflows/container-ci.yaml b/.github/workflows/container-ci.yaml index 08669ffa..0c8c8f94 100644 --- a/.github/workflows/container-ci.yaml +++ b/.github/workflows/container-ci.yaml @@ -63,10 +63,10 @@ jobs: setup-build: outputs: matrix: ${{ steps.build-matrix.outputs.matrix }} - runs-on: ubuntu-latest # ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} + runs-on: ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -79,13 +79,14 @@ jobs: build-containers: needs: [setup-build] env: ${{ matrix }} - runs-on: ubuntu-latest # ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} + runs-on: ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} strategy: matrix: ${{ fromJson(needs.setup-build.outputs.matrix) }} fail-fast: false outputs: group: ${{ steps.build-group.outputs.container-group }} steps: + - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 if: ${{ !inputs.no_build }} - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 @@ -111,12 +112,12 @@ jobs: setup-scan: needs: [build-containers] if: ${{ github.event_name == 'pull_request' }} - runs-on: ubuntu-latest # ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} + runs-on: ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} outputs: matrix: ${{ steps.scan-matrix.outputs.matrix }} steps: 
- name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 @@ -135,7 +136,7 @@ jobs: fail-fast: false steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -154,7 +155,7 @@ jobs: - name: Cleanup if: always() run: docker rmi -f ${{ secrets.REGISTRY }}/${{ secrets.REPO }}:${{ matrix.container }} - - uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a # v3.25.15 + - uses: github/codeql-action/upload-sarif@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 with: sarif_file: '${{ matrix.container }}-scan.sarif' category: '${{ matrix.container }}' @@ -164,12 +165,12 @@ jobs: #################################################################################################### setup-test: needs: [build-containers] - runs-on: ubuntu-latest # ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} + runs-on: ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} outputs: matrix: ${{ steps.test-matrix.outputs.matrix }} steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -186,7 +187,7 @@ jobs: experimental: [true] fail-fast: false steps: - - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + - uses: 
step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 diff --git a/.github/workflows/dependency-review.yaml b/.github/workflows/dependency-review.yaml index 9feca423..635a8176 100644 --- a/.github/workflows/dependency-review.yaml +++ b/.github/workflows/dependency-review.yaml @@ -34,7 +34,7 @@ jobs: pull-requests: write steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 diff --git a/.github/workflows/dockerhub-description.yml b/.github/workflows/dockerhub-description.yml index f3bbd9bf..1dbd23b9 100644 --- a/.github/workflows/dockerhub-description.yml +++ b/.github/workflows/dockerhub-description.yml @@ -24,7 +24,7 @@ jobs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -39,7 +39,7 @@ jobs: fail-fast: false steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 900afb5f..e51dddbb 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -32,11 +32,11 @@ jobs: pages: write steps: - name: Harden Runner - uses: 
step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 cache: pip diff --git a/.github/workflows/integration-test.yaml b/.github/workflows/integration-test.yaml index af6f4cc2..10bc3879 100644 --- a/.github/workflows/integration-test.yaml +++ b/.github/workflows/integration-test.yaml @@ -26,7 +26,7 @@ jobs: groups: ${{ steps.group-list.outputs.FOLDERS }} steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -113,12 +113,12 @@ jobs: path: output.txt recreate: true status-check: - needs: [group-diff, pipeline-ci] + needs: [group-diff, pipeline-ci, merge-logs] runs-on: ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} if: always() steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - run: exit 1 diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 0d170a62..057aab1e 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -31,7 +31,7 @@ jobs: statuses: write steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: 
egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 diff --git a/.github/workflows/scorecard.yaml b/.github/workflows/scorecard.yaml index 72abc0f8..99ada495 100644 --- a/.github/workflows/scorecard.yaml +++ b/.github/workflows/scorecard.yaml @@ -36,7 +36,7 @@ jobs: actions: read steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -48,11 +48,11 @@ jobs: results_format: sarif repo_token: ${{ secrets.GITHUB_TOKEN }} publish_results: true - - uses: actions/upload-artifact@89ef406dd8d7e03cfd12d9e0a4a378f454709029 # v4.3.5 + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: SARIF file path: results.sarif retention-days: 5 - - uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a # v3.25.15 + - uses: github/codeql-action/upload-sarif@8214744c546c1e5c8f03dde8fab3a7353211988d # v3.26.7 with: sarif_file: results.sarif diff --git a/.github/workflows/security-report.yaml b/.github/workflows/security-report.yaml index f1ccde65..2aaa7655 100644 --- a/.github/workflows/security-report.yaml +++ b/.github/workflows/security-report.yaml @@ -27,7 +27,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: rsdmike/github-security-report-action@a149b24539044c92786ec39af8ba38c93496495d # v3.0.4 @@ -35,7 +35,7 @@ jobs: sarifReportDir: ${{ github.workspace }} template: report token: ${{ secrets.GITHUB_TOKEN }} - - uses: actions/upload-artifact@89ef406dd8d7e03cfd12d9e0a4a378f454709029 # v4.3.5 + - uses: 
actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: Security Report Summary path: ./*.pdf diff --git a/.github/workflows/test-runner-ci.yaml b/.github/workflows/test-runner-ci.yaml index 6ef0e617..e9f4bb88 100644 --- a/.github/workflows/test-runner-ci.yaml +++ b/.github/workflows/test-runner-ci.yaml @@ -33,7 +33,7 @@ jobs: fail-fast: true steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -45,7 +45,7 @@ jobs: registry: ${{ secrets.REGISTRY }} username: ${{ secrets.REGISTRY_USER }} password: ${{ secrets.REGISTRY_TOKEN }} - - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python }} - name: Install requirements @@ -66,7 +66,7 @@ jobs: runs-on: ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: coverallsapp/github-action@643bc377ffa44ace6394b2b5d0d3950076de9f63 # v2.3.0 @@ -76,7 +76,7 @@ jobs: runs-on: ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -88,7 +88,7 @@ jobs: registry: ${{ secrets.REGISTRY }} username: ${{ secrets.REGISTRY_USER 
}} password: ${{ secrets.REGISTRY_TOKEN }} - - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: "3.8" - name: Test Container Group diff --git a/.github/workflows/weekly-test.yaml b/.github/workflows/weekly-test.yaml index 41c8a1df..edbfb7a8 100644 --- a/.github/workflows/weekly-test.yaml +++ b/.github/workflows/weekly-test.yaml @@ -25,7 +25,7 @@ jobs: groups: ${{ steps.group-list.outputs.FOLDERS }} steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 @@ -52,27 +52,27 @@ jobs: group_dir: ${{ matrix.group }} ref: main secrets: inherit - helm-ci: - runs-on: kubectl - steps: - - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 - with: - egress-policy: audit - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - with: - fetch-depth: 0 - - uses: intel/ai-containers/workflows/charts@main - with: - config: '--all --namespace helm-ci' - list_changed: false - kubeconfig_path: ${{ secrets.KUBECONFIG_PATH }} + # helm-ci: + # runs-on: kubectl + # steps: + # - name: Harden Runner + # uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 + # with: + # egress-policy: audit + # - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + # with: + # fetch-depth: 0 + # - uses: intel/ai-containers/workflows/charts@main + # with: + # config: '--all --namespace helm-ci' + # list_changed: false + # kubeconfig_path: ${{ secrets.KUBECONFIG_PATH }} scan: name: gitleaks runs-on: ${{ github.repository_owner == 'intel' && 'intel-ubuntu-latest' || 'ubuntu-latest' }} steps: - name: Harden 
Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 diff --git a/.gitignore b/.gitignore index f3e1d08f..d6f27a92 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ logs/ models-perf/ output/ site +test-runner-summary-output.json venv/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3b972cfa..fc2fc4a1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - hooks: - id: gitleaks repo: https://github.com/gitleaks/gitleaks - rev: v8.18.4 + rev: v8.19.2 - hooks: - args: [--license-filepath=.github/license_template.txt, --use-current-year, --detect-license-in-X-top-lines=40, --skip-license-insertion-comment=Copyright] files: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f431ef44..9d61a8ac 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ # Contributing -Thank you for considering contributing to Intel® AI Containers! We welcome your help to make this project better. Contributing to an open source project can be a daunting task, but the Intel AI Containers team is here to help you through the process. If at any point in this process you feel out of your depth or confused by our processes, please don't hesitate to reach out to a maintainer or file an [issue](https://github.com/intel/ai-containers/issues). +Thank you for considering contributing to AI Containers! We welcome your help to make this project better. Contributing to an open source project can be a daunting task, but the Intel AI Containers team is here to help you through the process. If at any point in this process you feel out of your depth or confused by our processes, please don't hesitate to reach out to a maintainer or file an [issue](https://github.com/intel/ai-containers/issues). 
## Getting Started @@ -138,4 +138,4 @@ commit automatically with `git commit -s`. ## License -Intel® AI Containers is licensed under the terms in [LICENSE](./LICENSE). By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. +AI Containers is licensed under the terms in [LICENSE](./LICENSE). By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. diff --git a/README.md b/README.md index 23705123..a152105c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Intel® AI Containers +# AI Containers [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/8270/badge)](https://www.bestpractices.dev/projects/8270) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/intel/ai-containers/badge)](https://securityscorecards.dev/viewer/?uri=github.com/intel/ai-containers) @@ -28,7 +28,7 @@ docker login $REGISTRY docker pull $REGISTRY/$REPO:latest ``` -The maintainers of Intel® AI Containers use Azure to store containers, but an open source container registry like [harbor](https://github.com/goharbor/harbor) is preferred. +The maintainers of AI Containers use Azure to store containers, but an open source container registry like [harbor](https://github.com/goharbor/harbor) is preferred. > [!WARNING] > You can optionally skip this step and use some placeholder values, however some container groups depend on other images and will pull from a registry that you have not defined and result in an error. 
diff --git a/apptainer/python/requirements.txt b/apptainer/python/requirements.txt index 9dede726..a18c568e 100644 --- a/apptainer/python/requirements.txt +++ b/apptainer/python/requirements.txt @@ -1,6 +1,6 @@ -numpy==2.0.1 -setuptools==72.1.0 +numpy==2.1.1 +setuptools==75.1.0 psutil==6.0.0 -mkl==2024.2.0 -mkl-include==2024.2.0 -intel-openmp==2024.2.0 +mkl==2024.2.1 +mkl-include==2024.2.1 +intel-openmp==2024.2.1 diff --git a/classical-ml/.actions.json b/classical-ml/.actions.json index 36e21ad8..e7e793d3 100644 --- a/classical-ml/.actions.json +++ b/classical-ml/.actions.json @@ -1,5 +1,5 @@ { "PACKAGE_OPTION": ["idp", "pip"], "experimental": [true], - "runner_label": ["PVC"] + "runner_label": ["clx"] } diff --git a/classical-ml/README.md b/classical-ml/README.md index 06cfa613..97cc50c2 100644 --- a/classical-ml/README.md +++ b/classical-ml/README.md @@ -10,7 +10,8 @@ The images below include [Intel® Extension for Scikit-learn*] and [XGBoost*]. | Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile | | ------------------------------------------------- | -------------- | ------------ | -------- | --------------- | -| `2024.5.0-pip-base`, `latest` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | +| `2024.6.0-pip-base`, `latest` | [v2024.6.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | +| `2024.5.0-pip-base` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | | `2024.3.0-pip-base` | [v2024.3.0] | [v1.4.2] | [v2.0.3] | [v0.4.0-Beta] | | `2024.2.0-xgboost-2.0.3-pip-base` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] | | `scikit-learning-2024.0.0-xgboost-2.0.2-pip-base` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] | @@ -19,6 +20,7 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile | | ---------------------------------------------------- | -------------- | ------------ | -------- | --------------- | +| `2024.6.0-pip-jupyter` | [v2024.6.0] | [v1.5.1] | [v2.1.1] | [v0.4.0] 
| | `2024.5.0-pip-jupyter` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | | `2024.3.0-pip-jupyter` | [v2024.3.0] | [v1.4.2] | [v2.0.3] | [v0.4.0-Beta] | | `2024.2.0-xgboost-2.0.3-pip-jupyter` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] | @@ -43,7 +45,9 @@ The images below include [Intel® Distribution for Python*]: | Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile | | ------------------------------------------------- | -------------- | ------------ | -------- | --------------- | -| `2024.3.0-idp-base` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | +| `2024.6.0-idp-base` | [v2024.6.0] | [v1.5.1] | [v2.1.1] | [v0.4.0] | +| `2024.5.0-idp-base` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | +| `2024.3.0-idp-base` | [v2024.3.0] | [v1.4.1] | [v2.1.0] | [v0.4.0] | | `2024.2.0-xgboost-2.0.3-idp-base` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] | | `scikit-learning-2024.0.0-xgboost-2.0.2-idp-base` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] | @@ -51,13 +55,15 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile | | ---------------------------------------------------- | -------------- | ------------ | -------- | --------------- | -| `2024.3.0-idp-jupyter` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | +| `2024.6.0-idp-jupyter` | [v2024.6.0] | [v1.5.1] | [v2.1.1] | [v0.4.0] | +| `2024.5.0-idp-jupyter` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | +| `2024.3.0-idp-jupyter` | [v2024.3.0] | [v1.4.0] | [v2.1.0] | [v0.4.0] | | `2024.2.0-xgboost-2.0.3-idp-jupyter` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] | | `scikit-learning-2024.0.0-xgboost-2.0.2-idp-jupyter` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] | ## Build from Source -To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command: 
+To build the images from source, clone the [AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command: ```bash cd classical-ml @@ -89,16 +95,19 @@ It is the image user's responsibility to ensure that any use of The images below [Scikit-learn*]: https://scikit-learn.org/stable/ [XGBoost*]: https://github.com/dmlc/xgboost +[v2024.6.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.6.0 [v2024.5.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.5.0 [v2024.3.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.3.0 [v2024.2.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.2.0 [v2024.0.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.0.0 +[v1.5.1]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.5.1 [v1.5.0]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.5.0 [v1.4.2]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.4.2 [v1.4.1]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.4.1 [v1.3.2]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.3.2 +[v2.1.1]: https://github.com/dmlc/xgboost/releases/tag/v2.1.1 [v2.1.0]: https://github.com/dmlc/xgboost/releases/tag/v2.1.0 [v2.0.3]: https://github.com/dmlc/xgboost/releases/tag/v2.0.3 [v2.0.2]: https://github.com/dmlc/xgboost/releases/tag/v2.0.2 diff --git a/classical-ml/docker-compose.yaml b/classical-ml/docker-compose.yaml index 0a775bdc..491005de 100644 --- a/classical-ml/docker-compose.yaml +++ b/classical-ml/docker-compose.yaml @@ -40,21 +40,21 @@ services: org.opencontainers.base.name: "intel/python:3.10-core" org.opencontainers.image.name: "intel/intel-optimized-ml" org.opencontainers.image.title: "Intel® Optimized ML Base Image" - org.opencontainers.image.version: ${SKLEARN_VERSION:-2024.4.0}-${PACKAGE_OPTION:-pip}-base + org.opencontainers.image.version: 
${SKLEARN_VERSION:-2024.6.0}-${PACKAGE_OPTION:-pip}-base target: ml-base-${PACKAGE_OPTION:-pip} command: > bash -c "python -c 'import sklearnex, sklearn; import xgboost as xgb; print(\"Scikit version:\", sklearn.__version__, \"\\nXGBoost version:\", xgb.__version__)'" depends_on: - ${PACKAGE_OPTION:-pip} - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-scikit-learn-${SCIKIT_VERSION:-2024.5.0}-xgboost-${XGBOOST_VERSION:-2.1.0}-base + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-scikit-learn-${SKLEARN_VERSION:-2024.6.0}-xgboost-${XGBOOST_VERSION:-2.1.0}-base pull_policy: always jupyter: build: labels: dependency.python.pip: jupyter-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-ml:${SKLEARN_VERSION:-2024.4.0}-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-ml:${SKLEARN_VERSION:-2024.6.0}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Optimized ML Jupyter Base Image" - org.opencontainers.image.version: ${SKLEARN_VERSION:-2024.4.0}-${PACKAGE_OPTION:-pip}-jupyter + org.opencontainers.image.version: ${SKLEARN_VERSION:-2024.6.0}-${PACKAGE_OPTION:-pip}-jupyter target: jupyter command: > bash -c "python -m jupyter --version" @@ -62,5 +62,5 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} extends: ml-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-scikit-learn-${SCIKIT_VERSION:-2024.5.0}-xgboost-${XGBOOST_VERSION:-2.1.0}-jupyter + image: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-scikit-learn-${SKLEARN_VERSION:-2024.6.0}-xgboost-${XGBOOST_VERSION:-2.1.0}-jupyter network_mode: host diff --git a/classical-ml/jupyter-requirements.txt b/classical-ml/jupyter-requirements.txt index 2cae0f91..d98ce88b 100644 --- a/classical-ml/jupyter-requirements.txt +++ b/classical-ml/jupyter-requirements.txt @@ -1,4 +1,4 @@ -jupyterlab==4.2.4 +jupyterlab==4.2.5 jupyterhub==5.1.0 -notebook==7.2.1 +notebook==7.2.2 jupyter-server-proxy>=4.1.2 diff --git a/classical-ml/requirements.txt b/classical-ml/requirements.txt index 484856df..b7ff293b 100644 --- a/classical-ml/requirements.txt +++ b/classical-ml/requirements.txt @@ -1,5 +1,5 @@ daal4py==2024.6.0 -matplotlib==3.9.1.post1 +matplotlib==3.9.2 numpy==1.26.4 scikit-learn-intelex==2024.6.0 threadpoolctl==3.5.0 diff --git a/classical-ml/tests/tests.yaml b/classical-ml/tests/tests.yaml index 0016987b..197dd285 100644 --- a/classical-ml/tests/tests.yaml +++ b/classical-ml/tests/tests.yaml @@ -14,13 +14,13 @@ --- classical-ml-import-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-scikit-learn-${SCIKIT_VERSION:-2024.5.0}-xgboost-${XGBOOST_VERSION:-2.1.0}-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-scikit-learn-${SKLEARN_VERSION:-2024.6.0}-xgboost-${XGBOOST_VERSION:-2.1.0}-base cmd: python -c "from sklearnex import patch_sklearn; patch_sklearn();import xgboost as xgb; print(xgb.__version__)" classical-ml-import-${PACKAGE_OPTION:-pip}-jupyter: - img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-scikit-learn-${SCIKIT_VERSION:-2024.5.0}-xgboost-${XGBOOST_VERSION:-2.1.0}-jupyter + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-scikit-learn-${SKLEARN_VERSION:-2024.6.0}-xgboost-${XGBOOST_VERSION:-2.1.0}-jupyter cmd: sh -c "python -m jupyter --version" classical-ml-performance-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-scikit-learn-${SCIKIT_VERSION:-2024.5.0}-xgboost-${XGBOOST_VERSION:-2.1.0}-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-scikit-learn-${SKLEARN_VERSION:-2024.6.0}-xgboost-${XGBOOST_VERSION:-2.1.0}-base cmd: python /tests/performance.py volumes: - src: $PWD/classical-ml/tests diff --git a/docs/requirements.txt b/docs/requirements.txt index bbcf99a8..33207ff8 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,9 +1,9 @@ mkdocs-callouts>=1.13.2 mkdocs-git-authors-plugin>=0.8.0 mkdocs-git-revision-date-localized-plugin>=1.2.5 -mkdocs-material==9.5.31 +mkdocs-material==9.5.34 mkdocs-table-reader-plugin>=2.1.0 -mkdocs==1.6.0 +mkdocs==1.6.1 pandas>=2.0.3 pymdown-extensions>=10.8.1 python_on_whales>=0.71.0 diff --git a/docs/roadmap.md b/docs/roadmap.md index 8e22e7c8..018808b6 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -10,7 +10,7 @@ - Granite Rapids Support - CLS Support -- Intel Developer Cloud Support +- Intel Tiber Developer Cloud Support - AI Tools 2024.3/2025.0 Support ## Q4'24 diff --git a/docs/scripts/hook.py b/docs/scripts/hook.py index 3b862bdf..2f0c96ec 100644 --- a/docs/scripts/hook.py +++ b/docs/scripts/hook.py @@ -34,6 
+34,7 @@ def create_support_matrix(): compose_to_csv("pytorch", "serving") compose_to_csv("tensorflow", None) compose_to_csv("classical-ml", None) + compose_to_csv("jax", None) # get_repo(models) compose_to_csv("preset/data-analytics", "data_analytics") diff --git a/docs/scripts/readmes.py b/docs/scripts/readmes.py index 3e7d5e09..8eb2553b 100644 --- a/docs/scripts/readmes.py +++ b/docs/scripts/readmes.py @@ -17,6 +17,7 @@ readmes = [ "classical-ml/README.md", + "jax/README.md", "preset/README.md", "python/README.md", "pytorch/README.md", diff --git a/enterprise/redhat/openshift-ai/gaudi/README.md b/enterprise/redhat/openshift-ai/gaudi/README.md new file mode 100644 index 00000000..a5682adf --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/README.md @@ -0,0 +1,68 @@ +# Intel® Gaudi AI Software Tools Containers on OpenShift AI + +Intel® Gaudi AI Software Tools for OpenShift AI(RedHat OpenShift Data Science/RHODS) is a suite of containers that enables the AI practitioners to utilize Intel® Gaudi accelerator for AI workflows on OpenShift platforms. You can access these containers using the RHODS Jupyter dashboard. More details about each container is described in the table below. 
+ +## Gaudi Notebook Containers + +| Notebook Container Name | Tools | Image Name | +| -----------------------------| ------------- | ------------- | +| Intel Gaudi Notebook Container | [Intel® Gaudi Software Stack*](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html), [Intel® Gaudi PyTorch](https://docs.habana.ai/en/latest/PyTorch/index.html), [Intel® Gaudi vLLM](https://github.com/HabanaAI/vllm-fork.git), [Intel® Gaudi DeepSpeed](https://github.com/HabanaAI/DeepSpeed) | [`registry.connect.redhat.com/intel/gaudi-notebooks:1.17.0-495-rhel-9.2`](registry.connect.redhat.com/intel/gaudi-notebooks@sha256:a62baf968caa7dd23b7f4cdcddc26e109d894f1436e247b4ea1e2fb4a5c94d54) | + +## Run Gaudi Notebook Containers + +You can access the Intel® Gaudi AI SW Tools containers from OpenShift* AI dashboard. + +### Prerequisite + +1. Make sure you have access to [OpenShift* Container Platform](https://docs.openshift.com/container-platform/4.14/installing/index.html) and [OpenShift* AI operator](https://docs.redhat.com/en/documentation/red_hat_openshift_ai_cloud_service/1/html/installing_and_uninstalling_openshift_ai_cloud_service/installing-and-deploying-openshift-ai_install#installing-and-deploying-openshift-ai_install) is installed if you want to access the containers from OpenShift* AI dashboard. + +2. To utilize the Intel® Gaudi accelerator with the notebook please, install the Intel® Gaudi Base Operator for OpenShift([instructions](https://catalog.redhat.com/software/container-stacks/detail/6683b2cce45daa25e36bddcb)) and the accelerate profile using the following command on your machine connected to the OCP cluster. You need to be logged into the OCP cluster for this command to work. + + ```bash + oc apply -f https://raw.githubusercontent.com/intel/intel-technology-enabling-for-openshift/main/e2e/inference/accelerator_profile_gaudi.yaml + ``` + +3. 
Install the [Intel® Gaudi AI SW Tool Operator](https://catalog.redhat.com/software/container-stacks/detail/66d7aa630eb66a02febc8103). + +4. Create the CRD to install Gaudi notebook into OpenShift AI Jupyter dashboard using following command. + + ```bash + oc apply -f https://raw.githubusercontent.com/intel/ai-containers/main/enterprise/redhat/openshift-ai/gaudi/crd-sample.yaml + ``` + +### Start the jupyter notebook from RHODS dashboard + +To access the jupyter notebook from the jupyter server running inside the Gaudi notebook container in OpenShift AI follow the steps below. + +1. Once the OpenShift AI operator is installed correctly, you can access the dashboard by following the instructions below. + + 1. Go to the *Routes* menu in your OCP console in the menu left side in the *Networking* tab. + + 2. Select the project as `redhat-ods-applications` from the menu at the top. + + 3. You will see list of all the routes available to you in the project. You will also see the corresponding link to access the routes. Please select the link in the row named *rhods-dashboard* as shown in the picture. This will take you to the OpenShift AI dashboard. Please refer to the screenshot below for more details. + + ![Step-1](../oneapi/assets/step-1.png) + +2. Once on the OpenShift AI dashboard you can select the link to `Launch Application` inside the *Jupyter* tile in the *Enabled* applications from the left menu. Please refer to the screenshot below for more details. + + ![Step-2](../oneapi/assets/step-2.png) + +3. If you've followed step 3 in [prerequisites](#prerequisite) to import images you should be able to see the Intel® Gaudi AI Software Tools images in the dashboard as shown in the screenshot below. + + ![Step-3](./assets/step-3.png) + +4. Select the size of the resources you want to request from the dropdown menu *Container Size*. The options available are `Small`, `Medium`, `Large`, `X-Large`. 
The sizes describes the request of resources like CPU, RAM and Disk Space for the jupyter server container. + +5. (*Optional*) To utilize the Intel® Gaudi accelerators select the accelerator profile from the dropdown menu as shown in the screenshot below. + + ![Step-4](./assets/step-4.png) + +6. Once all options are selected click on the *Start Server* button to start the jupyter server. + +## Troubleshooting and Support + +If you need more help feel free to submit an [issue](https://github.com/intel/ai-containers/issues). + +--- +\* Other names and brands may be claimed as the property of others. Trademarks diff --git a/enterprise/redhat/openshift-ai/gaudi/assets/step-3.png b/enterprise/redhat/openshift-ai/gaudi/assets/step-3.png new file mode 100644 index 00000000..a561bbb2 Binary files /dev/null and b/enterprise/redhat/openshift-ai/gaudi/assets/step-3.png differ diff --git a/enterprise/redhat/openshift-ai/gaudi/assets/step-4.png b/enterprise/redhat/openshift-ai/gaudi/assets/step-4.png new file mode 100644 index 00000000..a64a2558 Binary files /dev/null and b/enterprise/redhat/openshift-ai/gaudi/assets/step-4.png differ diff --git a/enterprise/redhat/openshift-ai/gaudi/crd-sample.yaml b/enterprise/redhat/openshift-ai/gaudi/crd-sample.yaml new file mode 100644 index 00000000..2468eab9 --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/crd-sample.yaml @@ -0,0 +1,28 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: aitools.intel/v1 +kind: GaudiAIToolsContainer +metadata: + name: intel-gaudi-1.17.0-495 +spec: + nameOverride: "" + fullnameOverride: "" + imagespec: + registry: registry.connect.redhat.com + repo: intel/gaudi-notebooks + tags: + - gaudi_software: "1.17.0-495" + rhel_os: "9.2" + namespace: redhat-ods-applications diff --git a/enterprise/redhat/openshift-ai/gaudi/demo/oneapi-sample.ipynb b/enterprise/redhat/openshift-ai/gaudi/demo/oneapi-sample.ipynb new file mode 100644 index 00000000..9bd94af3 --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/demo/oneapi-sample.ipynb @@ -0,0 +1,315 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1e973d1b-c6d0-48a5-a774-0f114101e81e", + "metadata": {}, + "source": [ + "# Getting started with PyTorch on Intel® Gaudi.\n", + "\n", + "This notebook is to help you get started quickly using the Intel® Gaudi accelerator in this container. A simple MNIST model is trained on the Gaudi acclerator. You can tune some of the parameters below to change configuration of the training. For more information and reference please refer to the official documentation of [Intel® Gaudi acclerator](https://docs.habana.ai/en/latest/index.html)." + ] + }, + { + "cell_type": "markdown", + "id": "7eaacf55-bea2-43be-bb48-163848db1a30", + "metadata": { + "tags": [] + }, + "source": [ + "### Setup modes for training\n", + "\n", + "1. lazy_mode: Set to True(False) to enable(disable) lazy mode.\n", + "2. enable_amp: Set to True(False) to enable Automatic Mixed Precision.\n", + "3. epochs: Number of epochs for training\n", + "4. lr: Learning rate for training\n", + "5. batch_size: Number of samples in a batch\n", + "6. milestones: Milestone epochs for the stepLR scheduler." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e7cf831-6fe6-46ed-a6fd-f2651cc226af", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "lazy_mode = False\n", + "enable_amp = False\n", + "epochs = 20\n", + "batch_size = 128\n", + "lr = 0.01\n", + "milestones = [10,15]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cee8ad90-c52d-4a50-876f-ce0762cb1b62", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ['HABANA_LOGS']='/opt/app-root/logs'\n", + "if lazy_mode:\n", + " os.environ['PT_HPU_LAZY_MODE'] = '1'\n", + "else:\n", + " os.environ['PT_HPU_LAZY_MODE'] = '0'" + ] + }, + { + "cell_type": "markdown", + "id": "6eac33d0-2e64-4233-8b3f-40bb7217fef8", + "metadata": { + "tags": [] + }, + "source": [ + "### Import packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06ad44ff-9744-4d6f-af90-375e64717b59", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import torch.nn.functional as F\n", + "import torchvision\n", + "import torchvision.transforms as transforms\n", + "import os\n", + "\n", + "# Import Habana Torch Library\n", + "import habana_frameworks.torch.core as htcore" + ] + }, + { + "cell_type": "markdown", + "id": "062de7f3-4561-4af3-a9ed-2c4cfc918f2f", + "metadata": {}, + "source": [ + "### Define Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9df57abb-0b63-4e1c-9d9b-87e74964300e", + "metadata": {}, + "outputs": [], + "source": [ + "class SimpleModel(nn.Module):\n", + " def __init__(self):\n", + " super(SimpleModel, self).__init__()\n", + "\n", + " self.fc1 = nn.Linear(784, 256)\n", + " self.fc2 = nn.Linear(256, 64)\n", + " self.fc3 = nn.Linear(64, 10)\n", + "\n", + " def forward(self, x):\n", + "\n", + " out = x.view(-1,28*28)\n", + " out = F.relu(self.fc1(out))\n", + " out = F.relu(self.fc2(out))\n", + " 
out = self.fc3(out)\n", + "\n", + " return out" + ] + }, + { + "cell_type": "markdown", + "id": "d899885b-5b4d-4557-a90c-9d507875c2ee", + "metadata": {}, + "source": [ + "### Define training routine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b17e9aa-fa11-4870-a7d4-183b803177ab", + "metadata": {}, + "outputs": [], + "source": [ + "def train(net,criterion,optimizer,trainloader,device):\n", + "\n", + " net.train()\n", + " if not lazy_mode:\n", + " net = torch.compile(net,backend=\"hpu_backend\")\n", + " train_loss = 0.0\n", + " correct = 0\n", + " total = 0\n", + "\n", + " for batch_idx, (data, targets) in enumerate(trainloader):\n", + "\n", + " data, targets = data.to(device), targets.to(device)\n", + "\n", + " optimizer.zero_grad()\n", + " if enable_amp:\n", + " with torch.autocast(device_type=\"hpu\", dtype=torch.bfloat16):\n", + " outputs = net(data)\n", + " loss = criterion(outputs, targets)\n", + " else:\n", + " outputs = net(data)\n", + " loss = criterion(outputs, targets)\n", + "\n", + " loss.backward()\n", + " \n", + " # API call to trigger execution\n", + " if lazy_mode:\n", + " htcore.mark_step()\n", + " \n", + " optimizer.step()\n", + "\n", + " # API call to trigger execution\n", + " if lazy_mode:\n", + " htcore.mark_step()\n", + "\n", + " train_loss += loss.item()\n", + " _, predicted = outputs.max(1)\n", + " total += targets.size(0)\n", + " correct += predicted.eq(targets).sum().item()\n", + "\n", + " train_loss = train_loss/(batch_idx+1)\n", + " train_acc = 100.0*(correct/total)\n", + " print(\"Training loss is {} and training accuracy is {}\".format(train_loss,train_acc))" + ] + }, + { + "cell_type": "markdown", + "id": "b7a22d69-a91f-48e1-8fac-e1cfe68590b7", + "metadata": {}, + "source": [ + "### Define testing routine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9aa379b-b376-4623-9b5c-f778c3d90ce7", + "metadata": {}, + "outputs": [], + "source": [ + "def 
test(net,criterion,testloader,device):\n", + "\n", + " net.eval()\n", + " test_loss = 0\n", + " correct = 0\n", + " total = 0\n", + "\n", + " with torch.no_grad():\n", + "\n", + " for batch_idx, (data, targets) in enumerate(testloader):\n", + "\n", + " data, targets = data.to(device), targets.to(device)\n", + " \n", + " if enable_amp:\n", + " with torch.autocast(device_type=\"hpu\", dtype=torch.bfloat16):\n", + " outputs = net(data)\n", + " loss = criterion(outputs, targets)\n", + " else:\n", + " outputs = net(data)\n", + " loss = criterion(outputs, targets)\n", + "\n", + "\n", + " # API call to trigger execution\n", + " if lazy_mode:\n", + " htcore.mark_step()\n", + "\n", + " test_loss += loss.item()\n", + " _, predicted = outputs.max(1)\n", + " total += targets.size(0)\n", + " correct += predicted.eq(targets).sum().item()\n", + "\n", + " test_loss = test_loss/(batch_idx+1)\n", + " test_acc = 100.0*(correct/total)\n", + " print(\"Testing loss is {} and testing accuracy is {}\".format(test_loss,test_acc))" + ] + }, + { + "cell_type": "markdown", + "id": "22e76af9-e355-4299-b84d-f34c9a25e76d", + "metadata": {}, + "source": [ + "### Run the main routine to train and test the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c8ddfb1-d4f7-44b2-aff0-f86f1db8c971", + "metadata": {}, + "outputs": [], + "source": [ + "load_path = './data'\n", + "save_path = './checkpoints'\n", + "\n", + "if(not os.path.exists(save_path)):\n", + " os.makedirs(save_path)\n", + "\n", + "# Target the Gaudi HPU device\n", + "device = torch.device(\"hpu\")\n", + "\n", + "# Data\n", + "transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + "])\n", + "\n", + "trainset = torchvision.datasets.MNIST(root=load_path, train=True,\n", + " download=True, transform=transform)\n", + "trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,\n", + " shuffle=True, num_workers=2)\n", + "testset = torchvision.datasets.MNIST(root=load_path, 
train=False,\n", + " download=True, transform=transform)\n", + "testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,\n", + " shuffle=False, num_workers=2)\n", + "\n", + "net = SimpleModel()\n", + "net.to(device)\n", + "\n", + "criterion = nn.CrossEntropyLoss()\n", + "optimizer = optim.SGD(net.parameters(), lr=lr,\n", + " momentum=0.9, weight_decay=5e-4)\n", + "scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)\n", + "\n", + "for epoch in range(1, epochs+1):\n", + " print(\"=====================================================================\")\n", + " print(\"Epoch : {}\".format(epoch))\n", + " train(net,criterion,optimizer,trainloader,device)\n", + " test(net,criterion,testloader,device)\n", + "\n", + " torch.save(net.state_dict(), os.path.join(save_path,'epoch_{}.pth'.format(epoch)))\n", + "\n", + " scheduler.step()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.2 b/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.2 new file mode 100644 index 00000000..d1449fd8 --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.2 @@ -0,0 +1,236 @@ +ARG BASE_IMAGE +ARG BASE_TAG +FROM ${BASE_IMAGE}:${BASE_TAG} AS gaudi-base +ARG ARTIFACTORY_URL +ARG VERSION +ARG REVISION + +LABEL vendor="Intel Corporation" +LABEL release="${VERSION}-${REVISION}" + +ENV HOME="/opt/app-root/src" +WORKDIR /opt/app-root/src + +RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ + dnf clean all && rm -rf 
/var/cache/yum + +RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "name=CentOS Linux 9 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "baseurl=https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo + +RUN echo "[centos9]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "name=CentOS Linux 9 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "baseurl=https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo + +RUN dnf install -y \ + clang \ + cmake3 \ + cpp \ + gcc \ + gcc-c++ \ + glibc \ + glibc-headers \ + glibc-devel \ + jemalloc \ + libarchive \ + libksba \ + unzip \ + llvm \ + lsof \ + python3-devel \ + openssh-clients \ + openssl \ + openssl-devel \ + libjpeg-devel \ + openssh-server \ + lsb_release \ + wget \ + git \ + libffi-devel \ + bzip2-devel \ + zlib-devel \ + mesa-libGL \ + iproute \ + python3-dnf-plugin-versionlock && \ + # update pkgs (except OS version) for resolving potentials CVEs + dnf versionlock add redhat-release* && \ + dnf update -y && \ + dnf clean all && rm -rf /var/cache/yum + +RUN mkdir -p /licenses && \ + wget -O /licenses/LICENSE https://raw.githubusercontent.com/intel/ai-containers/main/LICENSE + +ENV PYTHON_VERSION=3.10 +COPY install-python310.sh . +RUN ./install-python310.sh rhel9.2 && rm install-python310.sh +ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + +COPY install_efa.sh . 
+RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh + +ENV LIBFABRIC_VERSION="1.20.0" +ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}" +ENV MPI_ROOT=/opt/amazon/openmpi +ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH +ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH +ENV OPAL_PREFIX=${MPI_ROOT} +ENV MPICC=${MPI_ROOT}/bin/mpicc +ENV RDMAV_FORK_SAFE=1 +ENV FI_EFA_USE_DEVICE_RDMA=1 + +RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ + echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo + +# for Habana GPG key with SHA-1 signature +RUN update-crypto-policies --set DEFAULT:SHA1 + +RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ + habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ + habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ + habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ + rm -f /etc/yum.repos.d/habanalabs.repo && rm -f /etc/yum.repos.d/habana.repo && rm -rf /tmp/* && \ + dnf clean all && rm -rf /var/cache/yum + +RUN rpm -V habanalabs-rdma-core && rpm -V habanalabs-thunk && rpm -V habanalabs-firmware-tools && rpm -V habanalabs-graph + +# There is no need to store pip installation files inside docker image +ENV PIP_NO_CACHE_DIR=on +ENV PIP_DISABLE_PIP_VERSION_CHECK=1 +ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src +ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib + +RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/ofiwg/libfabric/releases/download/v${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \ + cd /tmp/ && tar xf 
/tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \ + cd /tmp/libfabric-${LIBFABRIC_VERSION} && \ + ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ + make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION} + +RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ + unzip /tmp/main.zip -d /tmp && \ + cd /tmp/hccl_ofi_wrapper-main && \ + make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \ + cd / && \ + rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main + +ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so +ENV HABANA_LOGS=/opt/app-root/log/habana_logs/ +ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw +ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins + +ENV APP_ROOT="/opt/app-root" + +RUN python3.10 -m pip install "pip>=23.3" "setuptools>=70.0.0" "wheel==0.38.4" + +WORKDIR ${APP_ROOT} + +RUN python3.10 -m venv ${APP_ROOT} && \ + wget -O ${APP_ROOT}/bin/fix-permissions \ + https://raw.githubusercontent.com/sclorg/s2i-python-container/master/3.9-minimal/root/usr/bin/fix-permissions && \ + chown -R 1001:0 ${APP_ROOT} && \ + chmod +x ${APP_ROOT}/bin/fix-permissions && \ + ${APP_ROOT}/bin/fix-permissions ${APP_ROOT} -P && \ + echo "unset BASH_ENV PROMPT_COMMAND ENV" >> ${APP_ROOT}/bin/activate + +USER 1001 + +ENV BASH_ENV="${APP_ROOT}/bin/activate" +ENV ENV="${APP_ROOT}/bin/activate" +ENV PROMPT_COMMAND=". 
${APP_ROOT}/bin/activate" + +SHELL ["/bin/bash", "-c"] + +RUN python -m pip install habana_media_loader=="${VERSION}"."${REVISION}" + + +FROM gaudi-base AS gaudi-pytorch + +ARG PT_VERSION +ARG VERSION +ARG REVISION +ARG ARTIFACTORY_URL +ENV BASE_NAME=rhel9.2 + +LABEL name="PyTorch Installer" +LABEL summary="Habanalabs PyTorch installer layer for RHEL9.2" +LABEL description="Image with pre installed Habanalabs packages for PyTorch" + +RUN echo "/usr/lib/habanalabs" > $(python -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth + +USER root + +RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo + +RUN dnf install --allowerasing -y \ + curl \ + cairo-devel \ + numactl-devel \ + iproute \ + which \ + zlib-devel \ + lapack-devel \ + openblas-devel \ + numactl \ + gperftools-devel && \ + dnf clean all && rm -rf /var/cache/yum + +RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \ + dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \ + dnf clean all && rm -rf /var/cache/yum + +# Set LD_PRELOAD after all required installations to +# avoid warnings during docker creation +ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 +ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 + +RUN rm -rf /tmp/* + +USER 1001 + +COPY --chown=1001:0 install_packages.sh . 
+RUN ./install_packages.sh && rm -f install_packages.sh + +USER root + +RUN /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ + chown 1001:0 ~/.bashrc + +USER 1001 + +FROM gaudi-pytorch AS gaudi-notebooks + +WORKDIR ${APP_ROOT}/src + +COPY --chown=1001:0 requirements.txt requirements.txt +COPY --chown=1001:0 start-notebook.sh /opt/app-root/bin +COPY --chown=1001:0 builder /opt/app-root/builder +COPY --chown=1001:0 utils /opt/app-root/bin/utils + +USER 1001 + +RUN python -m pip install -r requirements.txt && \ + chmod -R g+w ${APP_ROOT}/lib/python3.10/site-packages && \ + fix-permissions ${APP_ROOT} -P && \ + chmod -R g+w /opt/app-root/src && \ + sed -i -e "s/Python.*/$(python --version | cut -d '.' -f-2)\",/" /opt/app-root/share/jupyter/kernels/python3/kernel.json && \ + jupyter labextension disable "@jupyterlab/apputils-extension:announcements" + +RUN cd ${APP_ROOT}/ && \ + git clone https://github.com/HabanaAI/vllm-fork.git && \ + cd vllm-fork && \ + VLLM_TARGET_DEVICE=hpu pip install -e . 
+ +WORKDIR ${APP_ROOT}/src +ENV NOTEBOOK_SAMPLE_LINK="https://raw.githubusercontent.com/sharvil10/ai-containers/main/enterprise/redhat/openshift-ai/gaudi/demo/Getting-started.ipynb" + +ENTRYPOINT ["bash", "-c", "/opt/app-root/builder/run"] diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.4 b/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.4 new file mode 100644 index 00000000..18eeef28 --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.4 @@ -0,0 +1,249 @@ +ARG BASE_IMAGE +ARG BASE_TAG +FROM ${BASE_IMAGE}:${BASE_TAG} AS gaudi-base +ARG ARTIFACTORY_URL +ARG VERSION +ARG REVISION + +LABEL vendor="Intel Corporation" +LABEL release="${VERSION}-${REVISION}" + +ENV HOME="/opt/app-root/src" +WORKDIR /opt/app-root/src + +RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "name=CentOS Linux 9 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "baseurl=https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo + +RUN echo "[centos9]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "name=CentOS Linux 9 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "baseurl=https://mirror.stream.centos.org/9-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo + +RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> 
/etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo + +RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ + dnf clean all && rm -rf /var/cache/yum + +RUN dnf install -y \ + clang \ + cmake3 \ + cpp \ + gcc \ + gcc-c++ \ + glibc \ + glibc-headers \ + glibc-devel \ + jemalloc \ + libarchive \ + libksba \ + unzip \ + llvm \ + lsof \ + python3-devel \ + openssh-clients \ + openssl-1:3.0.7-27.el9 \ + openssl-devel-1:3.0.7-27.el9 \ + libjpeg-devel \ + openssh-server \ + lsb_release \ + wget \ + git \ + libffi-devel \ + bzip2-devel \ + zlib-devel \ + mesa-libGL \ + iproute \ + python3.11 \ + python3.11-pip \ + python3.11-devel \ + ffmpeg-free \ + perl-Net-SSLeay-1.92-2.el9 \ + python3-dnf-plugin-versionlock && \ + # update pkgs (except OS version) for resolving potentials CVEs + dnf versionlock add redhat-release* openssl* perl-Net-SSLeay && \ + dnf update -y && \ + dnf clean all && rm -rf /var/cache/yum + +RUN mkdir -p /licenses && \ + wget -O /licenses/LICENSE https://raw.githubusercontent.com/intel/ai-containers/main/LICENSE + +RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ + alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ + alternatives --set python3 /usr/bin/python3.11 && \ + alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.11 2 && \ + alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.9 1 && \ + alternatives --set pip3 /usr/bin/pip3.11 + +COPY install_efa.sh . 
+RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh + +ENV LIBFABRIC_VERSION="1.20.0" +ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}" +ENV MPI_ROOT=/opt/amazon/openmpi +ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH +ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH +ENV OPAL_PREFIX=${MPI_ROOT} +ENV MPICC=${MPI_ROOT}/bin/mpicc +ENV RDMAV_FORK_SAFE=1 +ENV FI_EFA_USE_DEVICE_RDMA=1 + +RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ + echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo + +# for Habana GPG key with SHA-1 signature +RUN update-crypto-policies --set DEFAULT:SHA1 + +RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ + habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ + habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ + habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ + rm -f /etc/yum.repos.d/habanalabs.repo && rm -f /etc/yum.repos.d/habana.repo && rm -rf /tmp/* && \ + dnf clean all && rm -rf /var/cache/yum + +RUN rpm -V habanalabs-rdma-core && rpm -V habanalabs-thunk && rpm -V habanalabs-firmware-tools && rpm -V habanalabs-graph + +# There is no need to store pip installation files inside docker image +ENV PIP_NO_CACHE_DIR=on +ENV PIP_DISABLE_PIP_VERSION_CHECK=1 +ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src +ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib + +RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/ofiwg/libfabric/releases/download/v${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \ + cd /tmp/ && tar xf 
/tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \ + cd /tmp/libfabric-${LIBFABRIC_VERSION} && \ + ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ + make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION} + +RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ + unzip /tmp/main.zip -d /tmp && \ + cd /tmp/hccl_ofi_wrapper-main && \ + make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \ + cd / && \ + rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main + +ENV APP_ROOT="/opt/app-root" + +RUN python3.11 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4 + +WORKDIR ${APP_ROOT} + +RUN python3.11 -m venv ${APP_ROOT} && \ + wget -O ${APP_ROOT}/bin/fix-permissions \ + https://raw.githubusercontent.com/sclorg/s2i-python-container/master/3.9-minimal/root/usr/bin/fix-permissions && \ + chown -R 1001:0 ${APP_ROOT} && \ + chmod +x ${APP_ROOT}/bin/fix-permissions && \ + ${APP_ROOT}/bin/fix-permissions ${APP_ROOT} -P && \ + echo "unset BASH_ENV PROMPT_COMMAND ENV" >> ${APP_ROOT}/bin/activate + +USER 1001 + +ENV BASH_ENV="${APP_ROOT}/bin/activate" +ENV ENV="${APP_ROOT}/bin/activate" +ENV PROMPT_COMMAND=". 
${APP_ROOT}/bin/activate" + +SHELL ["/bin/bash", "-c"] + +RUN python -m pip install habana_media_loader=="${VERSION}"."${REVISION}" + +ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so +ENV HABANA_LOGS=/opt/app-root/log/habana_logs/ +ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw +ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins + +FROM gaudi-base AS gaudi-pytorch + +ARG PT_VERSION +ARG VERSION +ARG REVISION +ARG ARTIFACTORY_URL +ENV BASE_NAME=rhel9.4 + +LABEL name="PyTorch Installer" +LABEL summary="Habanalabs PyTorch installer layer for RHEL9.4" +LABEL description="Image with pre installed Habanalabs packages for PyTorch" + +RUN echo "/usr/lib/habanalabs" > $(python -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pt + +USER root + +RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "baseurl=https://mirror.stream.centos.org/9-stream/CRB/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo + +RUN dnf install --allowerasing -y \ + curl \ + cairo-devel \ + numactl-devel \ + iproute \ + which \ + zlib-devel \ + lapack-devel \ + openblas-devel \ + numactl \ + gperftools-devel && \ + dnf clean all && rm -rf /var/cache/yum + +RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \ + dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \ + dnf clean all && rm -rf /var/cache/yum + +RUN rm -rf /tmp/* + +USER 1001 + +COPY --chown=1001:0 install_packages.sh . 
+ +# Set LD_PRELOAD after all required installations to +# avoid warnings during docker creation +ENV LD_PRELOAD=/lib64/libtcmalloc.so.4 +ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 + +RUN ./install_packages.sh && rm -f install_packages.sh + +USER root + +RUN /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ + chown 1001:0 ~/.bashrc + +USER 1001 + +FROM gaudi-pytorch AS gaudi-notebooks + +WORKDIR ${APP_ROOT}/src + +COPY --chown=1001:0 requirements.txt requirements.txt +COPY --chown=1001:0 start-notebook.sh /opt/app-root/bin +COPY --chown=1001:0 builder /opt/app-root/builder +COPY --chown=1001:0 utils /opt/app-root/bin/utils + +USER 1001 + +RUN python -m pip install -r requirements.txt && \ + chmod -R g+w ${APP_ROOT}/lib/python3.11/site-packages && \ + fix-permissions ${APP_ROOT} -P && \ + chmod -R g+w /opt/app-root/src && \ + sed -i -e "s/Python.*/$(python --version | cut -d '.' -f-2)\",/" /opt/app-root/share/jupyter/kernels/python3/kernel.json && \ + jupyter labextension disable "@jupyterlab/apputils-extension:announcements" + +RUN cd ${APP_ROOT}/ && \ + git clone https://github.com/HabanaAI/vllm-fork.git && \ + cd vllm-fork && \ + VLLM_TARGET_DEVICE=hpu pip install -e . 
+ +WORKDIR ${APP_ROOT}/src +ENV JUPYTER_PRELOAD_REPOS="https://github.com/IntelAI/oneAPI-samples" +ENV REPO_BRANCH="main" +ENTRYPOINT ["bash", "-c", "/opt/app-root/builder/run"] diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/builder/run b/enterprise/redhat/openshift-ai/gaudi/docker/builder/run new file mode 100755 index 00000000..f91d869e --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/docker/builder/run @@ -0,0 +1,39 @@ +#!/bin/bash + +set -eo pipefail + +set -x + +APP_ROOT=${APP_ROOT:-/opt/app-root} + +# Pre-clone repositories defined in JUPYTER_PRELOAD_REPOS +if [ -n "${JUPYTER_PRELOAD_REPOS}" ]; then + for repo in $(echo "${JUPYTER_PRELOAD_REPOS}" | tr ',' ' '); do + # Check for the presence of "@branch" in the repo string + REPO_BRANCH=$(echo "${repo}" | cut -s -d'@' -f2) + if [[ -n ${REPO_BRANCH} ]]; then + # Remove the branch from the repo string and convert REPO_BRANCH to git clone arg + repo=$(echo "${repo}" | cut -d'@' -f1) + REPO_BRANCH="-b ${REPO_BRANCH}" + fi + echo "Checking if repository $repo exists locally" + REPO_DIR=$(basename "${repo}") + if [ -d "${REPO_DIR}" ]; then + pushd "${REPO_DIR}" + # Do nothing if the repo already exists + echo "The ${repo} has already been cloned" + : + popd + else + # REPO_BRANCH is deliberately unquoted: it is either empty or "-b <branch>" and must word-split + GIT_SSL_NO_VERIFY=true git clone ${REPO_BRANCH} "${repo}" "${REPO_DIR}" + fi + done +fi + +if [ -n "${NOTEBOOK_SAMPLES_LINK}" ]; then + for link in $(echo "${NOTEBOOK_SAMPLES_LINK}" | tr ',' ' '); do + wget "${link}" + done +fi + +"${APP_ROOT}"/bin/start-notebook.sh "$@" diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/docker-compose.yaml b/enterprise/redhat/openshift-ai/gaudi/docker/docker-compose.yaml new file mode 100644 index 00000000..d2901e32 --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/docker/docker-compose.yaml @@ -0,0 +1,64 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +services: + gaudi-base: + build: + args: + BASE_IMAGE: ${BASE_IMAGE:-registry.access.redhat.com/ubi9/ubi} + BASE_TAG: ${RHEL_OS:-9.2} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: "" + ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai} + VERSION: ${VERSION:-1.17.0} + REVISION: ${REVISION:-495} + context: . + target: gaudi-base + dockerfile: Dockerfile.rhel${RHEL_OS:-9.2} + image: gaudi-base:${RHEL_OS:-9.2}-${VERSION:-1.17.0}-${REVISION:-495} + gaudi-pytorch: + build: + args: + BASE_IMAGE: ${BASE_IMAGE:-registry.access.redhat.com/ubi9/ubi} + BASE_TAG: ${RHEL_OS:-9.2} + BASE_NAME: rhel${RHEL_OS:-9.2} + PT_VERSION: ${PT_VERSION:-2.3.1} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: "" + ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai} + VERSION: ${VERSION:-1.17.0} + REVISION: ${REVISION:-495} + context: . + target: gaudi-pytorch + dockerfile: Dockerfile.rhel${RHEL_OS:-9.2} + image: gaudi-pytorch:${RHEL_OS:-9.2}-${VERSION:-1.17.0}-${REVISION:-495} + gaudi-notebooks: + build: + args: + BASE_IMAGE: ${BASE_IMAGE:-registry.access.redhat.com/ubi9/ubi} + BASE_TAG: ${RHEL_OS:-9.2} + BASE_NAME: ${BASE_NAME:-rhel9.2} + PT_VERSION: ${PT_VERSION:-2.3.1} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: "" + ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai} + VERSION: ${VERSION:-1.17.0} + REVISION: ${REVISION:-495} + context: . 
+ target: gaudi-notebooks + dockerfile: Dockerfile.rhel${RHEL_OS:-9.2} + image: gaudi-notebooks:${RHEL_OS:-9.2}-${VERSION:-1.17.0}-${REVISION:-495} diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/install-python310.sh b/enterprise/redhat/openshift-ai/gaudi/docker/install-python310.sh new file mode 100755 index 00000000..a9d25005 --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/docker/install-python310.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e + +_BASE_NAME=${1:-"ubuntu22.04"} +_SSL_LIB="" + +# preinstall dependencies and define variables +case "${_BASE_NAME}" in +*ubuntu22.04*) + echo "Skip install Python3.10 from source on Ubuntu22.04" + exit 0 + ;; +*debian* | *ubuntu*) + apt update + apt install -y libsqlite3-dev libreadline-dev + ;; +*rhel*) + yum install -y sqlite-devel readline-devel xz-devel + ;; +*tencentos3.1*) + dnf install -y sqlite-devel readline-devel zlib-devel xz-devel bzip2-devel libffi-devel + wget -nv -O /opt/openssl-1.1.1w.tar.gz https://github.com/openssl/openssl/releases/download/OpenSSL_1_1_1w/openssl-1.1.1w.tar.gz && + cd /opt/ && + tar xzf openssl-1.1.1w.tar.gz && + rm -rf openssl-1.1.1w.tar.gz && + cd openssl-1.1.1w && + ./config --prefix=/usr/local/openssl-1.1.1w shared zlib && + make && make install + ln -s /etc/pki/tls/cert.pem /usr/local/openssl-1.1.1w/ssl/cert.pem + + PATH=$PATH:/usr/local/protoc/bin:/usr/local/openssl-1.1.1w/bin + LD_LIBRARY_PATH=/usr/local/openssl-1.1.1w/lib:$LD_LIBRARY_PATH + _SSL_LIB="--with-openssl=/usr/local/openssl-1.1.1w" + ;; +*amzn2*) + yum install -y sqlite-devel readline-devel + wget -nv -O /opt/openssl-1.1.1w.tar.gz https://github.com/openssl/openssl/releases/download/OpenSSL_1_1_1w/openssl-1.1.1w.tar.gz && + cd /opt/ && + tar xzf openssl-1.1.1w.tar.gz && + rm -rf openssl-1.1.1w.tar.gz && + cd openssl-1.1.1w && + ./config --prefix=/usr/local/openssl-1.1.1w shared zlib && + make && make install + ln -s /etc/pki/tls/cert.pem /usr/local/openssl-1.1.1w/ssl/cert.pem + + PATH=$PATH:/usr/local/protoc/bin:/usr/local/openssl-1.1.1w/bin + LD_LIBRARY_PATH=/usr/local/openssl-1.1.1w/lib:$LD_LIBRARY_PATH + _SSL_LIB="--with-openssl=/usr/local/openssl-1.1.1w" + ;; +esac + +# install Python +wget -nv -O /opt/Python-3.10.14.tgz https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz +cd /opt/ +tar xzf Python-3.10.14.tgz +rm -f Python-3.10.14.tgz +cd Python-3.10.14 +./configure --enable-optimizations --enable-loadable-sqlite-extensions 
--enable-shared $_SSL_LIB +make -j && make altinstall + +# post install +case "${_BASE_NAME}" in +*rhel9*) + alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 2 && + alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && + alternatives --set python3 /usr/local/bin/python3.10 + export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + ;; +*tencentos3.1*) + alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 4 && + alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 3 && + alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && + alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && + alternatives --set python3 /usr/local/bin/python3.10 + export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + ;; +*amzn2*) + update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 3 && + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 + ;; +*debian*) + update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 3 + update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.8 2 + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 + ;; +esac + +python3 -m pip install --upgrade pip setuptools diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/install_efa.sh b/enterprise/redhat/openshift-ai/gaudi/docker/install_efa.sh new file mode 100755 index 00000000..4175e8f8 --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/docker/install_efa.sh @@ -0,0 +1,40 @@ +#!/bin/bash -ex + +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DEFAULT_EFA_INSTALLER_VER=1.29.0 +efa_installer_version=${1:-$DEFAULT_EFA_INSTALLER_VER} + +tmp_dir=$(mktemp -d) +wget -nv https://efa-installer.amazonaws.com/aws-efa-installer-"$efa_installer_version".tar.gz -P "$tmp_dir" +tar -xf "$tmp_dir"/aws-efa-installer-"$efa_installer_version".tar.gz -C "$tmp_dir" +pushd "$tmp_dir"/aws-efa-installer +# shellcheck disable=SC1091 +case $( + . /etc/os-release + echo -n "$ID" +) in +rhel) + # we cannot install dkms packages on RHEL images due to OCP rules + rm -f RPMS/RHEL8/x86_64/dkms*.rpm + ;; +tencentos) + dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/libibverbs-46.0-1.el8.x86_64.rpm RPMS/ROCKYLINUX8/x86_64/rdma-core/libibverbs-utils-46.0-1.el8.x86_64.rpm + patch -f -p1 -i /tmp/tencentos_efa_patch.txt --reject-file=tencentos_efa_patch.rej --no-backup-if-mismatch + ;; +esac +./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify +popd +rm -rf "$tmp_dir" diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/install_packages.sh b/enterprise/redhat/openshift-ai/gaudi/docker/install_packages.sh new file mode 100755 index 00000000..d67bb4f3 --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/docker/install_packages.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -ex + +pt_package_name="pytorch_modules-v${PT_VERSION}_${VERSION}_${REVISION}.tgz" +os_string="ubuntu${OS_NUMBER}" +case "${BASE_NAME}" in +*rhel9.2*) + os_string="rhel92" + ;; +*rhel9.4*) + os_string="rhel94" + ;; +*rhel8*) + os_string="rhel86" + ;; +*amzn2*) + os_string="amzn2" + ;; +*tencentos*) + os_string="tencentos31" + ;; +esac +pt_artifact_path="https://${ARTIFACTORY_URL}/artifactory/gaudi-pt-modules/${VERSION}/${REVISION}/pytorch/${os_string}" + +tmp_path=$(mktemp --directory) +wget --no-verbose "${pt_artifact_path}/${pt_package_name}" +tar -xf "${pt_package_name}" -C "${tmp_path}"/. 
+pushd "${tmp_path}" +./install.sh "$VERSION" "$REVISION" +popd +# cleanup +rm -rf "${tmp_path}" "${pt_package_name}" diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/requirements.txt b/enterprise/redhat/openshift-ai/gaudi/docker/requirements.txt new file mode 100644 index 00000000..e3140984 --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/docker/requirements.txt @@ -0,0 +1,43 @@ +# LLM Packages +deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0 + +# Datascience and useful extensions +kafka-python~=2.0.2 +matplotlib~=3.9.2 +pandas~=2.2.3 +plotly~=5.24.1 +scikit-learn +scipy~=1.14.1 +skl2onnx~=1.17.0 +codeflare-sdk~=0.20.2 + +# DB connectors +pymongo~=4.9.1 +psycopg~=3.2.2 +pyodbc~=5.1.0 +mysql-connector-python~=9.0.0 + +# JupyterLab packages +odh-elyra~=3.16.7 +jupyterlab~=4.2.5 # Wait on upgrade till plugins are ready +jupyter-bokeh~=4.0.5 # Upgrade would bring in jupyterlab 4 +jupyter-server~=2.14.2 +jupyter-server-proxy~=4.4.0 # Upgrade would bring in jupyterlab 4 +jupyter-server-terminals~=0.5.3 +jupyterlab-git~=0.50.1 +jupyterlab-lsp~=5.1.0 +jupyterlab-widgets~=3.0.13 +jupyter-resource-usage~=1.1.0 +nbdime~=4.0.2 +nbgitpuller~=1.2.1 + +# pycodestyle is dependency of below packages +# and to achieve compatible of pycodestyle with python-lsp-server[all] +# pinned the below packages +autopep8~=2.3.1 +flake8~=7.1.1 +# Base packages +wheel~=0.44.0 +setuptools>=70.0.0 +pip>=23.3 +aiohttp==3.10.5 diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/start-notebook.sh b/enterprise/redhat/openshift-ai/gaudi/docker/start-notebook.sh new file mode 100755 index 00000000..f13aa7d8 --- /dev/null +++ b/enterprise/redhat/openshift-ai/gaudi/docker/start-notebook.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Load bash libraries +SCRIPT_DIR=${APP_ROOT}/bin +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}"/utils/process.sh + +if [ -f "${SCRIPT_DIR}/utils/setup-elyra.sh" ]; then + # shellcheck disable=SC1091 + source "${SCRIPT_DIR}"/utils/setup-elyra.sh +fi + +# Initialize notebooks arguments variable +NOTEBOOK_PROGRAM_ARGS="" + +# Set default ServerApp.port value if NOTEBOOK_PORT variable is defined +if [ -n "${NOTEBOOK_PORT}" ]; then + NOTEBOOK_PROGRAM_ARGS+="--ServerApp.port=${NOTEBOOK_PORT} " +fi + +# Set default ServerApp.base_url value if NOTEBOOK_BASE_URL variable is defined +if [ -n "${NOTEBOOK_BASE_URL}" ]; then + NOTEBOOK_PROGRAM_ARGS+="--ServerApp.base_url=${NOTEBOOK_BASE_URL} " +fi + +# Set default ServerApp.root_dir value if NOTEBOOK_ROOT_DIR variable is defined +if [ -n "${NOTEBOOK_ROOT_DIR}" ]; then + NOTEBOOK_PROGRAM_ARGS+="--ServerApp.root_dir=${NOTEBOOK_ROOT_DIR} " +else + NOTEBOOK_PROGRAM_ARGS+="--ServerApp.root_dir=${HOME} " +fi + +# Add additional arguments if NOTEBOOK_ARGS variable is defined +if [ -n "${NOTEBOOK_ARGS}" ]; then + NOTEBOOK_PROGRAM_ARGS+=${NOTEBOOK_ARGS} +fi + +echo "${NOTEBOOK_PROGRAM_ARGS}" + +# Start the JupyterLab notebook +# shellcheck disable=SC2086 +start_process jupyter lab ${NOTEBOOK_PROGRAM_ARGS} \ + --ServerApp.ip=0.0.0.0 \ + --ServerApp.allow_origin="*" \ + --ServerApp.open_browser=False diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/utils/process.sh b/enterprise/redhat/openshift-ai/gaudi/docker/utils/process.sh new file mode 100755 index 00000000..95028188 --- /dev/null +++ 
b/enterprise/redhat/openshift-ai/gaudi/docker/utils/process.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +function start_process() { + trap stop_process TERM INT + + echo "Running command:" "$@" + echo -e "$@" + "$@" & + + PID=$! + wait $PID + trap - TERM INT + wait $PID + STATUS=$? + exit $STATUS +} + +function stop_process() { + kill -TERM "$PID" +} diff --git a/enterprise/redhat/openshift-ai/README.md b/enterprise/redhat/openshift-ai/oneapi/README.md similarity index 100% rename from enterprise/redhat/openshift-ai/README.md rename to enterprise/redhat/openshift-ai/oneapi/README.md diff --git a/enterprise/redhat/openshift-ai/assets/step-1.png b/enterprise/redhat/openshift-ai/oneapi/assets/step-1.png similarity index 100% rename from enterprise/redhat/openshift-ai/assets/step-1.png rename to enterprise/redhat/openshift-ai/oneapi/assets/step-1.png diff --git a/enterprise/redhat/openshift-ai/assets/step-2.png b/enterprise/redhat/openshift-ai/oneapi/assets/step-2.png similarity index 100% rename from enterprise/redhat/openshift-ai/assets/step-2.png rename to enterprise/redhat/openshift-ai/oneapi/assets/step-2.png diff --git a/enterprise/redhat/openshift-ai/assets/step-3.png b/enterprise/redhat/openshift-ai/oneapi/assets/step-3.png similarity index 100% rename from enterprise/redhat/openshift-ai/assets/step-3.png rename to enterprise/redhat/openshift-ai/oneapi/assets/step-3.png diff 
--git a/enterprise/redhat/openshift-ai/assets/step-4.png b/enterprise/redhat/openshift-ai/oneapi/assets/step-4.png similarity index 100% rename from enterprise/redhat/openshift-ai/assets/step-4.png rename to enterprise/redhat/openshift-ai/oneapi/assets/step-4.png diff --git a/enterprise/redhat/openshift-ai/manifests/intel-optimized-ml.yaml b/enterprise/redhat/openshift-ai/oneapi/manifests/intel-optimized-ml.yaml similarity index 100% rename from enterprise/redhat/openshift-ai/manifests/intel-optimized-ml.yaml rename to enterprise/redhat/openshift-ai/oneapi/manifests/intel-optimized-ml.yaml diff --git a/enterprise/redhat/openshift-ai/manifests/intel-optimized-pytorch.yaml b/enterprise/redhat/openshift-ai/oneapi/manifests/intel-optimized-pytorch.yaml similarity index 100% rename from enterprise/redhat/openshift-ai/manifests/intel-optimized-pytorch.yaml rename to enterprise/redhat/openshift-ai/oneapi/manifests/intel-optimized-pytorch.yaml diff --git a/enterprise/redhat/openshift-ai/manifests/intel-optimized-tensorflow.yaml b/enterprise/redhat/openshift-ai/oneapi/manifests/intel-optimized-tensorflow.yaml similarity index 100% rename from enterprise/redhat/openshift-ai/manifests/intel-optimized-tensorflow.yaml rename to enterprise/redhat/openshift-ai/oneapi/manifests/intel-optimized-tensorflow.yaml diff --git a/jax/.actions.json b/jax/.actions.json new file mode 100644 index 00000000..36e21ad8 --- /dev/null +++ b/jax/.actions.json @@ -0,0 +1,5 @@ +{ + "PACKAGE_OPTION": ["idp", "pip"], + "experimental": [true], + "runner_label": ["PVC"] +} diff --git a/jax/Dockerfile b/jax/Dockerfile new file mode 100644 index 00000000..0d0118fa --- /dev/null +++ b/jax/Dockerfile @@ -0,0 +1,104 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG REGISTRY +ARG REPO +ARG GITHUB_RUN_NUMBER +ARG BASE_IMAGE_NAME +ARG BASE_IMAGE_TAG +ARG PACKAGE_OPTION=pip +ARG PYTHON_VERSION +ARG PYTHON_BASE=${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER}-${BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${PACKAGE_OPTION}-py${PYTHON_VERSION}-base +ARG TORCHSERVE_BASE=${PYTHON_BASE} +FROM ${PYTHON_BASE} AS xpu-base + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing \ + apt-utils \ + build-essential \ + clinfo \ + git \ + gnupg2 \ + gpg-agent \ + rsync \ + unzip && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ + gpg --dearmor --yes --output /usr/share/keyrings/intel-graphics.gpg +RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy unified" | \ + tee /etc/apt/sources.list.d/intel-gpu-jammy.list + +ARG ICD_VER +ARG LEVEL_ZERO_GPU_VER +ARG LEVEL_ZERO_VER +ARG LEVEL_ZERO_DEV_VER + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing \ + intel-opencl-icd=${ICD_VER} \ + intel-level-zero-gpu=${LEVEL_ZERO_GPU_VER} \ + libze1=${LEVEL_ZERO_VER} \ + libze-dev=${LEVEL_ZERO_DEV_VER} && \ + rm -rf /var/lib/apt/lists/* + +RUN no_proxy="" NO_PROXY="" wget --progress=dot:giga -O- 
https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ + | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \ + | tee /etc/apt/sources.list.d/oneAPI.list + +ARG DPCPP_VER +ARG MKL_VER +ARG CCL_VER + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing \ + intel-oneapi-runtime-dpcpp-cpp=${DPCPP_VER} \ + intel-oneapi-runtime-mkl=${MKL_VER} \ + intel-oneapi-runtime-ccl=${CCL_VER} && \ + rm -rf /var/lib/apt/lists/* + +RUN rm -rf /etc/apt/sources.list.d/intel-gpu-jammy.list /etc/apt/sources.list.d/oneAPI.list + +ENV OCL_ICD_VENDORS=/etc/OpenCL/vendors + +FROM xpu-base AS jax-base + +WORKDIR / +COPY requirements.txt . + +RUN python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt + +FROM jax-base AS jupyter + +WORKDIR /jupyter +COPY jupyter-requirements.txt . + +RUN python -m pip install --no-cache-dir -r jupyter-requirements.txt && \ + rm -rf jupyter-requirements.txt + +RUN mkdir -p /jupyter/ && chmod -R a+rwx /jupyter/ +RUN mkdir /.local && chmod a+rwx /.local + +EXPOSE 8888 + +CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/jupyter --port 8888 --ip 0.0.0.0 --no-browser --allow-root --ServerApp.token= --ServerApp.password= --ServerApp.allow_origin=* --ServerApp.base_url=$NB_PREFIX"] diff --git a/jax/README.md b/jax/README.md new file mode 100644 index 00000000..67ea81ed --- /dev/null +++ b/jax/README.md @@ -0,0 +1,86 @@ +# Intel® Optimized OpenXLA\* + +Transformable numerical computing at scale combined with [Intel® Extension for OpenXLA\*], which includes a PJRT plugin implementation to seamlessly runs [JAX\*] models on Intel GPUs. + +## Images + +The images below include [JAX\*] and [Intel® Extension for OpenXLA\*]. 
+ +| Tag(s) | [JAX\*] | [Intel® Extension for OpenXLA\*] | [Flax] | Dockerfile | +| -------------------------- | --------- | ----------------- | -------- | --------------- | +| `0.4.0-pip-base`, `latest` | [v0.4.32] | [v0.4.0-jax] | [v0.9.0] | [v0.4.0] | + +The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: + +| Tag(s) | [JAX\*] | [Intel® Extension for OpenXLA\*] | [Flax] | Dockerfile | +| ------------------- | --------- | ----------------- | -------- | --------------- | +| `0.4.0-pip-jupyter` | [v0.4.32] | [v0.4.0-jax] | [v0.9.0] | [v0.4.0] | + +### Run the Jupyter Container + +```bash +docker run -it --rm \ + -p 8888:8888 \ + --net=host \ + -v $PWD/workspace:/workspace \ + -w /workspace \ + intel/intel-optimized-xla:0.4.0-pip-jupyter +``` + +After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. + +## Images with Intel® Distribution for Python* + +The images below include [Intel® Distribution for Python*]: + +| Tag(s) | [JAX\*] | [Intel® Extension for OpenXLA\*] | [Flax] | Dockerfile | +| ---------------- | --------- | ----------------- | -------- | --------------- | +| `0.4.0-idp-base` | [v0.4.32] | [v0.4.0-jax] | [v0.9.0] | [v0.4.0] | + +The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: + +| Tag(s) | [JAX\*] | [Intel® Extension for OpenXLA\*] | [Flax] | Dockerfile | +| ------------------- | --------- | ----------------- | -------- | --------------- | +| `0.4.0-idp-jupyter` | [v0.4.32] | [v0.4.0-jax] | [v0.9.0] | [v0.4.0] | + +## Build from Source + +To build the images from source, clone the [AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command: + +```bash +cd jax +docker compose build jax-base +docker compose run -it jax-base +``` + +You can find the list of services below for each container in 
the group: + +| Service Name | Description | +| ------------ | ----------------------------------------------- | +| `jax-base` | Base image with [Intel® Extension for OpenXLA\*] | +| `jupyter` | Adds Jupyter Notebook server | + +## License + +View the [License](https://github.com/intel/ai-containers/blob/main/LICENSE) for the [Intel® Distribution for Python*]. + +These images also contain other software which may be under other licenses (such as Pytorch*, Jupyter*, Bash, etc. from the base). + +It is the image user's responsibility to ensure that any use of these images complies with any relevant licenses for all software contained within. + +\* Other names and brands may be claimed as the property of others. + + + +[Intel® Distribution for Python*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html#gs.9bos9m +[Intel® Extension for OpenXLA\*]: https://github.com/intel/intel-extension-for-openxla +[JAX\*]: https://github.com/google/jax +[Flax]: https://github.com/google/flax + +[v0.4.32]: https://github.com/google/jax/releases/tag/jax-v0.4.32 + +[v0.4.0-jax]: https://github.com/intel/intel-extension-for-openxla/releases/tag/0.4.0 + +[v0.9.0]: https://github.com/google/Flax/releases/tag/v0.9.0 + +[v0.4.0]: https://github.com/intel/ai-containers/blob/v0.4.0/jax/Dockerfile diff --git a/jax/docker-compose.yaml b/jax/docker-compose.yaml new file mode 100644 index 00000000..e2c47d63 --- /dev/null +++ b/jax/docker-compose.yaml @@ -0,0 +1,87 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +include: + - path: + - ../python/docker-compose.yaml +services: + jax-base: + build: + args: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: "" + BASE_IMAGE_NAME: ${BASE_IMAGE_NAME:-ubuntu} + BASE_IMAGE_TAG: ${BASE_IMAGE_TAG:-22.04} + CCL_VER: ${CCL_VER:-2021.13.1-31} + DPCPP_VER: ${DPCPP_VER:-2024.2.1-1079} + GITHUB_RUN_NUMBER: ${GITHUB_RUN_NUMBER:-0} + ICD_VER: ${ICD_VER:-24.22.29735.27-914~22.04} + LEVEL_ZERO_DEV_VER: ${LEVEL_ZERO_DEV_VER:-1.17.6-914~22.04} + LEVEL_ZERO_GPU_VER: ${LEVEL_ZERO_GPU_VER:-1.3.29735.27-914~22.04} + LEVEL_ZERO_VER: ${LEVEL_ZERO_VER:-1.17.6-914~22.04} + MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Linux-x86_64} + MKL_VER: ${MKL_VER:-2024.2.1-103} + NO_PROXY: '' + PACKAGE_OPTION: ${PACKAGE_OPTION:-pip} + PYTHON_VERSION: ${PYTHON_VERSION:-3.10} + REGISTRY: ${REGISTRY} + REPO: ${REPO} + context: . + labels: + dependency.python: ${PYTHON_VERSION:-3.10} + dependency.apt.build-essential: true + dependency.apt.clinfo: true + dependency.apt.git: true + dependency.apt.gnupg2: true + dependency.apt.gpg-agent: true + dependency.apt.intel-level-zero-gpu: ${LEVEL_ZERO_GPU_VER:-1.3.29735.27-914~22.04} + dependency.apt.intel-oneapi-runtime-ccl: ${CCL_VER:-2021.13.1-31} + dependency.apt.intel-oneapi-runtime-dpcpp-cpp: ${DPCPP_VER:-2024.2.1-1079} + dependency.apt.intel-oneapi-runtime-mkl: ${MKL_VER:-2024.2.1-103} + dependency.apt.intel-opencl-icd: ${ICD_VER:-24.22.29735.27-914~22.04} + dependency.apt.level-zero: ${LEVEL_ZERO_VER:-1.17.6-914~22.04} + dependency.apt.level-zero-dev: ${LEVEL_ZERO_DEV_VER:-1.17.6-914~22.04} + dependency.apt.rsync: true + dependency.apt.unzip: true + dependency.idp.pip: false + dependency.python.pip: requirements.txt + docs: jax + org.opencontainers.base.name: "intel/python:3.10-core" + org.opencontainers.image.name: "intel/intel-optimized-xla" + org.opencontainers.image.title: "Intel® Optimized XLA 
Base Image" + org.opencontainers.image.version: ${INTEL_XLA_VERSION:-v0.4.0}-${PACKAGE_OPTION:-pip}-base + target: jax-base + command: > + bash -c "python -c 'import jax; print(\"Jax Version:\", jax.__version__)'" + depends_on: + - ${PACKAGE_OPTION:-pip} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-xla-${INTEL_XLA_VERSION:-v0.4.0}-base + pull_policy: always + jupyter: + build: + labels: + dependency.python.pip: jupyter-requirements.txt + org.opencontainers.base.name: "intel/intel-optimized-xla:${INTEL_XLA_VERSION:-v0.4.0}-base" + org.opencontainers.image.title: "Intel® Optimized XLA Jupyter Base Image" + org.opencontainers.image.version: ${INTEL_XLA_VERSION:-v0.4.0}-jupyter + target: jupyter + command: > + bash -c "python -m jupyter --version" + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + extends: jax-base + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-xla-${INTEL_XLA_VERSION:-v0.4.0}-jupyter + network_mode: host diff --git a/jax/jupyter-requirements.txt b/jax/jupyter-requirements.txt new file mode 100644 index 00000000..d98ce88b --- /dev/null +++ b/jax/jupyter-requirements.txt @@ -0,0 +1,4 @@ +jupyterlab==4.2.5 +jupyterhub==5.1.0 +notebook==7.2.2 +jupyter-server-proxy>=4.1.2 diff --git a/jax/requirements.txt b/jax/requirements.txt new file mode 100644 index 00000000..09d7cb7f --- /dev/null +++ b/jax/requirements.txt @@ -0,0 +1,5 @@ +flax==0.8.2 +intel-extension-for-openxla==0.4.0 +jax==0.4.26 +jaxlib==0.4.26 +cython==3.0.11 diff --git a/jax/tests/example.py b/jax/tests/example.py new file mode 100644 index 00000000..9227d066 --- /dev/null +++ b/jax/tests/example.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: skip-file + +import jax +import jax.numpy as jnp + +print("jax.local_devices(): ", jax.local_devices()) + + +@jax.jit +def lax_conv(): + key = jax.random.PRNGKey(0) + lhs = jax.random.uniform(key, (2, 1, 9, 9), jnp.float32) + rhs = jax.random.uniform(key, (1, 1, 4, 4), jnp.float32) + side = jax.random.uniform(key, (1, 1, 1, 1), jnp.float32) + out = jax.lax.conv_with_general_padding( + lhs, rhs, (1, 1), ((0, 0), (0, 0)), (1, 1), (1, 1) + ) + out = jax.nn.relu(out) + out = jnp.multiply(out, side) + return out + + +print(lax_conv()) diff --git a/jax/tests/tests.yaml b/jax/tests/tests.yaml new file mode 100644 index 00000000..419dbf3f --- /dev/null +++ b/jax/tests/tests.yaml @@ -0,0 +1,29 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +jax-import-${PACKAGE_OPTION:-pip}: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-xla-${INTEL_XLA_VERSION:-v0.4.0}-base + cmd: python -c 'import jax; print("Jax Version:", jax.__version__); print(jax.devices())' + device: ["/dev/dri"] +jax-import-jupyter-${PACKAGE_OPTION:-pip}: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-xla-${INTEL_XLA_VERSION:-v0.4.0}-jupyter + cmd: sh -c "python -m jupyter --version" +jax-xpu-example-${PACKAGE_OPTION:-pip}: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-xla-${INTEL_XLA_VERSION:-v0.4.0}-base + cmd: python /tests/example.py + device: ["/dev/dri"] + volumes: + - src: $PWD/jax/tests + dst: /tests diff --git a/mkdocs.yml b/mkdocs.yml index 20c73487..94322e7d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -51,7 +51,7 @@ plugins: - read_csv repo_name: intel/ai-containers repo_url: https://github.com/intel/ai-containers -site_name: Intel® AI Containers +site_name: AI Containers #TODO: Get previous container versions in an easy way # https://squidfunk.github.io/mkdocs-material/setup/setting-up-versioning/ theme: diff --git a/preset/classical-ml/.actions.json b/preset/classical-ml/.actions.json index 639f025c..bc955304 100644 --- a/preset/classical-ml/.actions.json +++ b/preset/classical-ml/.actions.json @@ -1,5 +1,5 @@ { "PYTHON_VERSION": ["3.9", "3.10"], "experimental": [true], - "runner_label": ["PVC"] + "runner_label": ["clx"] } diff --git a/preset/classical-ml/Dockerfile b/preset/classical-ml/Dockerfile index a9666e3a..bd6cebde 100644 --- a/preset/classical-ml/Dockerfile +++ b/preset/classical-ml/Dockerfile @@ -12,40 +12,35 @@ # See the License for the specific language governing permissions and # limitations 
under the License. + ARG BASE_IMAGE="ubuntu" ARG BASE_TAG="22.04" -FROM ${BASE_IMAGE}:${BASE_TAG} as classical-ml-base +FROM ${BASE_IMAGE}:${BASE_TAG} as classical-ml ENV DEBIAN_FRONTEND=noninteractive -# See http://bugs.python.org/issue19846 - ENV LANG=C.UTF-8 SHELL ["/bin/bash", "-c"] RUN apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ - bzip2 \ - ca-certificates \ - diffutils \ - gcc \ - git \ - gzip \ - make \ - patch \ - rsync \ - unzip \ - wget \ - xz-utils && \ + bzip2 \ + ca-certificates \ + diffutils \ + gcc \ + git \ + gzip \ + make \ + patch \ + rsync \ + unzip \ + wget \ + xz-utils && \ rm -rf /var/lib/apt/lists/* -FROM classical-ml-base as classical-ml-python - -# Setting up non-root directories RUN useradd --uid 1000 -d /home/dev -s /bin/bash -m dev -# Set a password for the user (Optional) RUN echo 'dev:password' | chpasswd USER dev WORKDIR /home/dev @@ -56,68 +51,69 @@ ARG PYTHON_VERSION ARG IDP_VERSION ARG INTEL_CHANNEL -RUN wget --progress=dot:giga --no-check-certificate https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-${MINIFORGE_VERSION}.sh -O miniforge.sh && \ +RUN wget --progress=dot:giga --no-check-certificate https://github.com/conda-forge/miniforge/releases/latest/download/${MINIFORGE_VERSION}.sh -O miniforge.sh && \ chmod +x miniforge.sh && \ ./miniforge.sh -b -p "${CONDA_ROOT}" && \ rm ./miniforge.sh && \ - ln -s "${CONDA_ROOT}" "${CONDA_ROOT}/../miniforge3" && \ + ln -s "${CONDA_ROOT}" "${CONDA_ROOT}/../miniforge" && \ export PATH="${CONDA_ROOT}/bin/:${PATH}" && \ - conda update -y conda && \ - conda config --add channels conda-forge && \ - conda config --add channels https://software.repos.intel.com/python/conda/ && \ conda init --all && \ conda install -y \ - 'jupyterlab>=4.1.8' \ - 'notebook>=7.1.3' \ - 'jupyterhub>=4.1.5' \ - 'jupyter-server-proxy>=4.1.2' \ - 'mako>=1.2.2' \ - 'pyjwt>=2.4.0' \ - 'cryptography>=42.0.5' \ - 'nodejs>=20.12.2' \ - 'aiohttp>=3.9.4' \ + 
'colorama==0.4.6' \ + 'conda==24.5.0' \ + 'jupyterhub==5.1.0' \ + 'jupyter-server-proxy==4.3.0' \ + 'mamba==1.5.8' \ + 'networkx==3.3' \ + 'notebook==7.2.1' \ + 'pip==24.0' \ + 'python==3.10.14' \ 'idna>=3.7' \ - 'oauthlib>=3.2.2' \ - && \ - jupyter labextension disable "@jupyterlab/apputils-extension:announcements" && \ - conda clean -y --all + 'requests>=2.32.0' \ + 'setuptools>=70.0.0' \ + 'tqdm>=4.66.3' \ + 'urllib3>=2.2.2' \ + 'nodejs==22.5.1' \ + && \ + jupyter labextension disable "@jupyterlab/apputils-extension:announcements" \ + && \ + conda clean -y --all \ + && \ + conda config --add channels ${INTEL_CHANNEL} ENV PATH ${CONDA_ROOT}/condabin:${CONDA_ROOT}/bin/:${PATH} +RUN conda config --set pip_interop_enabled True ARG IDP_VERSION +ARG DAAL4PY_VERSION ARG DPNP_VERSION ARG XGBOOST_VERSION ARG MODIN_VERSION ARG NUMPY_VERSION ARG SKLEARNEX_VERSION -# Conda packages -RUN conda create -yn classical-ml -c ${INTEL_CHANNEL} -c conda-forge \ - dpnp=${DPNP_VERSION} \ - numpy=${NUMPY_VERSION} \ - python=${PYTHON_VERSION} \ - scikit-learn-intelex==${SKLEARNEX_VERSION} \ - xgboost=${XGBOOST_VERSION} \ - modin-ray=${MODIN_VERSION} \ - 'python-dotenv>=1.0.1' \ - 'tqdm>=4.66.2' \ - 'matplotlib-base>=3.4.3' \ - 'threadpoolctl>=3.3.0' \ - 'ipython>=8.18.1' \ - 'ipykernel>=6.29.3' \ - 'kernda>=0.3.0' \ - 'protobuf>=4.24' \ - 'pillow>=10.2.0' \ - 'tornado>=6.3.3' && \ +RUN conda create -yn classical-ml \ + "python=${PYTHON_VERSION}" \ + "daal4py=${DAAL4PY_VERSION}" \ + "dpnp=${DPNP_VERSION}" \ + 'ipykernel==6.29.5' \ + 'kernda==0.3.0' \ + 'matplotlib-base==3.8.4' \ + "modin-ray=${MODIN_VERSION}" \ + 'python-dotenv==1.0.1' \ + "scikit-learn-intelex=${SKLEARNEX_VERSION}" \ + 'tqdm==4.66.4' \ + "xgboost=${XGBOOST_VERSION}" \ + 'idna>=3.7' \ + 'requests>=2.32.0' \ + 'setuptools>=70.0.0' \ + 'tqdm>=4.66.3' \ + 'urllib3>=2.2.2' \ + && \ conda clean -y --all - - -# PyPI packages RUN conda run -n classical-ml python -m pip install --no-deps --no-cache-dir \ - 
'dataset-librarian==1.0.4' \ - 'cloud-data-connector==1.0.3' - + 'dataset-librarian==1.0.4' ENV PYTHONSTARTUP=~/.patch_sklearn.py COPY base/.patch_sklearn.py ~/.patch_sklearn.py @@ -125,8 +121,6 @@ COPY base/.patch_sklearn.py ~/.patch_sklearn.py ENV PYTHONSTARTUP=/home/dev/.patch_sklearn.py COPY base/.patch_sklearn.py /home/dev/.patch_sklearn.py -FROM classical-ml-python as classical-ml-jupyter - EXPOSE 8888 RUN mkdir -p ~/jupyter/ && chmod -R a+rwx ~/jupyter/ && \ @@ -136,10 +130,10 @@ WORKDIR /home/dev COPY --chown=dev notebooks /home/dev/jupyter COPY --chown=dev tests /home/dev/sample-tests -RUN "${CONDA_ROOT}/envs/classical-ml/bin/python" -m ipykernel install --user --name classical-ml --display-name "Classical ML" && \ - "${CONDA_ROOT}/envs/classical-ml/bin/kernda" -o -y "$HOME/.local/share/jupyter/kernels/$(echo classical-ml | sed -e 's/\(.*\)/\L\1/')/kernel.json" && \ - "${CONDA_ROOT}/envs/classical-ml/bin/python" -m ipykernel.kernelspec --user && \ - conda clean -y --all +RUN KERNEL_DIR="${CONDA_ROOT}/share/jupyter/kernels/classical-ml" && \ + conda run -n classical-ml python -m ipykernel install --prefix "$CONDA_ROOT" --name classical-ml --display-name "Classical ML" && \ + conda run -n classical-ml kernda -o -y "$KERNEL_DIR/kernel.json" && \ + conda run -n base jupyter kernelspec list CMD ["bash", "-c", "source activate classical-ml && jupyter lab --notebook-dir=~/jupyter --port 8888 --ip 0.0.0.0 --no-browser --allow-root"] diff --git a/preset/classical-ml/docker-compose.yaml b/preset/classical-ml/docker-compose.yaml index a6e06fbd..c2dc9c1a 100644 --- a/preset/classical-ml/docker-compose.yaml +++ b/preset/classical-ml/docker-compose.yaml @@ -15,6 +15,7 @@ # -*- coding: utf-8 -*- # + version: '3' services: classical-ml: @@ -22,28 +23,30 @@ services: args: BASE_IMAGE: ${BASE_IMAGE:-ubuntu} BASE_TAG: ${BASE_TAG:-22.04} - DPNP_VERSION: ${NUMBA_DPEX_VERSION:-0.14.0} - IDP_VERSION: ${IDP_VERSION:-2024.1.0} + DAAL4PY_VERSION: ${DAAL4PY_VERSION:-2024.5.0} + 
DPNP_VERSION: ${DPNP_VERSION:-0.15.0} + IDP_VERSION: ${IDP_VERSION:-2024.2} INTEL_CHANNEL: ${INTEL_CHANNEL:-https://software.repos.intel.com/python/conda/} - MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Linux-x86_64} - MODIN_VERSION: ${MODIN_VERSION:-0.26.1} - MPI_VERSION: ${MPI_VERSION:-2021.12.0} - NUMBA_DPEX_VERSION: ${NUMBA_DPEX_VERSION:-0.22.1} + MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Miniforge3-Linux-x86_64} + MODIN_VERSION: ${MODIN_VERSION:-0.30.0} + MPI_VERSION: ${MPI_VERSION:-2021.13} + NUMBA_DPEX_VERSION: ${NUMBA_DPEX_VERSION:-0.23.0} NUMPY_VERSION: ${NUMPY_VERSION:-1.26.4} - PYTHON_VERSION: ${PYTHON_VERSION:-3.10} - SKLEARNEX_VERSION: ${SKLEARNEX_VERSION:-2024.2.0} + PYTHON_VERSION: ${PYTHON_VERSION:-3.9} + SKLEARNEX_VERSION: ${SKLEARNEX_VERSION:-2024.5.0} XGBOOST_VERSION: ${XGBOOST_VERSION:-2.0.3} http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: '' context: . + target: classical-ml labels: docs: classical_ml org.opencontainers.image.title: "Intel® AI Tools Selector Preset Containers - Classical ML" org.opencontainers.base.name: "ubuntu:22.04" org.opencontainers.image.name: "intel/classical-ml" - org.opencontainers.image.version: 2024.1.0-py${PYTHON_VERSION:-3.10} - dependency.python: ${PYTHON_VERSION:-3.10} + org.opencontainers.image.version: 2024.2.0-py${PYTHON_VERSION:-3.9} + dependency.python: ${PYTHON_VERSION:-3.9} dependency.python.pip: requirements.txt dependency.apt.bzip2: true dependency.apt.ca-certificates: true @@ -57,39 +60,26 @@ services: dependency.apt.unzip: true dependency.apt.wget: true dependency.apt.xz-utils: true - dependency.conda.jupyterlab: '>=4.1.8' - dependency.conda.notebook: '>=7.1.3' - dependency.conda.jupyterhub: '>=4.1.5' - dependency.conda.jupyter-server-proxy: '>=4.1.2' - dependency.conda.mako: '>=1.2.2' - dependency.conda.pyjwt: '>=2.4.0' - dependency.conda.cryptography: '>=42.0.5' - dependency.conda.nodejs: '>=20.12.2' - dependency.conda.aiohttp: '>=3.9.4' - dependency.conda.idna: '>=3.7' - 
dependency.conda.oauthlib: '>=3.2.2' - dependency.conda.dpnp: '>=0.14.0' - dependency.conda.numpy: '>=1.26.4' - dependency.conda.python: "=${PYTHON_VERSION:-3.10}" - dependency.conda.scikit-learn-intelex: '>=2024.2.0' - dependency.conda.xgboost: '>=2.0.3' - dependency.conda.modin-ray: '>=0.26.1' - dependency.conda.python-dotenv: '>=1.0.1' - dependency.conda.tqdm: '>=4.66.2' - dependency.conda.matplotlib-base: '>=3.4.3' - dependency.conda.dataset_librarian: '>=1.0.4' - dependency.conda.threadpoolctl: '>=3.3.0' - dependency.conda.ipython: '>=8.18.1' - dependency.conda.ipykernel: '>=6.29.3' - dependency.conda.kernda: '>=0.3.0' - dependency.conda.protobuf: '>=4.24' - dependency.conda.pillow: '>=10.2.0' - dependency.conda.tornado: '>=6.3.3' - target: classical-ml-jupyter - command: | - bash -c "conda run -n classical-ml python -c 'import sklearn; import xgboost; print(\"SciKit:\", sklearn.__version__, \" XGBoost:\",xgboost.__version__)' && \ - conda run -n classical-ml python -c 'import modin.pandas as pd, modin.config as cfg; cfg.Engine.put(\"Ray\"); df = pd.DataFrame([1]);print(df+1)'" - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-2024.1.0-py${PYTHON_VERSION:-3.10} + dependency.conda.colorama: '==0.4.6' + dependency.conda.conda: '==24.5.0' + dependency.conda.daal4py: '=2024.5.0' + dependency.conda.dpnp: '=0.15.0' + dependency.conda.ipykernel: '==6.29.5' + dependency.conda.jupyterhub: '==5.1.0' + dependency.conda.jupyter-server-proxy: '==4.3.0' + dependency.conda.kernda: '==0.3.0' + dependency.conda.mamba: '==1.5.8' + dependency.conda.matplotlib-base: '==3.8.4' + dependency.conda.modin-ray: '=0.30.0' + dependency.conda.networkx: '==3.3' + dependency.conda.notebook: '==7.2.1' + dependency.conda.pip: '==24.0' + dependency.conda.python: '==3.10.14' + dependency.conda.python-dotenv: '==1.0.1' + dependency.conda.scikit-learn-intelex: '=2024.5.0' + dependency.conda.tqdm: '==4.66.4' + dependency.conda.xgboost: '=2.0.3' + image: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} environment: http_proxy: ${http_proxy} https_proxy: ${https_proxy} @@ -97,3 +87,9 @@ services: shm_size: 12GB volumes: - /dev/dri/by-path:/dev/dri/by-path + command: > + bash -c " conda run -n classical-ml python -c 'import sklearn;import xgboost;print(\"SciKit:\", + sklearn.__version__, \" XGBoost:\", xgboost.__version__)' && + + conda run -n classical-ml python -c 'import modin.pandas as pd;import modin.config + as cfg;cfg.Engine.put(\"Ray\");df = pd.DataFrame([1]);print(df+1)' " diff --git a/preset/classical-ml/requirements.txt b/preset/classical-ml/requirements.txt index d231202d..8fe3dfff 100644 --- a/preset/classical-ml/requirements.txt +++ b/preset/classical-ml/requirements.txt @@ -1 +1 @@ -cloud-data-connector==1.0.3 +dataset-librarian==1.0.4 diff --git a/preset/classical-ml/tests.yaml b/preset/classical-ml/tests.yaml index 14529526..919eb4e9 100644 --- a/preset/classical-ml/tests.yaml +++ b/preset/classical-ml/tests.yaml @@ -21,23 +21,23 @@ # img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.1.0}-py3.9 modin-${PYTHON_VERSION:-3.9}: cmd: conda run -n classical-ml sample-tests/modin/test_modin.sh - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} shm_size: 10.24G modin-notebook-${PYTHON_VERSION:-3.9}: cmd: papermill --log-output jupyter/modin/IntelModin_Vs_Pandas.ipynb -k classical-ml - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True scikit-${PYTHON_VERSION:-3.9}: cmd: conda run -n classical-ml sample-tests/scikit/test_scikit.sh - img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} scikit-notebook-${PYTHON_VERSION:-3.9}: cmd: papermill --log-output jupyter/sklearn/Intel_Extension_For_SKLearn_GettingStarted.ipynb -k classical-ml - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True xgboost-${PYTHON_VERSION:-3.9}: cmd: conda run -n classical-ml sample-tests/xgboost/test_xgboost.sh - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} xgboost-notebook-${PYTHON_VERSION:-3.9}: cmd: papermill --log-output jupyter/xgboost/IntelPython_XGBoost_Performance.ipynb -k classical-ml - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-classical-ml-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True diff --git a/preset/classical-ml/tests/scikit/kmeans.py b/preset/classical-ml/tests/scikit/kmeans.py index 9120b7d0..c78acba7 100644 --- a/preset/classical-ml/tests/scikit/kmeans.py +++ b/preset/classical-ml/tests/scikit/kmeans.py @@ -62,6 +62,7 @@ data, labels = load_digits(return_X_y=True) (n_samples, n_features), n_digits = data.shape, np.unique(labels).size +data = np.array(data, dtype=np.float64) print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}") diff --git a/preset/classical-ml/tests/scikit/test_scikit.sh b/preset/classical-ml/tests/scikit/test_scikit.sh index a6b2f24e..9d16e938 100755 --- a/preset/classical-ml/tests/scikit/test_scikit.sh +++ 
b/preset/classical-ml/tests/scikit/test_scikit.sh @@ -14,8 +14,8 @@ # limitations under the License. set -xe + SCRIPT_DIR=$(dirname "$0") python "${SCRIPT_DIR}/kmeans.py" - -python "${SCRIPT_DIR}/kmeans.py" true +python "${SCRIPT_DIR}/kmeans.py" true # Enable intel opt diff --git a/preset/data-analytics/.actions.json b/preset/data-analytics/.actions.json index 639f025c..bc955304 100644 --- a/preset/data-analytics/.actions.json +++ b/preset/data-analytics/.actions.json @@ -1,5 +1,5 @@ { "PYTHON_VERSION": ["3.9", "3.10"], "experimental": [true], - "runner_label": ["PVC"] + "runner_label": ["clx"] } diff --git a/preset/data-analytics/Dockerfile b/preset/data-analytics/Dockerfile index 37954c83..ffb56ceb 100644 --- a/preset/data-analytics/Dockerfile +++ b/preset/data-analytics/Dockerfile @@ -12,107 +12,100 @@ # See the License for the specific language governing permissions and # limitations under the License. + ARG BASE_IMAGE="ubuntu" ARG BASE_TAG="22.04" -FROM ${BASE_IMAGE}:${BASE_TAG} as data-analytics-base +FROM ${BASE_IMAGE}:${BASE_TAG} as data-analytics ENV DEBIAN_FRONTEND=noninteractive -# See http://bugs.python.org/issue19846 - ENV LANG=C.UTF-8 SHELL ["/bin/bash", "-c"] RUN apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ - bzip2 \ - ca-certificates \ - diffutils \ - gcc \ - git \ - gzip \ - make \ - patch \ - rsync \ - unzip \ - wget \ - xz-utils && \ + bzip2 \ + ca-certificates \ + diffutils \ + gcc \ + git \ + gzip \ + make \ + patch \ + rsync \ + unzip \ + wget \ + xz-utils && \ rm -rf /var/lib/apt/lists/* -FROM data-analytics-base as data-analytics-python - -# Setting up non-root directories RUN useradd --uid 1000 -d /home/dev -s /bin/bash -m dev -# Set a password for the user (Optional) RUN echo 'dev:password' | chpasswd USER dev WORKDIR /home/dev ENV CONDA_ROOT=/home/dev/conda - ARG MINIFORGE_VERSION ARG PYTHON_VERSION ARG IDP_VERSION ARG INTEL_CHANNEL -RUN wget --progress=dot:giga --no-check-certificate 
"https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-${MINIFORGE_VERSION}.sh" -O miniforge.sh && \ +RUN wget --progress=dot:giga --no-check-certificate "https://github.com/conda-forge/miniforge/releases/latest/download/${MINIFORGE_VERSION}.sh" -O miniforge.sh && \ chmod +x miniforge.sh && \ ./miniforge.sh -b -p "${CONDA_ROOT}" && \ rm ./miniforge.sh && \ - ln -s "${CONDA_ROOT}" "${CONDA_ROOT}/../miniforge3" && \ + ln -s "${CONDA_ROOT}" "${CONDA_ROOT}/../miniforge" && \ export PATH="${CONDA_ROOT}/bin/:${PATH}" && \ - conda update -y conda && \ - conda config --add channels conda-forge && \ - conda config --add channels https://software.repos.intel.com/python/conda/ && \ conda init --all && \ conda install -y \ - 'jupyterlab>=4.1.8' \ - 'notebook>=7.1.3' \ - 'jupyterhub>=4.1.5' \ - 'jupyter-server-proxy>=4.1.2' \ - 'mako>=1.2.2' \ - 'pyjwt>=2.4.0' \ - 'cryptography>=42.0.5' \ - 'nodejs>=20.12.2' \ + 'colorama==0.4.6' \ + 'conda==24.5.0' \ + 'jupyterhub==5.1.0' \ + 'jupyter-server-proxy==4.3.0' \ + 'mamba==1.5.8' \ + 'networkx==3.3' \ + 'notebook==7.2.1' \ + 'python==3.10.14' \ 'idna>=3.7' \ - 'tqdm>=4.66.2' \ - && \ - jupyter labextension disable "@jupyterlab/apputils-extension:announcements" && \ - conda clean -y --all + 'requests>=2.32.0' \ + 'setuptools>=70.0.0' \ + 'tqdm>=4.66.3' \ + 'urllib3>=2.2.2' \ + 'nodejs==22.5.1' \ + && \ + jupyter labextension disable "@jupyterlab/apputils-extension:announcements" \ + && \ + conda clean -y --all \ + && \ + conda config --add channels ${INTEL_CHANNEL} ENV PATH ${CONDA_ROOT}/condabin:${CONDA_ROOT}/bin/:${PATH} +RUN conda config --set pip_interop_enabled True ARG IDP_VERSION ARG DPNP_VERSION ARG MODIN_VERSION ARG NUMPY_VERSION -# data-analytics Env - conda packages -RUN conda create -yn data-analytics -c "${INTEL_CHANNEL}" -c conda-forge \ - dpnp="${DPNP_VERSION}" \ - numpy="${NUMPY_VERSION}" \ - python="${PYTHON_VERSION}" \ - modin-ray="${MODIN_VERSION}" \ - 'python-dotenv>=1.0.1' \ - 
'tqdm>=4.66.2' \ - 'matplotlib-base>=3.4.3' \ - 'threadpoolctl>=3.3.0' \ - 'ipython>=8.18.1' \ - 'ipykernel>=6.29.3' \ - 'kernda>=0.3.0' \ - 'protobuf>=4.24.4' \ - 'pillow>=10.2.0' \ +RUN conda create -yn data-analytics \ + "python=${PYTHON_VERSION}" \ + "dpnp=${DPNP_VERSION}" \ + 'ipykernel==6.29.5' \ + 'kernda==0.3.0' \ + 'matplotlib-base==3.8.4' \ + "modin-ray=${MODIN_VERSION}" \ + 'python-dotenv==1.0.1' \ 'idna>=3.7' \ - 'tornado>=6.3.3' && \ + 'requests>=2.32.0' \ + 'setuptools>=70.0.0' \ + 'tqdm>=4.66.3' \ + 'urllib3>=2.2.2' \ + && \ conda clean -y --all RUN conda run -n data-analytics python -m pip install --no-deps --no-cache-dir \ - 'dataset-librarian==1.0.4' \ - 'cloud-data-connector==1.0.3' - -FROM data-analytics-python as data-analytics-jupyter + 'dataset-librarian==1.0.4' EXPOSE 8888 @@ -122,10 +115,10 @@ RUN mkdir -p ~/jupyter/ && chmod -R a+rwx ~/jupyter/ && \ COPY --chown=dev notebooks /home/dev/jupyter COPY --chown=dev tests /home/dev/sample-tests -RUN "${CONDA_ROOT}/envs/data-analytics/bin/python" -m ipykernel install --user --name data-analytics --display-name "Data Analytics" && \ - "${CONDA_ROOT}/envs/data-analytics/bin/kernda" -o -y "$HOME/.local/share/jupyter/kernels/$(echo data-analytics | sed -e 's/\(.*\)/\L\1/')/kernel.json" && \ - "${CONDA_ROOT}/envs/data-analytics/bin/python" -m ipykernel.kernelspec --user && \ - conda clean -y --all +RUN KERNEL_DIR="${CONDA_ROOT}/share/jupyter/kernels/data-analytics" && \ + conda run -n data-analytics python -m ipykernel install --prefix "$CONDA_ROOT" --name data-analytics --display-name "Data Analytics" && \ + conda run -n data-analytics kernda -o -y "$KERNEL_DIR/kernel.json" && \ + conda run -n base jupyter kernelspec list CMD ["bash", "-c", "source activate data-analytics && jupyter lab --notebook-dir=~/jupyter --port 8888 --ip 0.0.0.0 --no-browser --allow-root"] diff --git a/preset/data-analytics/docker-compose.yaml b/preset/data-analytics/docker-compose.yaml index 99b37f6d..9c00331e 100644 --- 
a/preset/data-analytics/docker-compose.yaml +++ b/preset/data-analytics/docker-compose.yaml @@ -15,6 +15,7 @@ # -*- coding: utf-8 -*- # + version: '3' services: data-analytics: @@ -22,26 +23,26 @@ services: args: BASE_IMAGE: ${BASE_IMAGE:-ubuntu} BASE_TAG: ${BASE_TAG:-22.04} - DPNP_VERSION: ${NUMBA_DPEX_VERSION:-0.14.0} - IDP_VERSION: ${IDP_VERSION:-2024.1.0} + DPNP_VERSION: ${DPNP_VERSION:-0.15.0} + IDP_VERSION: ${IDP_VERSION:-2024.2} INTEL_CHANNEL: ${INTEL_CHANNEL:-https://software.repos.intel.com/python/conda/} - MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Linux-x86_64} - MODIN_VERSION: ${MODIN_VERSION:-0.26.1} - MPI_VERSION: ${MPI_VERSION:-2021.12.0} - NUMBA_DPEX_VERSION: ${NUMBA_DPEX_VERSION:-0.22.1} + MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Miniforge3-Linux-x86_64} + MODIN_VERSION: ${MODIN_VERSION:-0.30.0} NUMPY_VERSION: ${NUMPY_VERSION:-1.26.4} - PYTHON_VERSION: ${PYTHON_VERSION:-3.10} + PYTHON_VERSION: ${PYTHON_VERSION:-3.9} + XGBOOST_VERSION: ${XGBOOST_VERSION:-2.0.3} http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: '' context: . 
+ target: data-analytics labels: docs: data_analytics org.opencontainers.image.title: "Intel® AI Tools Selector Preset Containers - Data Analytics" org.opencontainers.base.name: "ubuntu:22.04" org.opencontainers.image.name: "intel/data-analytics" - org.opencontainers.image.version: 2024.1.0-py${PYTHON_VERSION:-3.10} - dependency.python: ${PYTHON_VERSION:-3.10} + org.opencontainers.image.version: 2024.2.0-py${PYTHON_VERSION:-3.9} + dependency.python: ${PYTHON_VERSION:-3.9} dependency.python.pip: requirements.txt dependency.apt.bzip2: true dependency.apt.ca-certificates: true @@ -55,34 +56,21 @@ services: dependency.apt.unzip: true dependency.apt.wget: true dependency.apt.xz-utils: true - dependency.conda.jupyterlab: '>=4.1.8' - dependency.conda.notebook: '>=7.1.3' - dependency.conda.jupyterhub: '>=4.1.5' - dependency.conda.jupyter-server-proxy: '>=4.1.2' - dependency.conda.mako: '>=1.2.2' - dependency.conda.pyjwt: '>=2.4.0' - dependency.conda.cryptography: '>=42.0.5' - dependency.conda.nodejs: '>=20.12.2' - dependency.conda.idna: '>=3.7' - dependency.conda.tqdm: '>=4.66.2' - dependency.conda.dpnp: '>=0.14.0' - dependency.conda.numpy: '>=1.26.4' - dependency.conda.python: "=${PYTHON_VERSION:-3.10}" - dependency.conda.modin-ray: '>=0.26.1' - dependency.conda.python-dotenv: '>=1.0.1' - dependency.conda.matplotlib-base: '>=3.4.3' - dependency.conda.dataset_librarian: '>=1.0.4' - dependency.conda.threadpoolctl: '>=3.3.0' - dependency.conda.ipython: '>=8.18.1' - dependency.conda.ipykernel: '>=6.29.3' - dependency.conda.kernda: '>=0.3.0' - dependency.conda.protobuf: '>=4.24.4' - dependency.conda.pillow: '>=10.2.0' - dependency.conda.tornado: '>=6.3.3' - target: data-analytics-jupyter - command: > - bash -c "conda run -n data-analytics python -c 'import modin.pandas as pd, modin.config as cfg; cfg.Engine.put(\"Ray\"); df = pd.DataFrame([1]);print(df+1)'" - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-data-analytics-2024.1.0-py${PYTHON_VERSION:-3.10} + 
dependency.conda.colorama: '==0.4.6' + dependency.conda.conda: '==24.5.0' + dependency.conda.dpnp: '=0.15.0' + dependency.conda.ipykernel: '==6.29.5' + dependency.conda.jupyterhub: '==5.1.0' + dependency.conda.jupyter-server-proxy: '==4.3.0' + dependency.conda.kernda: '==0.3.0' + dependency.conda.mamba: '==1.5.8' + dependency.conda.matplotlib-base: '==3.8.4' + dependency.conda.modin-ray: '=0.30.0' + dependency.conda.networkx: '==3.3' + dependency.conda.notebook: '==7.2.1' + dependency.conda.python: '==3.10.14' + dependency.conda.python-dotenv: '==1.0.1' + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-data-analytics-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} environment: http_proxy: ${http_proxy} https_proxy: ${https_proxy} @@ -90,3 +78,7 @@ services: shm_size: 12GB volumes: - /dev/dri/by-path:/dev/dri/by-path + command: > + bash -c " conda run -n data-analytics python -c 'import modin.pandas as pd;import + modin.config as cfg;cfg.Engine.put(\"Ray\");df = pd.DataFrame([1]);print(df+1)' + " diff --git a/preset/data-analytics/requirements.txt b/preset/data-analytics/requirements.txt index d231202d..8fe3dfff 100644 --- a/preset/data-analytics/requirements.txt +++ b/preset/data-analytics/requirements.txt @@ -1 +1 @@ -cloud-data-connector==1.0.3 +dataset-librarian==1.0.4 diff --git a/preset/data-analytics/tests.yaml b/preset/data-analytics/tests.yaml index 5aff8c81..846bf0b9 100644 --- a/preset/data-analytics/tests.yaml +++ b/preset/data-analytics/tests.yaml @@ -14,12 +14,12 @@ dataset-librarian-${PYTHON_VERSION:-3.9}: cmd: conda run -n data-analytics bash -c 'yes | python -m dataset_librarian.dataset -n msmarco --download -d ~/msmarco' - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-data-analytics-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-data-analytics-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} modin-${PYTHON_VERSION:-3.9}: cmd: conda run -n data-analytics sample-tests/modin/test_modin.sh - img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-data-analytics-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-data-analytics-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} shm_size: 10G modin-notebook-${PYTHON_VERSION:-3.9}: cmd: papermill --log-output jupyter/modin/IntelModin_Vs_Pandas.ipynb -k data-analytics - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-data-analytics-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-data-analytics-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True diff --git a/preset/deep-learning/Dockerfile b/preset/deep-learning/Dockerfile index 05721e11..213606b8 100644 --- a/preset/deep-learning/Dockerfile +++ b/preset/deep-learning/Dockerfile @@ -12,158 +12,148 @@ # See the License for the specific language governing permissions and # limitations under the License. + ARG BASE_IMAGE=ubuntu ARG BASE_TAG=22.04 -FROM ${BASE_IMAGE}:${BASE_TAG} AS dgpu-base +FROM ${BASE_IMAGE}:${BASE_TAG} AS deep-learning-base -ENV DEBIAN_FRONTEND=noninteractive +SHELL ["/bin/bash", "-c"] -# See http://bugs.python.org/issue19846 +ENV DEBIAN_FRONTEND=noninteractive ENV LANG C.UTF-8 ARG PYTHON_VERSION EXPOSE 8080 -ENV LANG=C.UTF-8 - -SHELL ["/bin/bash", "-c"] - RUN apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ - apt-utils \ - build-essential \ - bzip2 \ - ca-certificates \ - clinfo \ - cmake \ - diffutils \ - g++ \ - gcc \ - git \ - gnupg2 \ - gpg-agent \ - gzip \ - make \ - numactl \ - patch \ - rsync \ - unzip \ - wget \ - sudo \ - xz-utils && \ + apt-utils \ + build-essential \ + bzip2 \ + ca-certificates \ + clinfo \ + cmake \ + diffutils \ + g++ \ + gcc \ + git \ + gnupg2 \ + gpg-agent \ + gzip \ + make \ + numactl \ + patch \ + rsync \ + unzip \ + wget \ + sudo \ + xz-utils \ + && \ rm -rf /var/lib/apt/lists/* -# GPU Drivers setup ARG DEVICE ARG ICD_VER ARG LEVEL_ZERO_GPU_VER ARG LEVEL_ZERO_VER ARG 
LEVEL_ZERO_DEV_VER - -# Public Drivers link RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ - gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg -RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" | \ + gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg && \ + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" | \ tee /etc/apt/sources.list.d/intel-gpu-jammy.list RUN apt-get update && \ apt-get install -y --no-install-recommends --fix-missing \ - intel-opencl-icd="${ICD_VER}" \ - intel-level-zero-gpu="${LEVEL_ZERO_GPU_VER}" \ - level-zero="${LEVEL_ZERO_VER}" + intel-level-zero-gpu="${LEVEL_ZERO_GPU_VER}" \ + intel-opencl-icd="${ICD_VER}" \ + level-zero="${LEVEL_ZERO_VER}" RUN apt-get update && \ apt-get install -y --no-install-recommends --fix-missing \ - intel-media-va-driver-non-free \ - libmfx1 \ - libmfxgen1 \ - libvpl2 \ - libegl-mesa0 \ - libegl1-mesa \ - libegl1-mesa-dev \ - libgbm1 \ - libgl1-mesa-dev \ - libgl1-mesa-dri \ - libglapi-mesa \ - libgles2-mesa-dev \ - libglx-mesa0 \ - libigdgmm12 \ - libxatracker2 \ - mesa-va-drivers \ - mesa-vdpau-drivers \ - mesa-vulkan-drivers \ - va-driver-all \ - vainfo \ - hwinfo \ - clinfo + clinfo \ + hwinfo \ + intel-media-va-driver-non-free \ + libegl-mesa0 \ + libegl1-mesa \ + libegl1-mesa-dev \ + libgbm1 \ + libgl1-mesa-dev \ + libgl1-mesa-dri \ + libglapi-mesa \ + libgles2-mesa-dev \ + libglx-mesa0 \ + libigdgmm12 \ + libmfx1 \ + libmfxgen1 \ + libvpl2 \ + mesa-va-drivers \ + mesa-vdpau-drivers \ + mesa-vulkan-drivers \ + va-driver-all \ + vainfo RUN apt-get install -y --no-install-recommends --fix-missing \ - libigc-dev \ - intel-igc-cm \ - libigdfcl-dev \ - libigfxcmrt-dev \ - level-zero-dev="${LEVEL_ZERO_DEV_VER}" && \ - rm -rf /var/lib/apt/lists/* - -RUN rm /etc/apt/sources.list.d/*list + 
intel-igc-cm \ + libigc-dev \ + libigdfcl-dev \ + libigfxcmrt-dev \ + level-zero-dev="${LEVEL_ZERO_DEV_VER}" \ + && \ + rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/*list -FROM dgpu-base as deep-learning-python - -# Setting up non-root directories RUN useradd --uid 1000 -d /home/dev -s /bin/bash dev RUN groupadd -g 109 render -## Add the user to the required groups RUN usermod -aG root,sudo,video,render dev -# Set a password for the user (Optional) RUN echo 'dev:password' | chpasswd USER dev WORKDIR /home/dev ENV CONDA_ROOT=/home/dev/conda - -# Miniforge Python Installation ARG MINIFORGE_VERSION ARG PYTHON_VERSION ARG IDP_VERSION ARG INTEL_CHANNEL -RUN wget --progress=dot:giga --no-check-certificate "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-${MINIFORGE_VERSION}.sh" -O miniforge.sh && \ +RUN wget --progress=dot:giga --no-check-certificate "https://github.com/conda-forge/miniforge/releases/latest/download/${MINIFORGE_VERSION}.sh" -O miniforge.sh && \ chmod +x miniforge.sh && \ ./miniforge.sh -b -p "${CONDA_ROOT}" && \ rm ./miniforge.sh && \ - ln -s "${CONDA_ROOT}" "${CONDA_ROOT}/../miniforge3" && \ + ln -s "${CONDA_ROOT}" "${CONDA_ROOT}/../miniforge" && \ export PATH="${CONDA_ROOT}/bin/:${PATH}" && \ - conda update -y conda && \ - conda config --add channels conda-forge && \ - conda config --add channels https://software.repos.intel.com/python/conda/ && \ conda init --all && \ - conda install -c conda-forge \ - 'jupyterlab>=4.1.8' \ - 'notebook>=7.1.3' \ - 'jupyterhub>=4.1.5' \ - 'jupyter-server-proxy>=4.1.2' \ - 'mako>=1.2.2' \ - 'pyjwt>=2.4.0' \ - 'cryptography>=42.0.5' \ - 'nodejs>=20.12.2' \ + conda install -y \ + 'colorama==0.4.6' \ + 'conda==24.5.0' \ + 'jupyter-server-proxy==4.3.0' \ + 'jupyterhub==5.1.0' \ + 'ld_impl_linux-64==2.40' \ + 'mamba==1.5.8' \ + 'networkx==3.3' \ + 'notebook==7.2.1' \ + 'python==3.10.14' \ + 'aiohttp>=3.9.4' \ + 'certifi>=2024.07.04' \ 'idna>=3.7' \ - 'tqdm>=4.66.2' \ - && \ - jupyter 
labextension disable "@jupyterlab/apputils-extension:announcements" && \ - conda clean -y --all + 'jinja2>=3.1.4' \ + 'requests>=2.32.0' \ + 'setuptools>=70.0.0' \ + 'tqdm>=4.66.3' \ + 'urllib3>=2.2.2' \ + 'zipp>=3.19.1' \ + 'nodejs==22.5.1' \ + && \ + jupyter labextension disable "@jupyterlab/apputils-extension:announcements" \ + && \ + conda clean -y --all \ + && \ + conda config --add channels ${INTEL_CHANNEL} ENV PATH ${CONDA_ROOT}/condabin:${CONDA_ROOT}/bin/:${PATH} +RUN conda config --set pip_interop_enabled True -RUN conda config --set pip_interop_enabled True # Improve interoperabilty among conda an pypi packages - - -# PyTorch Installation -ARG IDP_VERSION ARG DPNP_VERSION ARG NUMPY_VERSION - ARG TORCH_CPU_VERSION ARG ONECCL_CPU_VERSION ARG IPEX_CPU_VERSION @@ -171,120 +161,94 @@ ARG TORCHVISION_CPU_VERSION ARG TORCHAUDIO_CPU_VERSION ARG DEEPSPEED_VERSION -# PyTorch CPU Env - conda packages -RUN conda create -yn pytorch-cpu -c "${INTEL_CHANNEL}" -c conda-forge \ - dpnp="${DPNP_VERSION}" \ - numpy="${NUMPY_VERSION}" \ - python="${PYTHON_VERSION}" \ - intel-openmp="${IDP_VERSION}" \ - pytorch="${TORCH_CPU_VERSION}" \ - oneccl_bind_pt="${ONECCL_CPU_VERSION}" \ - intel-extension-for-pytorch="${IPEX_CPU_VERSION}" \ - torchvision="${TORCHVISION_CPU_VERSION}" \ - torchaudio="${TORCHAUDIO_CPU_VERSION}" \ - 'matplotlib-base>=3.4.3' \ - 'ipykernel>=6.29.3' \ - 'kernda>=0.3.0' \ - 'pillow>=10.2.0' \ - 'aiohttp>=3.9.0' \ - 'tornado>=6.3.3' \ - 'jinja2>=3.1.3' \ +RUN conda create -yn 'pytorch-cpu' \ + -c huggingface \ + "python=${PYTHON_VERSION}" \ + 'accelerate==0.32.1' \ + "dpnp=${DPNP_VERSION}" \ + "intel-extension-for-pytorch=${IPEX_CPU_VERSION}" \ + 'ipykernel==6.29.5' \ + 'kernda==0.3.0' \ + 'matplotlib-base>=3.8.4' \ + "oneccl_bind_pt=${ONECCL_CPU_VERSION}" \ + "pytorch=${TORCH_CPU_VERSION}" \ + 'tensorboardx==2.6.2.2' \ + "torchaudio=${TORCHAUDIO_CPU_VERSION}" \ + "torchvision=${TORCHVISION_CPU_VERSION}" \ + 'python-dotenv==1.0.1' \ + 'aiohttp>=3.9.4' \ + 
'certifi>=2024.07.04' \ 'idna>=3.7' \ - 'onnx>=1.15.0' \ + 'jinja2>=3.1.4' \ + 'onnx>=1.16.0' \ + 'requests>=2.32.0' \ + 'tqdm>=4.66.3' \ + 'urllib3>=2.2.2' \ + 'zipp>=3.19.1' \ && \ conda clean -y --all -# PyPI packages -RUN conda run -n pytorch-cpu pip install --no-deps --no-cache-dir --ignore-installed \ - 'ninja>=1.11.1.1' \ - 'python-dotenv>=1.0.1' \ - 'tqdm>=4.66.2' \ - 'cloud-data-connector==1.0.3' \ - 'dataset-librarian==1.0.4' && \ - conda run -n pytorch-cpu pip install --no-cache-dir --ignore-installed \ - 'transformers>=4.40.2' \ - 'datasets>=2.19.1' \ - 'evaluate>=0.4.2' && \ - conda run -n pytorch-cpu pip install --no-cache-dir -U 'accelerate>=0.30.0' && \ - conda run -n pytorch-cpu pip install --no-cache-dir "git+https://github.com/huggingface/optimum-intel.git" && \ +RUN conda run -n 'pytorch-cpu' pip install --no-deps --no-cache-dir \ + 'dataset-librarian==1.0.4' \ + && \ + conda run -n 'pytorch-cpu' pip install --no-cache-dir \ + 'evaluate==0.4.2' \ + "git+https://github.com/huggingface/optimum-intel.git" \ + && \ conda clean -y --all - - -RUN conda run -n pytorch-cpu conda install 'protobuf=4.24' -c conda-forge --override --force-reinstall -y - -# PyTorch Installation ARG IDP_VERSION ARG DPNP_VERSION ARG NUMPY_VERSION - -ARG TORCH_GPU_VERSION -ARG ONECCL_GPU_VERSION -ARG IPEX_GPU_VERSION -ARG TORCHVISION_GPU_VERSION -ARG TORCHAUDIO_GPU_VERSION +ARG TORCH_XPU_VERSION +ARG ONECCL_XPU_VERSION +ARG IPEX_XPU_VERSION +ARG TORCHVISION_XPU_VERSION +ARG TORCHAUDIO_XPU_VERSION ARG IDEX_VERSION -ARG DEEPSPEED_VERSION -# PyTorch GPU Env - conda packages -RUN conda create -yn pytorch-gpu -c "${INTEL_CHANNEL}" -c conda-forge \ - dpnp="${DPNP_VERSION}" \ - dpcpp-cpp-rt="${IDP_VERSION}" \ - mkl-dpcpp="${IDP_VERSION}" \ - dpcpp_impl_linux-64="${IDP_VERSION}" \ - numpy="${NUMPY_VERSION}" \ - python="${PYTHON_VERSION}" \ - intel-openmp="${IDP_VERSION}" \ - python="${PYTHON_VERSION}" \ - pytorch="${TORCH_GPU_VERSION}" \ - oneccl_bind_pt="${ONECCL_GPU_VERSION}" \ - 
intel-extension-for-pytorch="${IPEX_GPU_VERSION}" \ - torchvision="${TORCHVISION_GPU_VERSION}" \ - torchaudio="${TORCHAUDIO_GPU_VERSION}" \ - 'tensorboardx>=2.6.2.2' \ - 'matplotlib-base>=3.4.3' \ - 'pandas>=2.2.2' \ - 'ipython>=8.18.1' \ - 'ipykernel>=6.29.3' \ - 'kernda>=0.3.0' \ - 'pillow>=10.2.0' \ - 'aiohttp>=3.9.0' \ - 'tornado>=6.3.3' \ - 'jinja2>=3.1.3' \ +RUN conda create -yn 'pytorch-gpu' \ + -c huggingface \ + "python=${PYTHON_VERSION}" \ + 'accelerate==0.32.1' \ + "dpnp=${DPNP_VERSION}" \ + "intel-extension-for-pytorch=${IPEX_XPU_VERSION}" \ + 'ipykernel==6.29.5' \ + 'kernda==0.3.0' \ + 'matplotlib-base>=3.8.4' \ + "oneccl_bind_pt=${ONECCL_XPU_VERSION}" \ + "pytorch=${TORCH_XPU_VERSION}" \ + 'tensorboardx==2.6.2.2' \ + "torchaudio=${TORCHAUDIO_XPU_VERSION}" \ + "torchvision=${TORCHVISION_XPU_VERSION}" \ + 'python-dotenv==1.0.1' \ + 'aiohttp>=3.9.4' \ + 'certifi>=2024.07.04' \ 'idna>=3.7' \ - 'onnx>=1.15.0' \ - 'packaging=23.2' \ - 'setuptools=69.1.0' \ + 'jinja2>=3.1.4' \ + 'onnx>=1.16.0' \ + 'requests>=2.32.0' \ + 'tqdm>=4.66.3' \ + 'urllib3>=2.2.2' \ + 'zipp>=3.19.1' \ && \ conda clean -y --all -# PyPI packages -RUN conda run -n pytorch-gpu pip install --no-deps --no-cache-dir --ignore-installed \ - 'ninja>=1.11.1.1' \ - 'python-dotenv>=1.0.1' \ - 'tqdm>=4.66.2' \ - 'cloud-data-connector==1.0.3' \ - 'dataset-librarian==1.0.4' && \ - conda run -n pytorch-gpu pip install --no-cache-dir --ignore-installed \ - 'transformers>=4.40.2' \ - 'datasets>=2.19.1' \ - 'evaluate>=0.4.2' && \ - conda run -n pytorch-gpu pip install --no-cache-dir -U 'accelerate>=0.30.0' && \ - conda run -n pytorch-gpu pip install --no-cache-dir "git+https://github.com/huggingface/optimum-intel.git" && \ +RUN conda run -n 'pytorch-gpu' pip install --no-deps --no-cache-dir \ + 'dataset-librarian==1.0.4' \ + && \ + conda run -n 'pytorch-gpu' pip install --no-cache-dir \ + 'evaluate==0.4.2' \ + "git+https://github.com/huggingface/optimum-intel.git" \ + && \ conda clean -y --all - - -RUN 
conda run -n pytorch-gpu conda install 'protobuf=4.24' -c conda-forge --override --force-reinstall -y - - -# TensorFlow Installation ARG IDP_VERSION ARG DPNP_VERSION ARG NUMPY_VERSION - ARG TF_VERSION -ARG ITEX_VERSION +ARG ITEX_CPU_VERSION +ARG ITEX_XPU_VERSION ARG HOROVOD_VERSION ARG IMPI_VERSION @@ -293,149 +257,122 @@ ARG HOROVOD_WITHOUT_MXNET=1 ARG HOROVOD_WITHOUT_GLOO=1 ARG HOROVOD_WITH_MPI=1 - -# Tensorflow Env - conda packages -RUN conda create -yn tensorflow-cpu -c "${INTEL_CHANNEL}" -c conda-forge \ - dpnp="${DPNP_VERSION}" \ - dpcpp-cpp-rt="${IDP_VERSION}" \ - mkl-dpcpp="${IDP_VERSION}" \ - numpy="${NUMPY_VERSION}" \ - python="${PYTHON_VERSION}" \ - intel-extension-for-tensorflow="${ITEX_VERSION}=*cpu*" \ - intel-optimization-for-horovod="${INTEL_HOROVOD}" \ - tensorflow="${TF_VERSION}" \ - impi-devel="${IMPI_VERSION}" \ - 'matplotlib-base>=3.4.3' \ - 'ipython>=8.18.1' \ - 'ipykernel>=6.29.3' \ - 'kernda>=0.3.0' \ - 'pillow>=10.2.0' \ - 'cryptography>=42.0.4' \ - 'werkzeug>=2.2.3' \ - 'aiohttp>=3.9.0' \ - 'tornado>=6.3.3' \ - 'pyjwt>=2.8.0' \ - 'oauthlib>=3.2.2' \ - 'idna>=3.7' \ +RUN conda create -yn 'tensorflow-cpu' \ + "python=${PYTHON_VERSION}" \ + "dpnp=${DPNP_VERSION}" \ + "intel-extension-for-tensorflow=${ITEX_CPU_VERSION}=*cpu*" \ + "intel-optimization-for-horovod=${HOROVOD_VERSION}" \ + 'ipykernel==6.29.5' \ + 'kernda==0.3.0' \ + 'matplotlib-base>=3.8.4' \ 'onnx>=1.14.1' \ + 'py-cpuinfo==9.0.0' \ + "tensorflow=${TF_VERSION}" \ + 'tensorflow-hub==0.16.1' \ + 'tqdm==4.66.4' \ + 'python-dotenv==1.0.1' \ + 'aiohttp>=3.9.4' \ + 'certifi>=2024.07.04' \ + 'idna>=3.7' \ + 'requests>=2.32.0' \ + 'urllib3>=2.2.2' \ + 'werkzeug>=3.0.3' \ + 'zipp>=3.19.1' \ && \ conda clean -y --all -# PyPI packages -RUN conda run -n tensorflow-cpu pip install --no-cache-dir --ignore-installed \ - 'py-cpuinfo>=9.0.0' \ - 'requests>=2.31.0' \ - 'cryptography>=42.0.7' -RUN conda run -n tensorflow-cpu pip install --no-deps --no-cache-dir --ignore-installed \ - 
'tensorflow-hub>=0.16.1' \ - 'tqdm>=4.66.2' \ +RUN conda run -n 'tensorflow-cpu' pip install --no-deps --no-cache-dir \ 'dataset-librarian==1.0.4' \ - 'cloud-data-connector>=1.0.3' && \ + && \ conda clean -y --all -# Tensorflow Env - conda packages -RUN conda create -yn tensorflow-gpu -c "${INTEL_CHANNEL}" -c conda-forge \ - dpnp="${DPNP_VERSION}" \ - dpcpp-cpp-rt="${IDP_VERSION}" \ - mkl-dpcpp="${IDP_VERSION}" \ - numpy="${NUMPY_VERSION}" \ - python="${PYTHON_VERSION}" \ - intel-extension-for-tensorflow="${ITEX_VERSION}=*xpu*" \ - intel-optimization-for-horovod="${INTEL_HOROVOD}" \ - tensorflow="${TF_VERSION}" \ - impi-devel="${IMPI_VERSION}" \ - 'matplotlib-base>=3.4.3' \ - 'ipython>=8.18.1' \ - 'ipykernel>=6.29.3' \ - 'kernda>=0.3.0' \ - 'pillow>=10.2.0' \ - 'cryptography>=42.0.4' \ - 'werkzeug>=2.2.3' \ - 'aiohttp>=3.9.0' \ - 'tornado>=6.3.3' \ - 'pyjwt>=2.8.0' \ - 'oauthlib>=3.2.2' \ - 'idna>=3.7' \ +RUN conda create -yn 'tensorflow-gpu' \ + "python=${PYTHON_VERSION}" \ + "dpnp=${DPNP_VERSION}" \ + "intel-extension-for-tensorflow=${ITEX_XPU_VERSION}=*xpu*" \ + "intel-optimization-for-horovod=${HOROVOD_VERSION}" \ + 'ipykernel==6.29.5' \ + 'kernda==0.3.0' \ + 'matplotlib-base>=3.8.4' \ 'onnx>=1.14.1' \ - 'packaging=23.2' \ - 'setuptools=69.1.0' \ + 'py-cpuinfo==9.0.0' \ + "tensorflow=${TF_VERSION}" \ + 'tensorflow-hub==0.16.1' \ + 'tqdm==4.66.4' \ + 'python-dotenv==1.0.1' \ + 'aiohttp>=3.9.4' \ + 'certifi>=2024.07.04' \ + 'idna>=3.7' \ + 'requests>=2.32.0' \ + 'urllib3>=2.2.2' \ + 'zipp>=3.19.1' \ && \ conda clean -y --all -# PyPI packages -RUN conda run -n tensorflow-gpu pip install --no-cache-dir --ignore-installed \ - 'py-cpuinfo>=9.0.0' \ - 'requests>=2.31.0' \ - 'cryptography>=42.0.7' -RUN conda run -n tensorflow-gpu pip install --no-deps --no-cache-dir --ignore-installed \ - 'tensorflow-hub>=0.16.1' \ - 'tqdm>=4.66.2' \ +RUN conda run -n 'tensorflow-gpu' pip install --no-deps --no-cache-dir \ 'dataset-librarian==1.0.4' \ - 'cloud-data-connector==1.0.3' && 
\ + && \ conda clean -y --all -FROM deep-learning-python as deep-learning-jupyter - -ARG KERNEL_NAME_TF_CPU="Intel TensorFlow cpu" -ARG KERNEL_NAME_TF_GPU="Intel TensorFlow gpu" -ARG KERNEL_NAME_PT_CPU="Intel PyTorch cpu" -ARG KERNEL_NAME_PT_GPU="Intel PyTorch gpu" - EXPOSE 8888 RUN mkdir -p ~/jupyter/ && chmod -R a+rwx ~/jupyter/ && \ mkdir ~/.local && chmod a+rwx ~/.local RUN \ - "${CONDA_ROOT}/envs/pytorch-cpu/bin/python" -m ipykernel install --user --name pytorch-cpu --display-name "${KERNEL_NAME_PT_CPU}" && \ - "${CONDA_ROOT}/envs/pytorch-cpu/bin/kernda" -o -y "$HOME/.local/share/jupyter/kernels/$(echo pytorch-cpu | sed -e 's/\(.*\)/\L\1/')/kernel.json" && \ - "${CONDA_ROOT}/envs/pytorch-gpu/bin/python" -m ipykernel install --user --name pytorch-gpu --display-name "${KERNEL_NAME_PT_GPU}" && \ - "${CONDA_ROOT}/envs/pytorch-gpu/bin/kernda" -o -y "$HOME/.local/share/jupyter/kernels/$(echo pytorch-gpu | sed -e 's/\(.*\)/\L\1/')/kernel.json" && \ - "${CONDA_ROOT}/envs/tensorflow-cpu/bin/python" -m ipykernel install --user --name tensorflow-cpu --display-name "${KERNEL_NAME_TF_CPU}" && \ - "${CONDA_ROOT}/envs/tensorflow-cpu/bin/kernda" -o -y "$HOME/.local/share/jupyter/kernels/$(echo tensorflow-cpu | sed -e 's/\(.*\)/\L\1/')/kernel.json" && \ - "${CONDA_ROOT}/envs/tensorflow-gpu/bin/python" -m ipykernel install --user --name tensorflow-gpu --display-name "${KERNEL_NAME_TF_GPU}" && \ - "${CONDA_ROOT}/envs/tensorflow-gpu/bin/kernda" -o -y "$HOME/.local/share/jupyter/kernels/$(echo tensorflow-gpu | sed -e 's/\(.*\)/\L\1/')/kernel.json" && \ - python -m ipykernel.kernelspec --user + ENVS_LIST=('pytorch-cpu' 'pytorch-gpu' 'tensorflow-cpu' 'tensorflow-gpu') && \ + KERNEL_NAMES=('Intel PyTorch CPU' 'Intel PyTorch GPU' 'Intel TensorFlow CPU' 'Intel TensorFlow GPU') && \ + for i in "${!ENVS_LIST[@]}"; do \ + CONDA_ENV="${ENVS_LIST[i]}" && \ + KERNEL_NAME="${KERNEL_NAMES[i]}" && \ + KERNEL_DIR="${CONDA_ROOT}/share/jupyter/kernels/$CONDA_ENV" && \ + conda run -n "$CONDA_ENV" 
python -m ipykernel install --prefix "$CONDA_ROOT" --name "$CONDA_ENV" --display-name "$KERNEL_NAME" && \ + conda run -n "$CONDA_ENV" kernda -o -y "$KERNEL_DIR/kernel.json" && \ + conda run -n base jupyter kernelspec list \ + ; done CMD ["bash", "-c", "jupyter lab --notebook-dir=~/jupyter --port 8888 --ip 0.0.0.0 --no-browser --allow-root"] -FROM deep-learning-jupyter as distributed-deep-learning +FROM deep-learning-base as deep-learning +SHELL ["/bin/bash", "-c"] USER root -# Install OpenMPI -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - libopenmpi-dev \ - openmpi-bin \ - openmpi-common +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends --fix-missing \ + libopenmpi-dev \ + openmpi-bin \ + openmpi-common ENV OMPI_ALLOW_RUN_AS_ROOT=1 ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ENV OMPI_MCA_tl_tcp_if_exclude="lo,docker0" -# Install OpenSSH RUN apt-get install -y --no-install-recommends --fix-missing \ - openssh-client \ - openssh-server && \ - rm /etc/ssh/ssh_host_*_key \ - /etc/ssh/ssh_host_*_key.pub && \ - rm -rf /var/lib/apt/lists/* - -RUN mkdir -p /var/run/sshd && \ + openssh-client \ + openssh-server \ + && \ + rm -rf \ + /etc/ssh/ssh_host_*_key \ + /etc/ssh/ssh_host_*_key.pub \ + /var/lib/apt/lists/* \ + && \ + mkdir -p /var/run/sshd \ + && \ echo 'LoginGraceTime 0' >> /etc/ssh/sshd_config -# https://github.com/openucx/ucx/issues/4742#issuecomment-584059909 ENV UCX_TLS=ud,sm,self USER dev -RUN conda install -n pytorch-cpu -c "${INTEL_CHANNEL}" -c conda-forge \ - deepspeed="${DEEPSPEED_VERSION}" \ - 'tensorboardx>=2.6.2.2' - -RUN conda install -n pytorch-gpu -c "${INTEL_CHANNEL}" -c conda-forge \ - deepspeed="${DEEPSPEED_VERSION}" \ - 'tensorboardx>=2.6.2.2' +RUN ENVS_LIST=('pytorch-cpu' 'pytorch-gpu') && \ + for i in "${!ENVS_LIST[@]}"; do \ + CONDA_ENV="${ENVS_LIST[i]}" && \ + conda install -yn "$CONDA_ENV" \ + "deepspeed=${DEEPSPEED_VERSION}" \ + 'tensorboardx==2.6.2.2' \ + ; done && \ + conda clean -y 
--all COPY --chown=dev notebooks /home/dev/jupyter COPY --chown=dev tests /home/dev/sample-tests diff --git a/preset/deep-learning/docker-compose.yaml b/preset/deep-learning/docker-compose.yaml index 663e064c..023b6f82 100644 --- a/preset/deep-learning/docker-compose.yaml +++ b/preset/deep-learning/docker-compose.yaml @@ -15,6 +15,7 @@ # -*- coding: utf-8 -*- # + version: '3' services: dl-base: @@ -22,44 +23,42 @@ services: args: BASE_IMAGE: ${BASE_IMAGE:-ubuntu} BASE_TAG: ${BASE_TAG:-22.04} - DEEPSPEED_VERSION: ${DEEPSPEED_VERSION:-0.14.0} + DEEPSPEED_VERSION: ${DEEPSPEED_VERSION:-0.14.2} DEVICE: ${DEVICE:-flex} - DPNP_VERSION: ${NUMBA_DPEX_VERSION:-0.14.0} - HOROVOD_VERSION: ${HOROVOD_VERSION:-0.28.1.4} - ICD_VER: 23.43.27642.40-803~22.04 - IDP_VERSION: ${IDP_VERSION:-2024.1.0} - IMPI_VERSION: ${IMPI_VERSION:-2021.12} + DPNP_VERSION: ${DPNP_VERSION:-0.15.0} + HOROVOD_VERSION: ${HOROVOD_VERSION:-0.28.1.5} + ICD_VER: 23.43.27642.52-803~22.04 + IDP_VERSION: ${IDP_VERSION:-2024.2} + IMPI_VERSION: ${IMPI_VERSION:-2021.13} INTEL_CHANNEL: ${INTEL_CHANNEL:-https://software.repos.intel.com/python/conda/} - IPEX_CPU_VERSION: ${IPEX_CPU_VERSION:-2.2.0=*cpu*} - IPEX_GPU_VERSION: ${IPEX_GPU_VERSION:-2.1.20=*xpu*} - ITEX_VERSION: ${ITEX_VERSION:-2.15} + IPEX_CPU_VERSION: ${IPEX_CPU_VERSION:-2.3.100} + IPEX_XPU_VERSION: ${IPEX_XPU_VERSION:-2.1.40} + ITEX_CPU_VERSION: ${ITEX_CPU_VERSION:-2.15.0} + ITEX_XPU_VERSION: ${ITEX_XPU_VERSION:-2.15.0.1} LEVEL_ZERO_DEV_VER: 1.14.0-744~22.04 - LEVEL_ZERO_GPU_VER: 1.3.27642.40-803~22.04 + LEVEL_ZERO_GPU_VER: 1.3.27642.52-803~22.04 LEVEL_ZERO_VER: 1.14.0-744~22.04 - MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Linux-x86_64} - MPI_VERSION: ${MPI_VERSION:-2021.12.0} - NUMBA_DPEX_VERSION: ${NUMBA_DPEX_VERSION:-0.22.1} + MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Miniforge3-Linux-x86_64} + MPI_VERSION: ${MPI_VERSION:-2021.13} + NUMBA_DPEX_VERSION: ${NUMBA_DPEX_VERSION:-0.23.0} NUMPY_VERSION: ${NUMPY_VERSION:-1.26.4} - ONECCL_CPU_VERSION: 
${ONECCL_CPU_VERSION:-2.2.0=*cpu*} - ONECCL_GPU_VERSION: ${ONECCL_GPU_VERSION:-2.1.200=*xpu*} - PYTHON_VERSION: ${PYTHON_VERSION:-3.10} - TF_VERSION: ${TF_VERSION:-2.15} - TORCHAUDIO_CPU_VERSION: ${TORCHAUDIO_CPU_VERSION:-2.2.0=*cpu*} - TORCHAUDIO_GPU_VERSION: ${TORCHAUDIO_GPU_VERSION:-2.1.0=*xpu*} - TORCHVISION_CPU_VERSION: ${TORCHVISION_CPU_VERSION:-0.17=*cpu*} - TORCHVISION_GPU_VERSION: ${TORCHVISION_GPU_VERSION:-0.16.0=*xpu*} - TORCH_CPU_VERSION: ${TORCH_CPU_VERSION:-2.2.0=*cpu*} - TORCH_GPU_VERSION: ${TORCH_GPU_VERSION:-2.1.0=*xpu*} + ONECCL_CPU_VERSION: ${ONECCL_CPU_VERSION:-2.3.0} + ONECCL_XPU_VERSION: ${ONECCL_XPU_VERSION:-2.1.400} + PYTHON_VERSION: ${PYTHON_VERSION:-3.9} + TF_VERSION: ${TF_VERSION:-2.15.1} + TORCHAUDIO_CPU_VERSION: ${TORCHAUDIO_CPU_VERSION:-2.3.1} + TORCHAUDIO_XPU_VERSION: ${TORCHAUDIO_XPU_VERSION:-2.1.0} + TORCHVISION_CPU_VERSION: ${TORCHVISION_CPU_VERSION:-0.18.1} + TORCHVISION_XPU_VERSION: ${TORCHVISION_XPU_VERSION:-0.16.0} + TORCH_CPU_VERSION: ${TORCH_CPU_VERSION:-2.3.1} + TORCH_XPU_VERSION: ${TORCH_XPU_VERSION:-2.1.0} http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: '' context: . 
labels: docs: false - target: deep-learning-jupyter - command: | - bash -c "conda run -n pytorch-cpu python -c 'import torch;print(torch.__version__);import intel_extension_for_pytorch as ipex;print(ipex.__version__);' && \ - conda run -n tensorflow-cpu python -c 'import tensorflow as tf; print(tf.__version__)'" + target: deep-learning-base environment: http_proxy: ${http_proxy} https_proxy: ${https_proxy} @@ -67,15 +66,24 @@ services: shm_size: 12GB volumes: - /dev/dri/by-path:/dev/dri/by-path + command: > + bash -c " conda run -n pytorch-cpu python -c 'import torch;print(torch.__version__);import + intel_extension_for_pytorch as ipex;print(ipex.__version__);' && + + conda run -n tensorflow-cpu python -c 'import tensorflow as tf;print(tf.__version__)' + " + + deep-learning: build: + target: deep-learning labels: docs: deep_learning org.opencontainers.image.title: "Intel® AI Tools Selector Preset Containers - Deep Learning" org.opencontainers.base.name: "ubuntu:22.04" org.opencontainers.image.name: "intel/deep-learning" - org.opencontainers.image.version: 2024.1.0-py${PYTHON_VERSION:-3.10} - dependency.python: ${PYTHON_VERSION:-3.10} + org.opencontainers.image.version: 2024.2.0-py${PYTHON_VERSION:-3.9} + dependency.python: ${PYTHON_VERSION:-3.9} dependency.python.pip: requirements.txt dependency.apt.apt-utils: true dependency.apt.build-essential: true @@ -92,11 +100,11 @@ services: dependency.apt.gzip: true dependency.apt.hwinfo: true dependency.apt.intel-igc-cm: true - dependency.apt.intel-level-zero-gpu: '=1.3.27642.40-803~22.04' + dependency.apt.intel-level-zero-gpu: true dependency.apt.intel-media-va-driver-non-free: true - dependency.apt.intel-opencl-icd: '=23.43.27642.40-803~22.04' - dependency.apt.level-zero: '=1.14.0-744~22.04' - dependency.apt.level-zero-dev: '=1.14.0-744~22.04' + dependency.apt.intel-opencl-icd: true + dependency.apt.level-zero: true + dependency.apt.level-zero-dev: true dependency.apt.libegl1-mesa: true dependency.apt.libegl1-mesa-dev: 
true dependency.apt.libegl-mesa0: true @@ -114,7 +122,6 @@ services: dependency.apt.libmfxgen1: true dependency.apt.libopenmpi-dev: true dependency.apt.libvpl2: true - dependency.apt.libxatracker2: true dependency.apt.make: true dependency.apt.mesa-va-drivers: true dependency.apt.mesa-vdpau-drivers: true @@ -132,69 +139,71 @@ services: dependency.apt.vainfo: true dependency.apt.wget: true dependency.apt.xz-utils: true - dependency.conda.jupyterlab: '>=4.1.8' - dependency.conda.aiohttp: '>=3.9.0' - dependency.conda.cryptography: '>=42.0.4' - dependency.conda.dataset_librarian: '>=1.0.4' - dependency.conda.deepspeed: '=0.14.0' - dependency.conda.dpcpp_impl_linux-64: '=2024.1.0' - dependency.conda.dpcpp-cpp-rt: '=2024.1.0' - dependency.conda.dpnp: '=0.14.0' - dependency.conda.idna: '>=3.7' - dependency.conda.impi-devel: '=2021.12' - dependency.conda.intel-extension-for-pytorch_cpu: '=2.2.0=*cpu*' - dependency.conda.intel-extension-for-pytorch_gpu: '=2.1.20=*xpu*' - dependency.conda.intel-extension-for-tensorflow_cpu: '=2.15=*cpu*' - dependency.conda.intel-extension-for-tensorflow_gpu: '=2.15=*xpu*' - dependency.conda.intel-openmp: '=2024.1.0' - dependency.conda.intel-optimization-for-horovod: '=0.28.1.4' - dependency.conda.ipykernel: '>=6.29.3' - dependency.conda.ipython: '>=8.18.1' - dependency.conda.jinja2: '>=3.1.3' - dependency.conda.jupyterhub: '>=4.1.5' - dependency.conda.jupyter-server-proxy: '>=4.1.2' - dependency.conda.kernda: '>=0.3.0' - dependency.conda.mako: '>=1.2.2' - dependency.conda.matplotlib-base: '>=3.4.3' - dependency.conda.mkl-dpcpp: '2024.1.0' - dependency.conda.nodejs: '>=20.12.2' - dependency.conda.notebook: '>=7.1.3' - dependency.conda.numpy: '=1.26.4' - dependency.conda.oauthlib: '>=3.2.2' - dependency.conda.oneccl_bind_pt_cpu: '=2.2.0=*cpu*' - dependency.conda.oneccl_bind_pt_gpu: '=2.1.200=*xpu*' + dependency.conda.accelerate: '==0.32.1' + dependency.conda.colorama: '==0.4.6' + dependency.conda.conda: '==24.5.0' + dependency.conda.dpnp: 
'=0.15.0' + dependency.conda.intel-extension-for-pytorch_cpu: '=2.3.100' + dependency.conda.intel-extension-for-pytorch_xpu: '=2.1.40' + dependency.conda.intel-extension-for-tensorflow_cpu: '=2.15.0=*cpu*' + dependency.conda.intel-extension-for-tensorflow_xpu: '=2.15.0.1=*xpu*' + dependency.conda.intel-optimization-for-horovod: '=0.28.1.5' + dependency.conda.ipykernel: '==6.29.5' + dependency.conda.jupyterhub: '==5.1.0' + dependency.conda.jupyter-server-proxy: '==4.3.0' + dependency.conda.kernda: '==0.3.0' + dependency.conda.ld_impl_linux-64: '==2.40' + dependency.conda.mamba: '==1.5.8' + dependency.conda.matplotlib-base: '>=3.8.4' + dependency.conda.mpi: '==1.0' + dependency.conda.mpich: '==4.2.2' + dependency.conda.networkx: '==3.3' + dependency.conda.notebook: '==7.2.1' + dependency.conda.oneccl_bind_pt_cpu: '=2.3.0' + dependency.conda.oneccl_bind_pt_xpu: '=2.1.400' dependency.conda.onnx: '>=1.14.1' - dependency.conda.packaging: '=23.2' - dependency.conda.pandas: '>=2.2.2' - dependency.conda.pillow: '>=10.2.0' - dependency.conda.protobuf: '=4.24' - dependency.conda.pyjwt: '>=2.4.0' - dependency.conda.python: "=${PYTHON_VERSION:-3.10}" - dependency.conda.pytorch_cpu: '=2.2.0=*cpu*' - dependency.conda.pytorch_gpu: '=2.1.0=*xpu*' - dependency.conda.setuptools: '=69.1.0' - dependency.conda.tensorboardx: '>=2.6.2.2' - dependency.conda.tensorflow: '=2.15' - dependency.conda.torchaudio_cpu: '=2.2.0=*cpu*' - dependency.conda.torchaudio_gpu: '=2.1.0=*xpu*' - dependency.conda.torchvision_cpu: '=0.17=*cpu*' - dependency.conda.torchvision_gpu: '=0.16.0=*xpu*' - dependency.conda.tornado: '>=6.3.3' - dependency.conda.tqdm: '>=4.66.2' - dependency.conda.werkzeug: '>=2.2.3' - target: distributed-deep-learning + dependency.conda.py-cpuinfo: '==9.0.0' + dependency.conda.python: '==3.10.14' + dependency.conda.pytorch_cpu: '=2.3.1' + dependency.conda.pytorch_xpu: '=2.1.0' + dependency.conda.tensorboardx: '==2.6.2.2' + dependency.conda.tensorflow: '=2.15.1' + 
dependency.conda.tensorflow-hub: '==0.16.1' + dependency.conda.torchaudio_cpu: '=2.3.1' + dependency.conda.torchaudio_xpu: '=2.1.0' + dependency.conda.torchvision_cpu: '=0.18.1' + dependency.conda.torchvision_xpu: '=0.16.0' + dependency.conda.tqdm: '==4.66.4' depends_on: - dl-base extends: dl-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-2024.1.0-py${PYTHON_VERSION:-3.10} - command: | - bash -c "conda run -n pytorch-cpu python -c 'import torch;print(torch.__version__);import intel_extension_for_pytorch as ipex;print(ipex.__version__);' && \ - conda run -n pytorch-cpu bash -c 'mpirun --version' && \ - conda run -n pytorch-cpu python -c 'import oneccl_bindings_for_pytorch as oneccl;print(\"\\nOneCCL:\", oneccl.__version__)' && \ - conda run -n pytorch-gpu python -c 'import torch;print(torch.device(\"xpu\"));import intel_extension_for_pytorch as ipex;print(ipex.xpu.is_available());print(ipex.xpu.has_onemkl())' && \ - conda run -n pytorch-gpu bash -c 'mpirun --version' && \ - conda run -n pytorch-gpu python -c 'import oneccl_bindings_for_pytorch as oneccl;print(\"\\nOneCCL:\", oneccl.__version__)' && \ - conda run -n tensorflow-cpu python -c 'import tensorflow;print(tensorflow.__version__);import intel_extension_for_tensorflow as itex;print(itex.__version__)' && \ - conda run -n tensorflow-gpu python -c 'from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())' && \ - conda run -n tensorflow-gpu bash -c 'horovodrun --check-build && mpirun --version' && \ - conda run -n tensorflow-gpu python -c 'import horovod.tensorflow as hvd;hvd.init();import horovod.tensorflow'" + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + command: > + bash -c " conda run -n pytorch-cpu python -c 'import torch;print(torch.__version__);import + intel_extension_for_pytorch as ipex;print(ipex.__version__);' && + + conda run -n pytorch-cpu bash -c 'mpirun --version' && + + 
conda run -n pytorch-cpu python -c 'import oneccl_bindings_for_pytorch as oneccl;print(\"\\nOneCCL:\", + oneccl.__version__)' && + + conda run -n pytorch-gpu python -c 'import torch;print(torch.device(\"xpu\"));import + intel_extension_for_pytorch as ipex;print(ipex.xpu.is_available());print(ipex.xpu.has_onemkl())' + && + + conda run -n pytorch-gpu bash -c 'mpirun --version' && + + conda run -n pytorch-gpu python -c 'import oneccl_bindings_for_pytorch as oneccl;print(\"\\nOneCCL:\", + oneccl.__version__)' && + + conda run -n tensorflow-cpu python -c 'import tensorflow;print(tensorflow.__version__);import + intel_extension_for_tensorflow as itex;print(itex.__version__)' && + + conda run -n tensorflow-gpu python -c 'from tensorflow.python.client import + device_lib;print(device_lib.list_local_devices())' && + + conda run -n tensorflow-gpu bash -c 'horovodrun --check-build && mpirun --version' + && + + conda run -n tensorflow-gpu python -c 'import horovod.tensorflow as hvd;hvd.init();import + horovod.tensorflow' " diff --git a/preset/deep-learning/requirements.txt b/preset/deep-learning/requirements.txt index 4122126b..db93ef0d 100644 --- a/preset/deep-learning/requirements.txt +++ b/preset/deep-learning/requirements.txt @@ -1,14 +1,3 @@ -accelerate>=0.30.0 -cloud-data-connector>=1.0.3 -cryptography>=42.0.7 -dataset-librarian>=1.0.4 -datasets>=2.19.1 -evaluate>=0.4.2 +dataset-librarian==1.0.4 +evaluate==0.4.2 git+https://github.com/huggingface/optimum-intel.git -ninja>=1.11.1.1 -py-cpuinfo>=9.0.0 -python-dotenv>=1.0.1 -requests>=2.31.0 -tensorflow-hub>=0.16.1 -tqdm>=4.66.2 -transformers>=4.40.2 diff --git a/preset/deep-learning/tests.yaml b/preset/deep-learning/tests.yaml old mode 100644 new mode 100755 index 0b0cdcae..399d4291 --- a/preset/deep-learning/tests.yaml +++ b/preset/deep-learning/tests.yaml @@ -12,39 +12,50 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+--- deep-learning-ipex-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n pytorch-cpu python -W ignore sample-tests/intel_extension_for_pytorch/test_ipex.py --device cpu --ipex - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} deep-learning-ipex-${PYTHON_VERSION:-3.9}-gpu: cmd: conda run -n pytorch-gpu python -W ignore sample-tests/intel_extension_for_pytorch/test_ipex.py --device xpu --ipex - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + device: ["/dev/dri"] + deep-learning-ipex-notebook-${PYTHON_VERSION:-3.9}-cpu: cmd: papermill --log-output jupyter/ipex/ResNet50_Inference.ipynb -k pytorch-cpu - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True deep-learning-ipex-notebook-${PYTHON_VERSION:-3.9}-gpu: cmd: papermill --log-output jupyter/ipex/ResNet50_Inference.ipynb -k pytorch-gpu - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True + device: ["/dev/dri"] + deep-learning-ipex-quantization-notebook-${PYTHON_VERSION:-3.9}-cpu: cmd: papermill --log-output jupyter/ipex-quantization/IntelPytorch_Quantization.ipynb -k pytorch-cpu - img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True + deep-learning-itex-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n tensorflow-cpu python -W ignore sample-tests/intel_extension_for_tensorflow/test_itex.py - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} deep-learning-itex-${PYTHON_VERSION:-3.9}-gpu: cmd: conda run -n tensorflow-gpu python -W ignore sample-tests/intel_extension_for_tensorflow/test_itex.py - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + device: ["/dev/dri"] + deep-learning-tensorflow-dataset-librarian-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n tensorflow-cpu bash -c 'yes | python -m dataset_librarian.dataset -n msmarco --download -d ~/msmarco' - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} deep-learning-tensorflow-dataset-librarian-${PYTHON_VERSION:-3.9}-gpu: cmd: conda run -n tensorflow-gpu bash -c 'yes | python -m dataset_librarian.dataset -n msmarco --download -d ~/msmarco' - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: 
amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + device: ["/dev/dri"] + deep-learning-torch-dataset-librarian-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n pytorch-cpu bash -c 'yes | python -m dataset_librarian.dataset -n msmarco --download -d ~/msmarco' - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} deep-learning-torch-dataset-librarian-${PYTHON_VERSION:-3.9}-gpu: cmd: conda run -n pytorch-gpu bash -c 'yes | python -m dataset_librarian.dataset -n msmarco --download -d ~/msmarco' - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-deep-learning-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + device: ["/dev/dri"] diff --git a/preset/inference-optimization/Dockerfile b/preset/inference-optimization/Dockerfile index 6689b437..a38e8266 100644 --- a/preset/inference-optimization/Dockerfile +++ b/preset/inference-optimization/Dockerfile @@ -12,35 +12,34 @@ # See the License for the specific language governing permissions and # limitations under the License. + ARG COMPOSE_PROJECT_NAME FROM ${COMPOSE_PROJECT_NAME}-dl-base as inference-optimization -ENV SIGOPT_PROJECT=. +SHELL ["/bin/bash", "-c"] +ENV SIGOPT_PROJECT=. 
ARG NEURAL_COMPRESSOR_VERSION ARG INTEL_CHANNEL - -RUN conda install -yn pytorch-cpu -c "${INTEL_CHANNEL}" -c conda-forge \ - neural-compressor="${NEURAL_COMPRESSOR_VERSION}" - -RUN conda install -yn pytorch-gpu -c "${INTEL_CHANNEL}" -c conda-forge \ - neural-compressor="${NEURAL_COMPRESSOR_VERSION}" - -RUN conda install -yn tensorflow-cpu -c "${INTEL_CHANNEL}" -c conda-forge \ - neural-compressor="${NEURAL_COMPRESSOR_VERSION}" - -RUN conda install -yn tensorflow-gpu -c "${INTEL_CHANNEL}" -c conda-forge \ - neural-compressor="${NEURAL_COMPRESSOR_VERSION}" - -RUN conda run -n tensorflow-cpu python -m pip install --no-deps --no-cache-dir \ - 'tf2onnx>=1.16.1' \ - 'onnxruntime>=1.17.3' && \ +RUN ENVS_LIST=('pytorch-cpu' 'pytorch-gpu' 'tensorflow-cpu' 'tensorflow-gpu') && \ + for i in "${!ENVS_LIST[@]}"; do \ + CONDA_ENV="${ENVS_LIST[i]}" && \ + conda install -yn "$CONDA_ENV" \ + "neural-compressor=${NEURAL_COMPRESSOR_VERSION}" \ + 'scikit-learn>=1.5.0' \ + ; \ + done && \ conda clean -y --all -RUN conda run -n tensorflow-gpu python -m pip install --no-deps --no-cache-dir \ - 'tf2onnx>=1.16.1' \ - 'onnxruntime>=1.17.3' && \ +RUN ENVS_LIST=('tensorflow-cpu' 'tensorflow-gpu') && \ + for i in "${!ENVS_LIST[@]}"; do \ + CONDA_ENV="${ENVS_LIST[i]}" && \ + conda run -n "$CONDA_ENV" python -m pip install --no-deps --no-cache-dir \ + 'tf2onnx==1.16.1' \ + 'onnxruntime==1.18.1' \ + ; \ + done && \ conda clean -y --all COPY --chown=dev notebooks /home/dev/jupyter diff --git a/preset/inference-optimization/docker-compose.yaml b/preset/inference-optimization/docker-compose.yaml index ac8ebc07..cf543bff 100644 --- a/preset/inference-optimization/docker-compose.yaml +++ b/preset/inference-optimization/docker-compose.yaml @@ -15,6 +15,7 @@ # -*- coding: utf-8 -*- # + version: '3' services: dl-base: @@ -22,42 +23,42 @@ services: args: BASE_IMAGE: ${BASE_IMAGE:-ubuntu} BASE_TAG: ${BASE_TAG:-22.04} - DEEPSPEED_VERSION: ${DEEPSPEED_VERSION:-0.14.0} DEVICE: ${DEVICE:-flex} - DPNP_VERSION: 
${NUMBA_DPEX_VERSION:-0.14.0} - HOROVOD_VERSION: ${HOROVOD_VERSION:-0.28.1.4} - ICD_VER: 23.43.27642.40-803~22.04 - IDP_VERSION: ${IDP_VERSION:-2024.1.0} - IMPI_VERSION: ${IMPI_VERSION:-2021.12} + DPNP_VERSION: ${DPNP_VERSION:-0.15.0} + HOROVOD_VERSION: ${HOROVOD_VERSION:-0.28.1.5} + ICD_VER: 23.43.27642.52-803~22.04 + IDP_VERSION: ${IDP_VERSION:-2024.2} + IMPI_VERSION: ${IMPI_VERSION:-2021.13} INTEL_CHANNEL: ${INTEL_CHANNEL:-https://software.repos.intel.com/python/conda/} - IPEX_CPU_VERSION: ${IPEX_CPU_VERSION:-2.2.0=*cpu*} - IPEX_GPU_VERSION: ${IPEX_GPU_VERSION:-2.1.20=*xpu*} - ITEX_VERSION: ${ITEX_VERSION:-2.15} + IPEX_CPU_VERSION: ${IPEX_CPU_VERSION:-2.3.100} + IPEX_XPU_VERSION: ${IPEX_XPU_VERSION:-2.1.40} + ITEX_CPU_VERSION: ${ITEX_CPU_VERSION:-2.15.0} + ITEX_XPU_VERSION: ${ITEX_XPU_VERSION:-2.15.0.1} LEVEL_ZERO_DEV_VER: 1.14.0-744~22.04 - LEVEL_ZERO_GPU_VER: 1.3.27642.40-803~22.04 + LEVEL_ZERO_GPU_VER: 1.3.27642.52-803~22.04 LEVEL_ZERO_VER: 1.14.0-744~22.04 - MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Linux-x86_64} - MPI_VERSION: ${MPI_VERSION:-2021.12.0} - NEURAL_COMPRESSOR_VERSION: ${NEURAL_COMPRESSOR_VERSION:-2.4.1} - NUMBA_DPEX_VERSION: ${NUMBA_DPEX_VERSION:-0.22.1} + MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Miniforge3-Linux-x86_64} + MPI_VERSION: ${MPI_VERSION:-2021.13} + NEURAL_COMPRESSOR_VERSION: ${NEURAL_COMPRESSOR_VERSION:-2.5.1} + NUMBA_DPEX_VERSION: ${NUMBA_DPEX_VERSION:-0.23.0} NUMPY_VERSION: ${NUMPY_VERSION:-1.26.4} - ONECCL_CPU_VERSION: ${ONECCL_CPU_VERSION:-2.2.0=*cpu*} - ONECCL_GPU_VERSION: ${ONECCL_GPU_VERSION:-2.1.200=*xpu*} - PYTHON_VERSION: ${PYTHON_VERSION:-3.10} - TF_VERSION: ${TF_VERSION:-2.15} - TORCHAUDIO_CPU_VERSION: ${TORCHAUDIO_CPU_VERSION:-2.2.0=*cpu*} - TORCHAUDIO_GPU_VERSION: ${TORCHAUDIO_GPU_VERSION:-2.1.0=*xpu*} - TORCHVISION_CPU_VERSION: ${TORCHVISION_CPU_VERSION:-0.17=*cpu*} - TORCHVISION_GPU_VERSION: ${TORCHVISION_GPU_VERSION:-0.16.0=*xpu*} - TORCH_CPU_VERSION: ${TORCH_CPU_VERSION:-2.2.0=*cpu*} - TORCH_GPU_VERSION: 
${TORCH_GPU_VERSION:-2.1.0=*xpu*} + ONECCL_CPU_VERSION: ${ONECCL_CPU_VERSION:-2.3.0} + ONECCL_XPU_VERSION: ${ONECCL_XPU_VERSION:-2.1.400} + PYTHON_VERSION: ${PYTHON_VERSION:-3.9} + TF_VERSION: ${TF_VERSION:-2.15.1} + TORCHAUDIO_CPU_VERSION: ${TORCHAUDIO_CPU_VERSION:-2.3.1} + TORCHAUDIO_XPU_VERSION: ${TORCHAUDIO_XPU_VERSION:-2.1.0} + TORCHVISION_CPU_VERSION: ${TORCHVISION_CPU_VERSION:-0.18.1} + TORCHVISION_XPU_VERSION: ${TORCHVISION_XPU_VERSION:-0.16.0} + TORCH_CPU_VERSION: ${TORCH_CPU_VERSION:-2.3.1} + TORCH_XPU_VERSION: ${TORCH_XPU_VERSION:-2.1.0} http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: '' context: ../deep-learning labels: docs: false - target: deep-learning-jupyter + target: deep-learning-base environment: http_proxy: ${http_proxy} https_proxy: ${https_proxy} @@ -65,9 +66,12 @@ services: shm_size: 12GB volumes: - /dev/dri/by-path:/dev/dri/by-path - command: | - bash -c "conda run -n pytorch-cpu python -c 'import torch;print(torch.__version__);import intel_extension_for_pytorch as ipex;print(ipex.__version__);' && \ - conda run -n tensorflow-cpu python -c 'import tensorflow as tf; print(tf.__version__)'" + command: > + bash -c " conda run -n pytorch-cpu python -c 'import torch;print(torch.__version__);import + intel_extension_for_pytorch as ipex;print(ipex.__version__)' && + + conda run -n tensorflow-cpu python -c 'import tensorflow as tf;print(tf.__version__)' + " inference-optimization: @@ -75,13 +79,14 @@ services: args: COMPOSE_PROJECT_NAME: ${COMPOSE_PROJECT_NAME:-preset} context: . 
+ target: inference-optimization labels: docs: inference_optimization org.opencontainers.image.title: "Intel® AI Tools Selector Preset Containers - Inference Optimization" org.opencontainers.base.name: "intel/deep-learning" org.opencontainers.image.name: "intel/inference-optimization" - org.opencontainers.image.version: 2024.1.0-py${PYTHON_VERSION:-3.10} - dependency.python: ${PYTHON_VERSION:-3.10} + org.opencontainers.image.version: 2024.2.0-py${PYTHON_VERSION:-3.9} + dependency.python: ${PYTHON_VERSION:-3.9} dependency.python.pip: requirements.txt dependency.apt.apt-utils: true dependency.apt.build-essential: true @@ -98,11 +103,11 @@ services: dependency.apt.gzip: true dependency.apt.hwinfo: true dependency.apt.intel-igc-cm: true - dependency.apt.intel-level-zero-gpu: '1.3.27642.40-803~22.04' + dependency.apt.intel-level-zero-gpu: true dependency.apt.intel-media-va-driver-non-free: true - dependency.apt.intel-opencl-icd: '23.43.27642.40-803~22.04' - dependency.apt.level-zero: '1.14.0-744~22.04' - dependency.apt.level-zero-dev: '1.14.0-744~22.04' + dependency.apt.intel-opencl-icd: true + dependency.apt.level-zero: true + dependency.apt.level-zero-dev: true dependency.apt.libegl1-mesa: true dependency.apt.libegl1-mesa-dev: true dependency.apt.libegl-mesa0: true @@ -120,7 +125,6 @@ services: dependency.apt.libmfxgen1: true dependency.apt.libopenmpi-dev: true dependency.apt.libvpl2: true - dependency.apt.libxatracker2: true dependency.apt.make: true dependency.apt.mesa-va-drivers: true dependency.apt.mesa-vdpau-drivers: true @@ -138,68 +142,72 @@ services: dependency.apt.vainfo: true dependency.apt.wget: true dependency.apt.xz-utils: true - dependency.conda.jupyterlab: '>=4.1.8' - dependency.conda.aiohttp: '>=3.9.0' - dependency.conda.cryptography: '>=42.0.4' - dependency.conda.dataset_librarian: '>=1.0.4' - dependency.conda.deepspeed: '>=0.14.0' - dependency.conda.dpcpp_impl_linux-64: '>=2024.1.' - dependency.conda.dpcpp-cpp-rt: '>=2024.1.' 
- dependency.conda.dpnp: '>=0.14.0' - dependency.conda.idna: '>=3.7' - dependency.conda.impi-devel: '>=2021.12' - dependency.conda.intel-extension-for-pytorch_cpu: '>=2.2.0=*cpu*' - dependency.conda.intel-extension-for-pytorch_gpu: '>=2.1.20=*xpu*' - dependency.conda.intel-extension-for-tensorflow_cpu: '>=2.15=*cpu*' - dependency.conda.intel-extension-for-tensorflow_gpu: '>=2.15=*xpu*' - dependency.conda.intel-openmp: '>=2024.1.0' - dependency.conda.intel-optimization-for-horovod: '>=0.28.1.4' - dependency.conda.ipykernel: '>=6.29.3' - dependency.conda.ipython: '>=8.18.1' - dependency.conda.jinja2: '>=3.1.3' - dependency.conda.jupyterhub: '>=4.1.5' - dependency.conda.jupyter-server-proxy: '>=4.1.2' - dependency.conda.kernda: '>=0.3.0' - dependency.conda.mako: '>=1.2.2' - dependency.conda.matplotlib-base: '>=3.4.3' - dependency.conda.mkl-dpcpp: '>=2024.1.0' - dependency.conda.neural-compressor: '>=2.4.1' - dependency.conda.nodejs: '>=20.12.2' - dependency.conda.notebook: '>=7.1.3' - dependency.conda.numpy: '>=1.26.4' - dependency.conda.oauthlib: '>=3.2.2' - dependency.conda.oneccl_bind_pt_cpu: '>=2.2.0=*cpu*' - dependency.conda.oneccl_bind_pt_gpu: '>=2.1.200=*xpu*' + dependency.conda.accelerate: '==0.32.1' + dependency.conda.colorama: '==0.4.6' + dependency.conda.conda: '==24.5.0' + dependency.conda.dpnp: '=0.15.0' + dependency.conda.intel-extension-for-pytorch_cpu: '=2.3.100' + dependency.conda.intel-extension-for-pytorch_xpu: '=2.1.40' + dependency.conda.intel-extension-for-tensorflow_cpu: '=2.15.0=*cpu*' + dependency.conda.intel-extension-for-tensorflow_xpu: '=2.15.0.1=*xpu*' + dependency.conda.intel-optimization-for-horovod: '=0.28.1.5' + dependency.conda.ipykernel: '==6.29.5' + dependency.conda.jupyterhub: '==5.1.0' + dependency.conda.jupyter-server-proxy: '==4.3.0' + dependency.conda.kernda: '==0.3.0' + dependency.conda.ld_impl_linux-64: '==2.40' + dependency.conda.mamba: '==1.5.8' + dependency.conda.matplotlib-base: '>=3.8.4' + dependency.conda.mpi: '==1.0' + 
dependency.conda.mpich: '==4.2.2' + dependency.conda.networkx: '==3.3' + dependency.conda.neural-compressor: '=2.5.1' + dependency.conda.notebook: '==7.2.1' + dependency.conda.oneccl_bind_pt_cpu: '=2.3.0' + dependency.conda.oneccl_bind_pt_xpu: '=2.1.400' dependency.conda.onnx: '>=1.14.1' - dependency.conda.packaging: '>=23.2' - dependency.conda.pandas: '>=2.2.2' - dependency.conda.pillow: '>=10.2.0' - dependency.conda.protobuf: '>=4.24' - dependency.conda.pyjwt: '>=2.4.0' - dependency.conda.python: "=${PYTHON_VERSION:-3.10}" - dependency.conda.pytorch_cpu: '>=2.2.0=*cpu*' - dependency.conda.pytorch_gpu: '>=2.1.0=*xpu*' - dependency.conda.setuptools: '>=69.1.0' - dependency.conda.tensorboardx: '>=2.6.2.2' - dependency.conda.tensorflow: '>=2.15' - dependency.conda.torchaudio_cpu: '>=2.2.0=*cpu*' - dependency.conda.torchaudio_gpu: '>=2.1.0=*xpu*' - dependency.conda.torchvision_cpu: '>=0.17=*cpu*' - dependency.conda.torchvision_gpu: '>=0.16.0=*xpu*' - dependency.conda.tornado: '>=6.3.3' - dependency.conda.tqdm: '>=4.66.2' - dependency.conda.werkzeug: '>=2.2.3' - target: inference-optimization + dependency.conda.onnxruntime: '==1.18.1' + dependency.conda.py-cpuinfo: '==9.0.0' + dependency.conda.python: '==3.10.14' + dependency.conda.pytorch_cpu: '=2.3.1' + dependency.conda.pytorch_xpu: '=2.1.0' + dependency.conda.scikit-learn: '>=1.5.0' + dependency.conda.tensorboardx: '==2.6.2.2' + dependency.conda.tensorflow: '=2.15.1' + dependency.conda.tensorflow-hub: '==0.16.1' + dependency.conda.tf2onnx: '==1.16.1' + dependency.conda.torchaudio_cpu: '=2.3.1' + dependency.conda.torchaudio_xpu: '=2.1.0' + dependency.conda.torchvision_cpu: '=0.18.1' + dependency.conda.torchvision_xpu: '=0.16.0' + dependency.conda.tqdm: '==4.66.4' depends_on: - dl-base extends: dl-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-2024.1.0-py${PYTHON_VERSION:-3.10} - command: | - bash -c "conda run -n pytorch-cpu python -c 'import intel_extension_for_pytorch as 
ipex;print(ipex.__version__);' && \ - conda run -n pytorch-cpu python -c 'import neural_compressor;print(\"Neural Compressor Version:\", neural_compressor.__version__)' && \ - conda run -n pytorch-gpu python -c 'import torch;print(torch.device(\"xpu\"));import intel_extension_for_pytorch as ipex;print(ipex.xpu.is_available());' && \ - conda run -n pytorch-gpu python -c 'import neural_compressor;print(\"Neural Compressor Version:\", neural_compressor.__version__)' && \ - conda run -n tensorflow-cpu python -c 'import intel_extension_for_tensorflow as itex;print(itex.__version__);' && \ - conda run -n tensorflow-cpu python -c 'import neural_compressor, tf2onnx; print(\"\\nNeural Compressor Version:\", neural_compressor.__version__, \"\\\nTensorFlow2ONNX Version:\", tf2onnx.__version__)' && \ - conda run -n tensorflow-gpu python -c 'from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())' && \ - conda run -n tensorflow-gpu python -c 'import neural_compressor, tf2onnx; print(\"\\nNeural Compressor Version:\", neural_compressor.__version__, \"\\\nTensorFlow2ONNX Version:\", tf2onnx.__version__)'" + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + command: > + bash -c "conda run -n pytorch-cpu python -c 'import intel_extension_for_pytorch + as ipex;print(ipex.__version__)' && + + conda run -n pytorch-cpu python -c 'import neural_compressor;print(\"Neural + Compressor Version:\", neural_compressor.__version__)' && + + conda run -n pytorch-gpu python -c 'import torch;print(torch.device(\"xpu\"));import + intel_extension_for_pytorch as ipex;print(ipex.xpu.is_available())' && + + conda run -n pytorch-gpu python -c 'import neural_compressor;print(\"Neural + Compressor Version:\", neural_compressor.__version__)' && + + conda run -n tensorflow-cpu python -c 'import intel_extension_for_tensorflow + as itex;print(itex.__version__)' && + + conda run -n tensorflow-cpu python -c 
'import neural_compressor, tf2onnx;print(\"\\nNeural + Compressor Version:\", neural_compressor.__version__);print(\"\\nTensorFlow2ONNX + Version:\", tf2onnx.__version__)' && + + conda run -n tensorflow-gpu python -c 'from tensorflow.python.client import + device_lib;print(device_lib.list_local_devices())' && + + conda run -n tensorflow-gpu python -c 'import neural_compressor, tf2onnx;print(\"\\nNeural + Compressor Version:\", neural_compressor.__version__);print(\"\\nTensorFlow2ONNX + Version:\", tf2onnx.__version__)' " diff --git a/preset/inference-optimization/requirements.txt b/preset/inference-optimization/requirements.txt index 15dad774..3a3f0f13 100644 --- a/preset/inference-optimization/requirements.txt +++ b/preset/inference-optimization/requirements.txt @@ -1,16 +1,5 @@ -accelerate>=0.30.0 -cloud-data-connector>=1.0.3 -cryptography>=42.0.7 -dataset-librarian>=1.0.4 -datasets>=2.19.1 -evaluate>=0.4.2 +dataset-librarian==1.0.4 +evaluate==0.4.2 git+https://github.com/huggingface/optimum-intel.git -ninja>=1.11.1.1 -onnxruntime>=1.17.3 -py-cpuinfo>=9.0.0 -python-dotenv>=1.0.1 -requests>=2.31.0 -tensorflow-hub>=0.16.1 -tf2onnx>==1.16.1 -tqdm>=4.66.2 -transformers>=4.40.2 +tf2onnx==1.16.1 +onnxruntime==1.18.1 diff --git a/preset/inference-optimization/tests.yaml b/preset/inference-optimization/tests.yaml index 98731067..a906ecde 100644 --- a/preset/inference-optimization/tests.yaml +++ b/preset/inference-optimization/tests.yaml @@ -12,78 +12,87 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+--- inference-optimization-inc-ipex-quantization-notebook-${PYTHON_VERSION:-3.9}-cpu: cmd: papermill --log-output jupyter/inc-ipex-quantization/quantize_with_inc.ipynb result.ipynb -k pytorch-cpu --cwd jupyter/inc-ipex-quantization - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True inference-optimization-inc-ipex-quantization-notebook-${PYTHON_VERSION:-3.9}-gpu: cmd: papermill --log-output jupyter/inc-ipex-quantization/quantize_with_inc.ipynb result.ipynb -k pytorch-gpu --cwd jupyter/inc-ipex-quantization - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True + device: ["/dev/dri"] inference-optimization-inc-itex-notebook-${PYTHON_VERSION:-3.9}-cpu: cmd: papermill --log-output jupyter/inc-itex/inc_sample_tensorflow.ipynb result.ipynb -k tensorflow-cpu --cwd jupyter/inc-itex - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True # Status: Commented due to out of resources error # inference-optimization-inc-itex-notebook-${PYTHON_VERSION:-3.9}-gpu: # cmd: papermill --log-output jupyter/inc-itex/inc_sample_tensorflow.ipynb result.ipynb -k tensorflow-gpu --cwd jupyter/inc-itex -# img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} +# img: 
amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} # notebook: True inference-optimization-inc-tensorflow-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n tensorflow-cpu sample-tests/neural_compressor/tensorflow/run.sh cpu - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} inference-optimization-inc-tensorflow-${PYTHON_VERSION:-3.9}-gpu: cmd: conda run -n tensorflow-gpu sample-tests/neural_compressor/tensorflow/run.sh gpu - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + device: ["/dev/dri"] inference-optimization-inc-torch-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n pytorch-cpu sample-tests/neural_compressor/torch/run.sh cpu - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} inference-optimization-ipex-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n pytorch-cpu python -W ignore sample-tests/intel_extension_for_pytorch/test_ipex.py --device cpu --ipex - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} inference-optimization-ipex-${PYTHON_VERSION:-3.9}-gpu: cmd: conda run -n 
pytorch-gpu python -W ignore sample-tests/intel_extension_for_pytorch/test_ipex.py --device xpu --ipex - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + device: ["/dev/dri"] inference-optimization-itex-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n tensorflow-cpu python -W ignore sample-tests/intel_extension_for_tensorflow/test_itex.py - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} inference-optimization-itex-${PYTHON_VERSION:-3.9}-gpu: cmd: conda run -n tensorflow-gpu python -W ignore sample-tests/intel_extension_for_tensorflow/test_itex.py - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + device: ["/dev/dri"] inference-optimization-itex-inference-notebook-${PYTHON_VERSION:-3.9}-cpu: cmd: papermill --log-output jupyter/itex-inference/tutorial_optimize_TensorFlow_pretrained_model.ipynb result.ipynb -k tensorflow-cpu --cwd jupyter/itex-inference - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} notebook: True # Need update from TensorFlow v1 to V2 # inference-optimization-itex-inference-notebook-${PYTHON_VERSION:-3.9}-gpu: # cmd: papermill --log-output 
jupyter/itex-inference/tutorial_optimize_TensorFlow_pretrained_model.ipynb result.ipynb -k tensorflow-gpu --cwd jupyter/itex-inference -# img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} +# img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} # notebook: True inference-optimization-onnx-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n tensorflow-cpu sample-tests/onnx/run.sh - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} inference-optimization-onnx-${PYTHON_VERSION:-3.9}-gpu: cmd: conda run -n tensorflow-gpu sample-tests/onnx/run.sh - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + device: ["/dev/dri"] inference-optimization-tensorflow-dataset-librarian-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n tensorflow-cpu bash -c 'yes | python -m dataset_librarian.dataset -n msmarco --download -d ~/msmarco' - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} inference-optimization-tensorflow-dataset-librarian-${PYTHON_VERSION:-3.9}-gpu: cmd: conda run -n tensorflow-gpu bash -c 'yes | python -m dataset_librarian.dataset -n msmarco --download -d ~/msmarco' + img: 
amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + device: ["/dev/dri"] inference-optimization-torch-dataset-librarian-${PYTHON_VERSION:-3.9}-cpu: cmd: conda run -n pytorch-cpu bash -c 'yes | python -m dataset_librarian.dataset -n msmarco --download -d ~/msmarco' - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} inference-optimization-torch-dataset-librarian-${PYTHON_VERSION:-3.9}-gpu: cmd: conda run -n pytorch-gpu bash -c 'yes | python -m dataset_librarian.dataset -n msmarco --download -d ~/msmarco' - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.1.0}-py${PYTHON_VERSION:-3.9} + img: amr-registry.caas.intel.com/aiops/aikit-products-dev:b-${GITHUB_RUN_NUMBER:-0}-inference-optimization-${RELEASE:-2024.2.0}-py${PYTHON_VERSION:-3.9} + device: ["/dev/dri"] diff --git a/python/.actions.json b/python/.actions.json index db103a36..1112d076 100644 --- a/python/.actions.json +++ b/python/.actions.json @@ -1,5 +1,5 @@ { "IDP_VERSION": ["full", "core"], "experimental": [true], - "runner_label": ["PVC"] + "runner_label": ["clx"] } diff --git a/python/README.md b/python/README.md index 0b9f95dc..16b1d591 100644 --- a/python/README.md +++ b/python/README.md @@ -1,21 +1,46 @@ -# Intel® Distribution for Python +# Intel® Distribution for Python* -[Intel® Distribution for Python] enhances performance and can improve your program speed from 10 to 100 times faster. It is a Python distribution that includes the [Intel® Math Kernel Library] (oneMKL) and other Intel performance libraries to enable near-native performance through acceleration of core numerical and machine learning packages. 
- -[Intel® Distribution for Python] is available as part of the [Intel® oneAPI Base Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html). +[Intel® Distribution for Python*] enhances performance and can improve your program speed from 10 to 100 times faster. It is a Python* distribution that includes the [Intel® Math Kernel Library] (oneMKL) and other Intel performance libraries to enable near-native performance through acceleration of core numerical and machine learning packages. ## Images -The images below include variations for only the core packages in the [Intel® Distribution for Python] installation, or all of the packages. +The images below include variations for only the core packages in the [Intel® Distribution for Python*] installation, or all of the packages. | Tag(s) | IDP | | ---------------------- | ---------- | | `3.10-full`, `latest` | `2024.2.0` | | `3.10-core` | `2024.2.0` | -## Build from Source +## Run a Performance Sample + +To run a performance sample run the following commands: + +```bash +git clone https://github.com/intel/ai-containers +cd ai-containers/python +docker run --rm -it \ + -v $PWD/tests:/tests \ + intel/python:latest \ + python /tests/perf_sample.py +``` + +### Compare the results against stock python + +In the previous command, you should see a result at the bottom like: `Time Consuming: 0.03897857666015625`. 
We can compare this against `python:3.11-slim-bullseye` + +```bash +# Use the working directory from the above command +docker run --rm -it \ + -v $PWD/tests:/tests \ + python:3.10-slim-bullseye \ + bash +pip install numpy +python /tests/perf_sample.py +``` + +## Build from Source (Advanced) -To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command: +To build the images from source, clone the [AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command: ```bash cd python @@ -27,8 +52,8 @@ You can find the list of services below for each container in the group: | Service Name | Description | | ------------ | ------------------------------------------------------------------- | -| `idp` | Base image with [Intel® Distribution for Python] | -| `pip` | Equivalent python image without [Intel® Distribution for Python] | +| `idp` | Base image with [Intel® Distribution for Python*] | +| `pip` | Equivalent python image without [Intel® Distribution for Python*] | ## License @@ -40,5 +65,5 @@ It is the image user's responsibility to ensure that any use of The images below -[Intel® Distribution for Python]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html#gs.9bos9m +[Intel® Distribution for Python*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html#gs.9bos9m [Intel® Math Kernel Library]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html diff --git a/python/docker-compose.yaml b/python/docker-compose.yaml index a8039de4..2d674ae3 100644 --- a/python/docker-compose.yaml +++ b/python/docker-compose.yaml @@ -17,6 +17,7 @@ services: build: args: MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Linux-x86_64} + no_proxy: "" context: . 
labels: dependency.apt.wget: true diff --git a/python/requirements.txt b/python/requirements.txt index 0418b164..fa2002ba 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,6 +1,6 @@ numpy==1.26.4 setuptools>=70.0.0 psutil==6.0.0 -mkl==2024.2.0 -mkl-include==2024.2.0 -intel-openmp==2024.2.0 +mkl==2024.2.1 +mkl-include==2024.2.1 +intel-openmp==2024.2.1 diff --git a/pytorch/Dockerfile b/pytorch/Dockerfile index 809da9c8..0ad9c07f 100644 --- a/pytorch/Dockerfile +++ b/pytorch/Dockerfile @@ -34,13 +34,9 @@ ARG BASE_IMAGE_TAG ARG PACKAGE_OPTION=pip ARG PYTHON_VERSION ARG PYTHON_BASE=${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER}-${BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${PACKAGE_OPTION}-py${PYTHON_VERSION}-base +ARG TORCHSERVE_BASE=${PYTHON_BASE} FROM ${PYTHON_BASE} AS ipex-base-pip -ARG IPEX_VERSION -ARG PYTORCH_VERSION -ARG TORCHAUDIO_VERSION -ARG TORCHVISION_VERSION - WORKDIR / COPY requirements.txt . @@ -49,11 +45,6 @@ RUN python -m pip install --no-cache-dir -r requirements.txt && \ FROM ${PYTHON_BASE} AS ipex-base-idp -ARG IPEX_VERSION -ARG PYTORCH_VERSION -ARG TORCHAUDIO_VERSION -ARG TORCHVISION_VERSION - WORKDIR / COPY requirements.txt . 
@@ -158,8 +149,8 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ - gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg -RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" | \ + gpg --dearmor --yes --output /usr/share/keyrings/intel-graphics.gpg +RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy unified" | \ tee /etc/apt/sources.list.d/intel-gpu-jammy.list ARG ICD_VER @@ -171,34 +162,32 @@ RUN apt-get update && \ apt-get install -y --no-install-recommends --fix-missing \ intel-opencl-icd=${ICD_VER} \ intel-level-zero-gpu=${LEVEL_ZERO_GPU_VER} \ - level-zero=${LEVEL_ZERO_VER} \ - level-zero-dev=${LEVEL_ZERO_DEV_VER} && \ + libze1=${LEVEL_ZERO_VER} \ + libze-dev=${LEVEL_ZERO_DEV_VER} && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN no_proxy=$no_proxy wget -q -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ - | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ - echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \ - | tee /etc/apt/sources.list.d/oneAPI.list +RUN rm -rf /etc/apt/sources.list.d/intel-gpu-jammy.list -ARG DPCPP_VER -ARG MKL_VER -ARG CCL_VER +ENV OCL_ICD_VENDORS=/etc/OpenCL/vendors -RUN apt-get update && \ - apt-get install -y --no-install-recommends --fix-missing \ - intel-oneapi-runtime-dpcpp-cpp=${DPCPP_VER} \ - intel-oneapi-runtime-mkl=${MKL_VER} \ - intel-oneapi-runtime-ccl=${CCL_VER}; +FROM ipex-xpu-base AS ipex-xpu-base-wheels-pip WORKDIR / COPY xpu-requirements.txt . 
-RUN python -m pip install --no-cache-dir -r xpu-requirements.txt +RUN python -m pip install --no-cache-dir -r xpu-requirements.txt && \ + rm -rf xpu-requirements.txt + +FROM ipex-xpu-base AS ipex-xpu-base-wheels-idp + +WORKDIR / +COPY xpu-requirements.txt . -ENV LD_LIBRARY_PATH=/opt/intel/oneapi/redist/lib:$LD_LIBRARY_PATH +RUN conda run -n idp python -m pip install --no-cache-dir -r xpu-requirements.txt && \ + rm -rf xpu-requirements.txt -FROM ipex-xpu-base AS ipex-xpu-jupyter +FROM ipex-xpu-base-wheels-${PACKAGE_OPTION} AS ipex-xpu-jupyter WORKDIR /jupyter COPY jupyter-requirements.txt . @@ -214,7 +203,8 @@ EXPOSE 8888 CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/jupyter --port 8888 --ip 0.0.0.0 --no-browser --allow-root --ServerApp.token= --ServerApp.password= --ServerApp.allow_origin=* --ServerApp.base_url=$NB_PREFIX"] -FROM ${PYTHON_BASE} as torchserve-base + +FROM ${TORCHSERVE_BASE} as torchserve-base ENV PYTHONUNBUFFERED=TRUE @@ -230,8 +220,6 @@ RUN useradd -m -s /bin/bash model-server && \ mkdir -p /home/model-server/model-store && \ chown -R model-server /home/model-server/ -FROM torchserve-base AS compile - RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ g++ \ git \ @@ -241,16 +229,17 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin python3-venv && \ rm -rf /var/lib/apt/lists/* -RUN python3 -m venv /home/venv +WORKDIR / +COPY venv-requirements.txt . + +RUN python3 -m venv /home/venv && \ + /home/venv/bin/python -m pip install --no-cache-dir --upgrade pip && \ + /home/venv/bin/python -m pip install --no-cache-dir -r venv-requirements.txt && \ + rm -rf venv-requirements.txt ENV PATH="/home/venv/bin:$PATH" WORKDIR /home/model-server -COPY torchserve-requirements.txt . -COPY requirements.txt . 
- -RUN python -m pip install --no-cache-dir -r requirements.txt && \ - python -m pip install --no-cache-dir -r torchserve-requirements.txt RUN echo -e "#!/bin/bash \n\ set -e \n\ @@ -262,13 +251,29 @@ else \n\ fi \n\ tail -f /dev/null" >> /usr/local/bin/dockerd-entrypoint.sh -FROM torchserve-base AS torchserve +FROM torchserve-base AS compile-cpu + +COPY serving/torchserve-requirements.txt . +COPY requirements.txt . + +RUN python -m pip install --no-cache-dir -r requirements.txt && \ + python -m pip install --no-cache-dir -r torchserve-requirements.txt && \ + rm -rf requirements.txt torchserve-requirements.txt + +FROM torchserve-base AS compile-xpu + +COPY serving/torchserve-xpu-requirements.txt . + +RUN python -m pip install --no-cache-dir -r torchserve-xpu-requirements.txt && \ + rm -rf torchserve-xpu-requirements.txt + +FROM torchserve-base AS torchserve-cpu USER model-server WORKDIR /home/model-server -COPY --chown=model-server --from=compile /home/venv /home/venv -COPY --chown=model-server --chmod=755 --from=compile /usr/local/bin/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh +COPY --chown=model-server --from=compile-cpu /home/venv /home/venv +COPY --chown=model-server --chmod=755 --from=compile-cpu /usr/local/bin/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh COPY --chown=model-server serving/config.properties /home/model-server/config.properties ENV PATH="/home/venv/bin:$PATH" @@ -279,3 +284,64 @@ EXPOSE 8080 8081 8082 7070 7071 ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"] CMD ["serve"] + +FROM torchserve-base AS torchserve-xpu + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing \ + gnupg2 \ + gpg-agent \ + rsync && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ + gpg --dearmor --yes --output /usr/share/keyrings/intel-graphics.gpg +RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] 
https://repositories.intel.com/gpu/ubuntu jammy unified" | \ + tee /etc/apt/sources.list.d/intel-gpu-jammy.list + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + jq \ + curl \ + libnl-genl-3-200 \ + intel-gsc \ + libdrm2 \ + intel-metrics-discovery \ + intel-metrics-library && \ + apt-get autoremove -y && \ + rm -rf /var/lib/apt/lists/* + +ARG XPU_SMI_VERSION + +ARG API_URL=https://api.github.com/repos/intel/xpumanager/releases/tags/V${XPU_SMI_VERSION} + +RUN wget -q --header="Accept: application/vnd.github.v3+json" --header="User-Agent: MyClient/1.0.0" -O - "$API_URL" | tee /tmp/asset_data.txt && \ + wget -q --no-check-certificate "$(jq -r '.assets[] | select(.name | test("^xpu-smi.*u22\\.04_amd64\\.deb$")) | .browser_download_url' < /tmp/asset_data.txt)" && \ + ldconfig && dpkg -i --force-all -- *.deb && \ + rm -rf -- *.deb /etc/apt/sources.list.d/intel-gpu-jammy.list /etc/apt/sources.list.d/oneAPI.list /tmp/asset_data.txt + +ARG GID=109 + +RUN groupadd -g ${GID} render &&\ + usermod -aG video,render model-server + +USER model-server + +WORKDIR /home/model-server + +RUN wget --progress=dot:giga https://raw.githubusercontent.com/pytorch/serve/master/examples/intel_extension_for_pytorch/intel_gpu_metric_collector.py && \ + wget --progress=dot:giga https://raw.githubusercontent.com/pytorch/serve/master/examples/intel_extension_for_pytorch/intel_gpu.py + +COPY --chown=model-server --from=compile-xpu /home/venv /home/venv +COPY --chown=model-server --chmod=755 --from=compile-xpu /usr/local/bin/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh +COPY --chown=model-server serving/config-xpu.properties /home/model-server/config.properties + +ENV PATH="/home/venv/bin:$PATH" +ENV TEMP=/home/model-server/tmp + +# 8080/8081/8082 REST and 7070/7071 gRPC +EXPOSE 8080 8081 8082 7070 7071 + +ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"] +CMD ["serve"] diff --git a/pytorch/README.md b/pytorch/README.md index c2302f78..6b7a4012 100644 --- 
a/pytorch/README.md +++ b/pytorch/README.md @@ -24,6 +24,8 @@ The images below include support for both CPU and GPU optimizations: | Tag(s) | Pytorch | IPEX | Driver | Dockerfile | | ---------------------- | -------- | -------------- | ------ | --------------- | +| `2.3.110-xpu-pip-base`,`2.3.110-xpu` | [v2.3.1][torch-v2.3.1] | [v2.3.110+xpu] | [950] | [v0.4.0-Beta] | +| `2.1.40-xpu-pip-base`,`2.1.40-xpu` | [v2.1.0] | [v2.1.40+xpu] | [914] | [v0.4.0-Beta] | | `2.1.30-xpu` | [v2.1.0] | [v2.1.30+xpu] | [803] | [v0.4.0-Beta] | | `2.1.20-xpu` | [v2.1.0] | [v2.1.20+xpu] | [803] | [v0.3.4] | | `2.1.10-xpu` | [v2.1.0] | [v2.1.10+xpu] | [736] | [v0.2.3] | @@ -36,7 +38,7 @@ docker run -it --rm \ --device /dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ --ipc=host \ - intel/intel-extension-for-pytorch:2.1.30-xpu + intel/intel-extension-for-pytorch:2.3.110-xpu ``` --- @@ -45,8 +47,10 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | Pytorch | IPEX | Driver | Jupyter Port | Dockerfile | | --------------------- | -------- | ------------- | ------ | ------------ | --------------- | -| `2.1.20-xpu-pip-jupyter` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] | -| `2.1.10-xpu-pip-jupyter` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.2.3] | +| `2.3.110-xpu-pip-jupyter` | [v2.3.1][torch-v2.3.1] | [v2.3.110+xpu] | [950] | `8888` | [v0.4.0-Beta] | +| `2.1.40-xpu-pip-jupyter` | [v2.1.0] | [v2.1.40+xpu] | [914] | `8888` | [v0.4.0-Beta] | +| `2.1.20-xpu-pip-jupyter` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] | +| `2.1.10-xpu-pip-jupyter` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.2.3] | ### Run the XPU Jupyter Container @@ -55,7 +59,7 @@ docker run -it --rm \ -p 8888:8888 \ --device /dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ - intel/intel-extension-for-pytorch:2.1.20-xpu-pip-jupyter + intel/intel-extension-for-pytorch:2.3.110-xpu-pip-jupyter ``` After running the command above, copy the URL (something like 
`http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. @@ -66,7 +70,8 @@ The images below are built only with CPU optimizations (GPU acceleration support | Tag(s) | Pytorch | IPEX | Dockerfile | | -------------------------- | -------- | ------------ | --------------- | -| `2.3.0-pip-base`, `latest` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | +| `2.4.0-pip-base`, `latest` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] | +| `2.3.0-pip-base` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | | `2.2.0-pip-base` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] | | `2.1.0-pip-base` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] | | `2.0.0-pip-base` | [v2.0.0] | [v2.0.0+cpu] | [v0.1.0] | @@ -83,6 +88,7 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | Pytorch | IPEX | Dockerfile | | ------------------- | -------- | ------------ | --------------- | +| `2.4.0-pip-jupyter` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] | | `2.3.0-pip-jupyter` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | | `2.2.0-pip-jupyter` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] | | `2.1.0-pip-jupyter` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] | @@ -93,7 +99,7 @@ docker run -it --rm \ -p 8888:8888 \ -v $PWD/workspace:/workspace \ -w /workspace \ - intel/intel-extension-for-pytorch:2.3.0-pip-jupyter + intel/intel-extension-for-pytorch:2.4.0-pip-jupyter ``` After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. 
@@ -104,6 +110,7 @@ The images below additionally include [Intel® oneAPI Collective Communications | Tag(s) | Pytorch | IPEX | oneCCL | INC | Dockerfile | | --------------------- | -------- | ------------ | -------------------- | --------- | -------------- | +| `2.4.0-pip-multinode` | [v2.4.0] | [v2.4.0+cpu] | [v2.4.0][ccl-v2.4.0] | [v3.0] | [v0.4.0-Beta] | | `2.3.0-pip-multinode` | [v2.3.0] | [v2.3.0+cpu] | [v2.3.0][ccl-v2.3.0] | [v2.6] | [v0.4.0-Beta] | | `2.2.0-pip-multinode` | [v2.2.2] | [v2.2.0+cpu] | [v2.2.0][ccl-v2.2.0] | [v2.6] | [v0.4.0-Beta] | | `2.1.100-pip-mulitnode` | [v2.1.2] | [v2.1.100+cpu] | [v2.1.0][ccl-v2.1.0] | [v2.6] | [v0.4.0-Beta] | @@ -186,7 +193,7 @@ To add these files correctly please follow the steps described below. -v $PWD/authorized_keys:/etc/ssh/authorized_keys \ -v $PWD/tests:/workspace/tests \ -w /workspace \ - intel/intel-extension-for-pytorch:2.3.0-pip-multinode \ + intel/intel-extension-for-pytorch:2.4.0-pip-multinode \ bash -c '/usr/sbin/sshd -D' ``` @@ -199,7 +206,7 @@ To add these files correctly please follow the steps described below. 
-v $PWD/tests:/workspace/tests \ -v $PWD/hostfile:/workspace/hostfile \ -w /workspace \ - intel/intel-extension-for-pytorch:2.3.0-pip-multinode \ + intel/intel-extension-for-pytorch:2.4.0-pip-multinode \ bash -c 'ipexrun cpu --nnodes 2 --nprocs-per-node 1 --master-addr 127.0.0.1 --master-port 3022 /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl' ``` @@ -227,7 +234,7 @@ Additionally, if you have a [DeepSpeed* configuration](https://www.deepspeed.ai/ -v $PWD/hostfile:/workspace/hostfile \ -v $PWD/ds_config.json:/workspace/ds_config.json \ -w /workspace \ - intel/intel-extension-for-pytorch:2.3.0-pip-multinode \ + intel/intel-extension-for-pytorch:2.4.0-pip-multinode \ bash -c 'deepspeed --launcher IMPI \ --master_addr 127.0.0.1 --master_port 3022 \ --deepspeed_config ds_config.json --hostfile /workspace/hostfile \ @@ -236,13 +243,11 @@ Additionally, if you have a [DeepSpeed* configuration](https://www.deepspeed.ai/ --- -#### Hugging Face Generative AI Container - The image below is an extension of the IPEX Multi-Node Container designed to run Hugging Face Generative AI scripts. The container has the typical installations needed to run and fine tune PyTorch generative text models from Hugging Face. It can be used to run multinode jobs using the same instructions from the [IPEX Multi-Node container](#setup-and-run-ipex-multi-node-container). 
-| Tag(s) | Pytorch | IPEX | oneCCL | transformers | Dockerfile | -| --------------------- | -------- | ------------ | -------------------- | --------- | --------------- | -| `2.3.0-pip-multinode-hf-4.41.2-genai` | [v2.3.1](https://github.com/pytorch/pytorch/releases/tag/v2.3.1) | [v2.3.0+cpu] | [v2.3.0][ccl-v2.3.0] | [v4.41.2] | [v0.4.0-Beta] | +| Tag(s) | Pytorch | IPEX | oneCCL | HF Transformers | Dockerfile | +| ------------------------------------- | -------- | ------------ | -------------------- | --------------- | --------------- | +| `2.4.0-pip-multinode-hf-4.44.0-genai` | [v2.4.0] | [v2.4.0+cpu] | [v2.4.0][ccl-v2.4.0] | [v4.44.0] | [v0.4.0-Beta] | Below is an example that shows single node job with the existing [`finetune.py`](../workflows/charts/huggingface-llm/scripts/finetune.py) script. @@ -251,7 +256,7 @@ Below is an example that shows single node job with the existing [`finetune.py`] docker run -it \ -v $PWD/workflows/charts/huggingface-llm/scripts:/workspace/scripts \ -w /workspace/scripts \ - intel/intel-extension-for-pytorch:2.3.0-pip-multinode-hf-4.41.2-genai \ + intel/intel-extension-for-pytorch:2.4.0-pip-multinode-hf-4.44.0-genai \ bash -c 'python finetune.py ' ``` @@ -261,17 +266,25 @@ The images below are [TorchServe*] with CPU Optimizations: | Tag(s) | Pytorch | IPEX | Dockerfile | | ------------------- | -------- | ------------ | --------------- | +| `2.4.0-serving-cpu` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] | | `2.3.0-serving-cpu` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | | `2.2.0-serving-cpu` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] | For more details, follow the procedure in the [TorchServe](https://github.com/pytorch/serve/blob/master/examples/intel_extension_for_pytorch/README.md) instructions. 
+The images below are [TorchServe*] with XPU Optimizations: + +| Tag(s) | Pytorch | IPEX | Dockerfile | +| ------------------- | -------- | ------------ | --------------- | +| `2.3.110-serving-xpu` | [v2.3.1][torch-v2.3.1] | [v2.3.110+xpu] | [v0.4.0-Beta] | + ## CPU only images with Intel® Distribution for Python* The images below are built only with CPU optimizations (GPU acceleration support was deliberately excluded) and include [Intel® Distribution for Python*]: | Tag(s) | Pytorch | IPEX | Dockerfile | | ---------------- | -------- | ------------ | --------------- | +| `2.4.0-idp-base` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] | | `2.3.0-idp-base` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | | `2.2.0-idp-base` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] | | `2.1.0-idp-base` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] | @@ -281,6 +294,7 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | Pytorch | IPEX | Dockerfile | | ------------------- | -------- | ------------ | --------------- | +| `2.4.0-idp-jupyter` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] | | `2.3.0-idp-jupyter` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | | `2.2.0-idp-jupyter` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] | | `2.1.0-idp-jupyter` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] | @@ -290,6 +304,7 @@ The images below additionally include [Intel® oneAPI Collective Communications | Tag(s) | Pytorch | IPEX | oneCCL | INC | Dockerfile | | --------------------- | -------- | ------------ | -------------------- | --------- | --------------- | +| `2.4.0-idp-multinode` | [v2.4.0] | [v2.4.0+cpu] | [v2.4.0][ccl-v2.4.0] | [v3.0] | [v0.4.0-Beta] | | `2.3.0-idp-multinode` | [v2.3.0] | [v2.3.0+cpu] | [v2.3.0][ccl-v2.3.0] | [v2.6] | [v0.4.0-Beta] | | `2.2.0-idp-multinode` | [v2.2.0] | [v2.2.0+cpu] | [v2.2.0][ccl-v2.2.0] | [v2.4.1] | [v0.3.4] | | `2.1.0-idp-mulitnode` | [v2.1.0] | [v2.1.0+cpu] | [v2.1.0][ccl-v2.1.0] | [v2.3.1] | [v0.2.3] | @@ -301,6 +316,8 @@ The images below are built only with 
CPU and GPU optimizations and include [Inte | Tag(s) | Pytorch | IPEX | Driver | Dockerfile | | ---------------- | -------- | ------------ | -------- | ------ | +| `2.3.110-xpu-idp-base` | [v2.3.1][torch-v2.3.1] | [v2.3.110+xpu] | [950] | [v0.4.0-Beta] | +| `2.1.40-xpu-idp-base` | [v2.1.0] | [v2.1.40+xpu] | [914] | [v0.4.0-Beta] | | `2.1.30-xpu-idp-base` | [v2.1.0] | [v2.1.30+xpu] | [803] | [v0.4.0-Beta] | | `2.1.10-xpu-idp-base` | [v2.1.0] | [v2.1.10+xpu] | [736] | [v0.2.3] | @@ -308,12 +325,14 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | Pytorch | IPEX | Driver | Jupyter Port | Dockerfile | | --------------------- | -------- | ------------- | ------ | ------------ | --------------- | -| `2.1.20-xpu-idp-jupyter` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] | -| `2.1.10-xpu-idp-jupyter` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.2.3] | +| `2.3.110-xpu-idp-jupyter` | [v2.3.1][torch-v2.3.1] | [v2.3.110+xpu] | [950] | `8888` | [v0.4.0-Beta] | +| `2.1.40-xpu-idp-jupyter` | [v2.1.0] | [v2.1.40+xpu] | [914] | `8888` | [v0.4.0-Beta] | +| `2.1.20-xpu-idp-jupyter` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] | +| `2.1.10-xpu-idp-jupyter` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.2.3] | ## Build from Source -To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command: +To build the images from source, clone the [AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command: ```bash cd pytorch @@ -375,11 +394,14 @@ It is the image user's responsibility to ensure that any use of The images below [v0.2.3]: https://github.com/intel/ai-containers/blob/v0.2.3/pytorch/Dockerfile [v0.1.0]: https://github.com/intel/ai-containers/blob/v0.1.0/pytorch/Dockerfile 
+[v2.3.110+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.3.110%2Bxpu +[v2.1.40+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.40%2Bxpu [v2.1.30+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.30%2Bxpu [v2.1.20+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.20%2Bxpu [v2.1.10+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.10%2Bxpu [v2.0.110+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.0.110%2Bxpu +[v2.4.0]: https://github.com/pytorch/pytorch/releases/tag/v2.4.0 [v2.3.0]: https://github.com/pytorch/pytorch/releases/tag/v2.3.0 [v2.2.2]: https://github.com/pytorch/pytorch/releases/tag/v2.2.2 [v2.2.0]: https://github.com/pytorch/pytorch/releases/tag/v2.2.0 @@ -388,11 +410,15 @@ It is the image user's responsibility to ensure that any use of The images below [v2.0.1]: https://github.com/pytorch/pytorch/releases/tag/v2.0.1 [v2.0.0]: https://github.com/pytorch/pytorch/releases/tag/v2.0.0 +[torch-v2.3.1]: https://github.com/pytorch/pytorch/tree/v2.3.1 + +[v3.0]: https://github.com/intel/neural-compressor/releases/tag/v3.0 [v2.6]: https://github.com/intel/neural-compressor/releases/tag/v2.6 [v2.4.1]: https://github.com/intel/neural-compressor/releases/tag/v2.4.1 [v2.3.1]: https://github.com/intel/neural-compressor/releases/tag/v2.3.1 [v2.1.1]: https://github.com/intel/neural-compressor/releases/tag/v2.1.1 +[v2.4.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.4.0%2Bcpu [v2.3.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.3.0%2Bcpu [v2.2.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.2.0%2Bcpu [v2.1.100+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.0%2Bcpu @@ -400,14 +426,17 @@ It is the image user's responsibility to ensure that any use of The images below [v2.0.100+cpu]: 
https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.0.0%2Bcpu [v2.0.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.0.0%2Bcpu +[ccl-v2.4.0]: https://github.com/intel/torch-ccl/releases/tag/v2.4.0%2Bcpu%2Brc0 [ccl-v2.3.0]: https://github.com/intel/torch-ccl/releases/tag/v2.3.0%2Bcpu [ccl-v2.2.0]: https://github.com/intel/torch-ccl/releases/tag/v2.2.0%2Bcpu [ccl-v2.1.0]: https://github.com/intel/torch-ccl/releases/tag/v2.1.0%2Bcpu [ccl-v2.0.0]: https://github.com/intel/torch-ccl/releases/tag/v2.1.0%2Bcpu -[v4.41.2]: https://github.com/huggingface/transformers/releases/tag/v4.41.2 +[v4.44.0]: https://github.com/huggingface/transformers/releases/tag/v4.44.0 +[950]: https://dgpu-docs.intel.com/releases/stable_950_13_20240814.html +[914]: https://dgpu-docs.intel.com/releases/stable_914_33_20240730.html [803]: https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html [736]: https://dgpu-docs.intel.com/releases/stable_736_25_20231031.html [647]: https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html diff --git a/pytorch/docker-compose.yaml b/pytorch/docker-compose.yaml index 03f51ab4..efacdea1 100644 --- a/pytorch/docker-compose.yaml +++ b/pytorch/docker-compose.yaml @@ -25,16 +25,16 @@ services: BASE_IMAGE_NAME: ${BASE_IMAGE_NAME:-ubuntu} BASE_IMAGE_TAG: ${BASE_IMAGE_TAG:-22.04} GITHUB_RUN_NUMBER: ${GITHUB_RUN_NUMBER:-0} - IPEX_VERSION: ${IPEX_VERSION:-2.3.0} + IPEX_VERSION: ${IPEX_VERSION:-2.4.0} MINIFORGE_VERSION: ${MINIFORGE_VERSION:-Linux-x86_64} NO_PROXY: '' PACKAGE_OPTION: ${PACKAGE_OPTION:-pip} PYTHON_VERSION: ${PYTHON_VERSION:-3.10} - PYTORCH_VERSION: ${PYTORCH_VERSION:-2.3.0+cpu} + PYTORCH_VERSION: ${PYTORCH_VERSION:-2.4.0+cpu} REGISTRY: ${REGISTRY} REPO: ${REPO} - TORCHAUDIO_VERSION: ${TORCHAUDIO_VERSION:-2.3.0+cpu} - TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.18.0+cpu} + TORCHAUDIO_VERSION: ${TORCHAUDIO_VERSION:-2.4.0} + TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.19.0} context: . 
labels: dependency.python: ${PYTHON_VERSION:-3.10} @@ -43,21 +43,21 @@ services: org.opencontainers.base.name: "intel/python:3.10-core" org.opencontainers.image.name: "intel/intel-optimized-pytorch" org.opencontainers.image.title: "Intel® Extension for PyTorch Base Image" - org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-base + org.opencontainers.image.version: ${IPEX_VERSION:-2.4.0}-${PACKAGE_OPTION:-pip}-base target: ipex-base-${PACKAGE_OPTION:-pip} command: > sh -c "python -c 'import torch; import intel_extension_for_pytorch as ipex; print(\"torch:\", torch.__version__, \" ipex:\",ipex.__version__)'" depends_on: - ${PACKAGE_OPTION:-pip} - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-base + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-base pull_policy: always jupyter: build: labels: dependency.python.pip: jupyter-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.4.0}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for PyTorch Jupyter Image" - org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-jupyter + org.opencontainers.image.version: ${IPEX_VERSION:-2.4.0}-${PACKAGE_OPTION:-pip}-jupyter target: jupyter command: > bash -c "python -m jupyter --version" @@ -65,7 +65,7 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} extends: ipex-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-jupyter + 
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-jupyter network_mode: host ports: - 8888:8888 @@ -79,9 +79,9 @@ services: dependency.pip.apt.virtualenv: true dependency.pip.deepspeed: 0.14.4 dependency.python.pip: multinode/requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.4.0}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for PyTorch MultiNode Image" - org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode + org.opencontainers.image.version: ${IPEX_VERSION:-2.4.0}-${PACKAGE_OPTION:-pip}-multinode target: multinode command: > bash -c "python -c 'import neural_compressor;import oneccl_bindings_for_pytorch as oneccl;import deepspeed; @@ -89,82 +89,65 @@ services: \"\\nOneCCL:\", oneccl.__version__, \"\\nDeepspeed:\", deepspeed.__version__)'" extends: ipex-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-oneccl-inc-${INC_VERSION:-3.0} shm_size: 2gb xpu: build: args: - CCL_VER: ${CCL_VER:-2021.12.0-309} - DPCPP_VER: ${DPCPP_VER:-2024.1.0-963} - ICD_VER: ${ICD_VER:-23.43.27642.40-803~22.04} - IPEX_XPU_VERSION: ${IPEX_VERSION:-2.1.20} - LEVEL_ZERO_DEV_VER: ${LEVEL_ZERO_DEV_VER:-1.14.0-744~22.04} - LEVEL_ZERO_GPU_VER: ${LEVEL_ZERO_GPU_VER:-1.3.27642.40-803~22.04} - LEVEL_ZERO_VER: ${LEVEL_ZERO_VER:-1.14.0-744~22.04} - MKL_VER: ${MKL_VER:-2024.1.0-691} + ICD_VER: 
${ICD_VER:-24.26.30049.10-950~22.04} + LEVEL_ZERO_DEV_VER: ${LEVEL_ZERO_DEV_VER:-1.17.6-950~22.04} + LEVEL_ZERO_GPU_VER: ${LEVEL_ZERO_GPU_VER:-1.3.30049.10-950~22.04} + LEVEL_ZERO_VER: ${LEVEL_ZERO_VER:-1.17.6-950~22.04} NO_PROXY: '' - ONECCL_VERSION: ${ONECCL_VERSION:-2.1.300} PACKAGE_OPTION: ${PACKAGE_OPTION:-pip} - PYTORCH_XPU_VERSION: ${PYTORCH_VERSION:-2.1.0} - TORCHVISION_XPU_VERSION: ${TORCHVISION_VERSION:-0.16.0} labels: dependency.apt.build-essential: true dependency.apt.clinfo: true dependency.apt.git: true dependency.apt.gnupg2: true dependency.apt.gpg-agent: true - dependency.apt.intel-level-zero-gpu: ${LEVEL_ZERO_GPU_VER:-1.3.27642.40-803~22.04} - dependency.apt.intel-oneapi-runtime-ccl: ${CCL_VER:-2021.12.0-309} - dependency.apt.intel-oneapi-runtime-dpcpp-cpp: ${DPCPP_VER:-2024.1.0-963} - dependency.apt.intel-oneapi-runtime-mkl: ${MKL_VER:-2024.1.0-691} - dependency.apt.intel-opencl-icd: ${ICD_VER:-23.43.27642.40-803~22.04} - dependency.apt.level-zero: ${LEVEL_ZERO_VER:-1.14.0-744~22.04} - dependency.apt.level-zero-dev: ${LEVEL_ZERO_DEV_VER:-1.14.0-744~22.04} + dependency.apt.intel-level-zero-gpu: ${LEVEL_ZERO_GPU_VER:-1.3.30049.10-950~22.04} + dependency.apt.intel-opencl-icd: ${ICD_VER:-24.26.30049.10-950~22.04} + dependency.apt.level-zero: ${LEVEL_ZERO_VER:-1.17.6-950~22.04} + dependency.apt.level-zero-dev: ${LEVEL_ZERO_DEV_VER:-1.17.6-950~22.04} dependency.apt.rsync: true dependency.apt.unzip: true dependency.idp.pip: false org.opencontainers.base.name: "intel/python:3.10-core" org.opencontainers.image.title: "Intel® Extension for PyTorch XPU Base Image" - org.opencontainers.image.version: ${IPEX_VERSION:-2.1.20}-xpu-${PACKAGE_OPTION:-pip}-base - target: ipex-xpu-base + org.opencontainers.image.version: ${IPEX_VERSION:-2.3.110}-xpu-${PACKAGE_OPTION:-pip}-base + target: ipex-xpu-base-wheels-${PACKAGE_OPTION:-pip} command: > python -c "import torch;print(torch.device('xpu'));import intel_extension_for_pytorch as - 
ipex;print(ipex.xpu.is_available());print(torch.__version__); + ipex;print(torch.xpu.has_xpu());print(torch.__version__); print(ipex.__version__); [print(f'[{i}]: - {ipex.xpu.get_device_properties(i)}') for i in - range(ipex.xpu.device_count())];" + {torch.xpu.get_device_properties(i)}') for i in + range(torch.xpu.device_count())];" extends: ipex-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.1.30xpu}-xpu-base + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.110xpu}-xpu-base xpu-jupyter: build: args: - CCL_VER: ${CCL_VER:-2021.12.0-309} - DPCPP_VER: ${DPCPP_VER:-2024.1.0-963} - ICD_VER: ${ICD_VER:-23.43.27642.40-803~22.04} - IPEX_XPU_VERSION: ${IPEX_VERSION:-2.1.20} - LEVEL_ZERO_DEV_VER: ${LEVEL_ZERO_DEV_VER:-1.14.0-744~22.04} - LEVEL_ZERO_GPU_VER: ${LEVEL_ZERO_GPU_VER:-1.3.27642.40-803~22.04} - LEVEL_ZERO_VER: ${LEVEL_ZERO_VER:-1.14.0-744~22.04} - MKL_VER: ${MKL_VER:-2024.1.0-691} + ICD_VER: ${ICD_VER:-24.26.30049.10-950~22.04} + LEVEL_ZERO_DEV_VER: ${LEVEL_ZERO_DEV_VER:-1.17.6-950~22.04} + LEVEL_ZERO_GPU_VER: ${LEVEL_ZERO_GPU_VER:-1.3.30049.10-950~22.04} + LEVEL_ZERO_VER: ${LEVEL_ZERO_VER:-1.17.6-950~22.04} NO_PROXY: '' - ONECCL_VERSION: ${ONECCL_VERSION:-2.1.200} PACKAGE_OPTION: ${PACKAGE_OPTION:-pip} - PYTORCH_XPU_VERSION: ${PYTORCH_VERSION:-2.1.0} - TORCHVISION_XPU_VERSION: ${TORCHVISION_VERSION:-0.16.0} labels: dependency.python.pip: jupyter-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.1.20}-xpu-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.3.110}-xpu-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for PyTorch XPU Jupyter Image" - 
org.opencontainers.image.version: ${IPEX_VERSION:-2.1.20}-xpu-${PACKAGE_OPTION:-pip}-jupyter + org.opencontainers.image.version: ${IPEX_VERSION:-2.3.110}-xpu-${PACKAGE_OPTION:-pip}-jupyter target: ipex-xpu-jupyter command: > bash -c "python -m jupyter --version" extends: ipex-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.1.30xpu}-xpu-jupyter + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.110xpu}-xpu-jupyter ports: - 8888:8888 - torchserve: + torchserve-cpu: build: args: PACKAGE_OPTION: pip @@ -173,33 +156,54 @@ services: dependency.apt.openjdk-17-jdk: true dependency.idp: false dependency.python.ipex: requirements.txt - dependency.python.pip: torchserve-requirements.txt + dependency.python.pip: serving/torchserve-requirements.txt docs: serving org.opencontainers.base.name: "intel/python:3.10-core" org.opencontainers.image.title: "Intel® Extension for PyTorch Serving Image" - org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-serving-cpu - target: torchserve + org.opencontainers.image.version: ${IPEX_VERSION:-2.4.0}-serving-cpu + target: torchserve-cpu command: torchserve --version entrypoint: "" extends: ipex-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve-cpu ports: - 8080:8080 - 8081:8081 - 8082:8082 - 7070:7070 - 7071:7071 + torchserve-xpu: + build: + args: + PACKAGE_OPTION: pip + XPU_SMI_VERSION: ${XPU_SMI_VERSION:-1.2.38} + TORCHSERVE_BASE: ipex-xpu-base + labels: + dependency.apt.numactl: true + dependency.apt.openjdk-17-jdk: true + 
dependency.apt.xpu-smi: ${XPU_SMI_VERSION:-1.2.38} + dependency.idp: false + dependency.python.pip: serving/torchserve-xpu-requirements.txt + docs: serving + org.opencontainers.base.name: "intel/python:3.10-core" + org.opencontainers.image.title: "Intel® Extension for PyTorch XPU Serving Image" + org.opencontainers.image.version: ${IPEX_VERSION:-2.3.110}-serving-xpu + target: torchserve-xpu + command: torchserve --version + entrypoint: "" + extends: xpu + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve-xpu hf-genai: build: args: - HF_VERSION: ${HF_VERSION:-4.41.2} + HF_VERSION: ${HF_VERSION:-4.44.0} labels: dependency.python.pip: hf-genai-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.3.0}-${PACKAGE_OPTION:-pip}-multinode" + org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.4.0}-${PACKAGE_OPTION:-pip}-multinode" org.opencontainers.image.title: "Intel® Extension for PyTorch MultiNode Huggingface Generative AI Image" - org.opencontainers.image.version: ${IPEX_VERSION:-2.3.0}-${PACKAGE_OPTION:-pip}-multinode-hf-${HF_VERSION:-4.41.2}-genai" + org.opencontainers.image.version: ${IPEX_VERSION:-2.4.0}-${PACKAGE_OPTION:-pip}-multinode-hf-${HF_VERSION:-4.44.0}-genai" target: hf-genai extends: ipex-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-hf-${HF_VERSION:-4.41.2} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-hf-${HF_VERSION:-4.44.0} command: > - bash -c "python-c 'import transformers; print(transformers.__version__)'" + bash -c "python -c 'import transformers; print(transformers.__version__)'" diff --git 
a/pytorch/hf-genai-requirements.txt b/pytorch/hf-genai-requirements.txt index df77695f..8eb7fb3a 100644 --- a/pytorch/hf-genai-requirements.txt +++ b/pytorch/hf-genai-requirements.txt @@ -1,13 +1,13 @@ -accelerate==0.32.1 -datasets==2.20.0 +accelerate==0.34.2 +datasets==3.0.0 einops==0.8.0 -evaluate==0.4.2 -onnxruntime-extensions==0.11.0 -onnxruntime==1.18.1 -peft==0.11.1 -protobuf==5.27.2 +evaluate==0.4.3 +onnxruntime-extensions==0.12.0 +onnxruntime==1.19.2 +peft==0.12.0 +protobuf==5.28.1 py-cpuinfo==9.0.0 -scikit-learn==1.5.1 +scikit-learn==1.5.2 SentencePiece==0.2.0 tokenizers==0.19.1 -transformers==4.42.4 +transformers==4.44.2 diff --git a/pytorch/jupyter-requirements.txt b/pytorch/jupyter-requirements.txt index b5ab6652..4313b738 100644 --- a/pytorch/jupyter-requirements.txt +++ b/pytorch/jupyter-requirements.txt @@ -1,4 +1,4 @@ -jupyterlab==4.3.0a2 +jupyterlab==4.3.0b1 jupyterhub==5.1.0 notebook==7.3.0a1 jupyter-server-proxy>=4.1.2 diff --git a/pytorch/multinode/requirements.txt b/pytorch/multinode/requirements.txt index 53f579ca..a303e658 100644 --- a/pytorch/multinode/requirements.txt +++ b/pytorch/multinode/requirements.txt @@ -1,5 +1,5 @@ -neural-compressor==2.6 -oneccl_bind_pt==2.3.0+cpu ---extra-index-url https://developer.intel.com/ipex-whl-stable-cpu +neural-compressor==3.0.2 +oneccl_bind_pt==2.4.0+cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ oneccl-devel>=2021.13.0 # required to build deepspeed ops mpi4py>=3.1.0 # required to build deepspeed ops diff --git a/pytorch/notebooks/ipex-xpu.ipynb b/pytorch/notebooks/ipex-xpu.ipynb index 662dd634..45df4c35 100644 --- a/pytorch/notebooks/ipex-xpu.ipynb +++ b/pytorch/notebooks/ipex-xpu.ipynb @@ -25,13 +25,13 @@ "outputs": [], "source": [ "import intel_extension_for_pytorch as ipex\n", - "print(ipex.xpu.is_available())\n", - "if (not ipex.xpu.is_available()):\n", + "print(torch.xpu.has_xpu())\n", + "if (not torch.xpu.is_available()):\n", " print('Intel GPU not detected. 
Please install GPU with compatible drivers')\n", " sys.exit(1)\n", - "print(ipex.xpu.has_onemkl())\n", + "print(torch.xpu.has_onemkl())\n", "print(torch.__version__); print(ipex.__version__)\n", - "[print(f'[{i}]: {ipex.xpu.get_device_properties(i)}') for i in range(ipex.xpu.device_count())]\n" + "[print(f'[{i}]: {torch.xpu.get_device_properties(i)}') for i in range(torch.xpu.device_count())]\n" ] } ], diff --git a/pytorch/requirements.txt b/pytorch/requirements.txt index 6e20f9ea..664b5ad8 100644 --- a/pytorch/requirements.txt +++ b/pytorch/requirements.txt @@ -1,6 +1,6 @@ -torch==2.3.1 -torchvision==0.18.1 -torchaudio==2.3.1 --f https://download.pytorch.org/whl/cpu/torch_stable.html -intel_extension_for_pytorch==2.3.100+cpu ---extra-index-url https://developer.intel.com/ipex-whl-stable-cpu +torch==2.4.0+cpu +torchvision==0.19.0+cpu +torchaudio==2.4.0+cpu +--extra-index-url https://download.pytorch.org/whl/cpu +intel_extension_for_pytorch==2.4.0+cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ diff --git a/pytorch/serving/README.md b/pytorch/serving/README.md index 133c48f4..c0a5413c 100644 --- a/pytorch/serving/README.md +++ b/pytorch/serving/README.md @@ -12,29 +12,73 @@ The [Torchserve Model Archiver](https://github.com/pytorch/serve/blob/master/mod Follow the instructions found in the link above depending on whether you are intending to archive a model or a workflow. 
Use the provided container rather than installing the archiver with the example command below: +#### Create a Model Archive for CPU device + ```bash curl -O https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth docker run --rm -it \ + --entrypoint='' \ + -u root \ -v $PWD:/home/model-server \ - intel/intel-optimized-pytorch:2.2.0-serving-cpu \ - torch-model-archiver --model-name squeezenet \ - --version 1.0 \ - --model-file model-archive/model.py \ - --serialized-file squeezenet1_1-b8a52dc0.pth \ - --handler image_classifier \ - --export-path /home/model-server + intel/intel-optimized-pytorch:2.4.0-serving-cpu \ + torch-model-archiver --model-name squeezenet1_1 \ + --version 1.1 \ + --model-file model-archive/model.py \ + --serialized-file squeezenet1_1-b8a52dc0.pth \ + --handler image_classifier \ + --export-path /home/model-server +``` + +### Create a Model Archive for XPU device + +Use a squeezenet model [optimized](./model-store/ipex_squeezenet.py) for XPU using Intel® Extension for PyTorch*. + +```bash +docker run --rm -it \ + --entrypoint='' \ + -u root \ + -v $PWD:/home/model-server \ + --device /dev/dri \ + intel/intel-optimized-pytorch:2.3.110-serving-xpu \ + sh -c 'python model-archive/ipex_squeezenet.py && \ + torch-model-archiver --model-name squeezenet1_1 \ + --version 1.1 \ + --serialized-file squeezenet1_1-jit.pt \ + --handler image_classifier \ + --export-path /home/model-server' ``` ### Test Model Test Torchserve with the new archived model. The example below is for the squeezenet model. 
+#### Run Torchserve for CPU device + ```bash # Assuming that the above pre-archived model is in the current working directory docker run -d --rm --name server \ -v $PWD:/home/model-server/model-store \ + -v $PWD/wf-store:/home/model-server/wf-store \ --net=host \ - intel/intel-optimized-pytorch:2.2.0-serving-cpu + intel/intel-optimized-pytorch:2.4.0-serving-cpu +``` + +#### Run Torchserve for XPU device + +```bash +# Assuming that the above pre-archived model is in the current working directory +docker run -d --rm --name server \ + -v $PWD:/home/model-server/model-store \ + -v $PWD/wf-store:/home/model-server/wf-store \ + -v $PWD/config-xpu.properties:/home/model-server/config.properties \ + --net=host \ + --device /dev/dri \ + intel/intel-optimized-pytorch:2.3.110-serving-xpu +``` + +After lauching the container, follow the steps below: + +```bash # Verify that the container has launched successfully docker logs server # Attempt to register the model and make an inference request @@ -87,7 +131,7 @@ As demonstrated in the above example, models must be registered before they can -v $PWD:/home/model-server/model-store \ -v $PWD/config.properties:/home/model-server/config.properties \ --net=host \ - intel/intel-optimized-pytorch:2.2.0-serving-cpu + intel/intel-optimized-pytorch:2.4.0-serving-cpu # Verify that the container has launched successfully docker logs server # Check the models list @@ -111,7 +155,7 @@ As demonstrated in the above example, models must be registered before they can ### KServe -Apply Intel Optimizations to KServe by patching the serving runtimes to use Intel Optimized Serving Containers with `kubectl apply -f patch.yaml` +Apply Intel Optimizations to KServe by patching the serving runtimes to use Serving Containers with Intel Optimizations via `kubectl apply -f patch.yaml` > [!NOTE] > You can modify this `patch.yaml` file to change the serving runtime pod configuration. 
diff --git a/pytorch/serving/build-kfs.sh b/pytorch/serving/build-kfs.sh index 7cdedc93..89e30823 100755 --- a/pytorch/serving/build-kfs.sh +++ b/pytorch/serving/build-kfs.sh @@ -16,7 +16,7 @@ cd .. || exit docker compose pull torchserve -docker tag "$(docker images -q | head -n1)" intel/torchserve:latest +docker tag "${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-ubuntu-22.04-py3.10-torchserve" intel/torchserve:latest git clone https://github.com/pytorch/serve cd serve/kubernetes/kserve || exit git apply ../../../serving/kfs.patch diff --git a/pytorch/serving/config-xpu.properties b/pytorch/serving/config-xpu.properties new file mode 100644 index 00000000..170a1485 --- /dev/null +++ b/pytorch/serving/config-xpu.properties @@ -0,0 +1,15 @@ +inference_address=http://0.0.0.0:8080 +management_address=http://0.0.0.0:8081 +metrics_address=http://0.0.0.0:8082 +number_of_netty_threads=32 +install_py_dep_per_model=true +job_queue_size=1000 +model_store=/home/model-server/model-store +workflow_store=/home/model-server/wf-store +allowed_urls=https://s3.amazonaws.com/.*,https://torchserve.pytorch.org/.* +ipex_enable=true +ipex_gpu_enable=true +system_metrics_cmd=/home/model-server/intel_gpu_metric_collector.py --gpu 1 +disable_token_authorization=true +enable_model_api=true +enable_envvars_config=true diff --git a/pytorch/serving/config.properties b/pytorch/serving/config.properties index 8f17094d..ecaec9e0 100644 --- a/pytorch/serving/config.properties +++ b/pytorch/serving/config.properties @@ -12,3 +12,4 @@ cpu_launcher_enable=true cpu_launcher_args=--use_logical_core disable_token_authorization=true enable_model_api=true +enable_envvars_config=true diff --git a/pytorch/serving/model-archive/ipex_squeezenet.py b/pytorch/serving/model-archive/ipex_squeezenet.py new file mode 100644 index 00000000..14c0dcb4 --- /dev/null +++ b/pytorch/serving/model-archive/ipex_squeezenet.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. +# based on https://github.com/pytorch/pytorch/blob/master/Dockerfile +# +# NOTE: To build this you will need a docker version >= 19.03 and DOCKER_BUILDKIT=1 +# +# If you do not use buildkit you are not going to have a good time +# +# For reference: +# https://docs.docker.com/develop/develop-images/build_enhancements/ + +# pylint: skip-file + +import intel_extension_for_pytorch as ipex +import torch +import torchvision.models as models + +# load the model +model = models.squeezenet1_1(pretrained=True) +model = model.eval() + +# define dummy input tensor to use for the model's forward call to record operations in the model for tracing +N, C, H, W = 1, 3, 224, 224 +data = torch.randn(N, C, H, W) + +model.eval() +data = torch.rand(1, 3, 224, 224) + +#################### code changes ################# +model = model.to("xpu") +data = data.to("xpu") +model = ipex.optimize(model, dtype=torch.bfloat16) +#################### code changes ################# + +with torch.no_grad(): + with torch.xpu.amp.autocast(enabled=True, dtype=torch.bfloat16): + ############################# code changes ##################### + model = torch.jit.trace(model, data) + model = torch.jit.freeze(model) + model(data) +torch.jit.save(model, 
"squeezenet1_1-jit.pt") diff --git a/pytorch/serving/model-archive/mar-test.sh b/pytorch/serving/model-archive/mar-test.sh index f07b83ad..aabee71f 100644 --- a/pytorch/serving/model-archive/mar-test.sh +++ b/pytorch/serving/model-archive/mar-test.sh @@ -26,8 +26,18 @@ # For reference: # https://docs.docker.com/develop/develop-images/build_enhancements/ -wget https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth -torch-model-archiver --model-name squeezenet1_1 --version 1.1 --model-file /home/model-server/model-archive/model.py --serialized-file squeezenet1_1-b8a52dc0.pth --handler image_classifier --export-path /home/model-server/model-store +if [[ "$1" == "cpu" ]]; then + wget https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth + torch-model-archiver --model-name squeezenet1_1 --version 1.1 --model-file /home/model-server/model-archive/model.py --serialized-file squeezenet1_1-b8a52dc0.pth --handler image_classifier --export-path /home/model-server/model-store + rm -rf squeezenet1_1-b8a52dc0.pth +elif [[ "$1" == "xpu" ]]; then + python /home/model-server/model-archive/ipex_squeezenet.py + torch-model-archiver --model-name squeezenet1_1 --version 1.1 --serialized-file squeezenet1_1-jit.pt --handler image_classifier --export-path /home/model-server/model-store + rm -rf squeezenet1_1-jit.pt +else + echo "Only cpu and xpu devices supported" + exit 1 +fi + [ -f "/home/model-server/model-store/squeezenet1_1.mar" ] && echo "squeezenet1_1.pth Archived Succesfully at /home/model-server/model-store/squeezenet1_1.mar" -rm -rf squeezenet1_1-b8a52dc0.pth find . 
| grep -E "(/__pycache__$|\.pyc$|\.pyo$)" | xargs rm -rf diff --git a/pytorch/serving/patch.yaml b/pytorch/serving/patch.yaml index 487eab34..cd18e847 100644 --- a/pytorch/serving/patch.yaml +++ b/pytorch/serving/patch.yaml @@ -242,7 +242,7 @@ spec: - grpc-v1 containers: - name: kserve-container - image: "intel/intel-extension-for-pytorch:2.2.0-serving-cpu-kserve" + image: "intel/intel-extension-for-pytorch:2.4.0-serving-cpu-kserve" args: - torchserve - --start diff --git a/pytorch/serving/tests.yaml b/pytorch/serving/tests.yaml index 3c91eced..986e220a 100644 --- a/pytorch/serving/tests.yaml +++ b/pytorch/serving/tests.yaml @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -ipex-serving-model-archive: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve - cmd: /home/model-server/model-archive/mar-test.sh +ipex-serving-cpu-model-archive: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve-cpu + cmd: /home/model-server/model-archive/mar-test.sh cpu entrypoint: /bin/bash volumes: - src: $PWD/pytorch/serving/model-archive @@ -23,8 +23,20 @@ ipex-serving-model-archive: dst: /home/model-server/model-store user: root workdir: /home/model-server/model-archive -ipex-serving-workflow-archive: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve +ipex-serving-xpu-model-archive: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve-xpu + cmd: /home/model-server/model-archive/mar-test.sh xpu + entrypoint: /bin/bash + device: ["/dev/dri"] + volumes: + - src: $PWD/pytorch/serving/model-archive + dst: /home/model-server/model-archive + - src: 
$PWD/pytorch/serving/model-store + dst: /home/model-server/model-store + user: root + workdir: /home/model-server/model-archive +ipex-serving-cpu-workflow-archive: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve-cpu cmd: /home/model-server/model-archive/war-test.sh entrypoint: /bin/bash volumes: @@ -34,10 +46,23 @@ ipex-serving-workflow-archive: dst: /home/model-server/wf-store user: root workdir: /home/model-server/model-archive -ipex-serving-rest-workflow: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve +ipex-serving-cpu-rest-workflow: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve-cpu + cmd: bash /home/model-server/wf-store/rest-test.sh + serving: True + volumes: + - src: $PWD/pytorch/serving/model-store + dst: /home/model-server/model-store + - src: $PWD/pytorch/serving/wf-store + dst: /home/model-server/wf-store + env: + ENABLE_TORCH_PROFILER: 'true' + shm_size: 1g +ipex-serving-xpu-rest-workflow: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve-xpu cmd: bash /home/model-server/wf-store/rest-test.sh serving: True + device: ["/dev/dri"] volumes: - src: $PWD/pytorch/serving/model-store dst: /home/model-server/model-store @@ -47,8 +72,8 @@ ipex-serving-rest-workflow: ENABLE_TORCH_PROFILER: 'true' shm_size: 1g workdir: /home/model-server/wf-store -ipex-serving-rest-inference: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve +ipex-serving-cpu-rest-inference: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve-cpu 
cmd: bash /home/model-server/model-store/rest-test.sh serving: True volumes: @@ -60,8 +85,8 @@ ipex-serving-rest-inference: ENABLE_TORCH_PROFILER: 'true' shm_size: 1g workdir: /home/model-server/model-store -ipex-serving-grpc-inference: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve +ipex-serving-cpu-grpc-inference: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-py${PYTHON_VERSION:-3.10}-torchserve-cpu cmd: bash /home/model-server/model-store/grpc-test.sh serving: True volumes: diff --git a/pytorch/torchserve-requirements.txt b/pytorch/serving/torchserve-requirements.txt similarity index 56% rename from pytorch/torchserve-requirements.txt rename to pytorch/serving/torchserve-requirements.txt index 0dbb45d5..41d78b17 100644 --- a/pytorch/torchserve-requirements.txt +++ b/pytorch/serving/torchserve-requirements.txt @@ -6,7 +6,7 @@ pyyaml>=6.0.1 torch-model-archiver==0.11.1 torch-workflow-archiver==0.2.14 torchserve==0.11.1 -torchtext==0.18.0 -torchvision==0.18.1 --f https://developer.intel.com/ipex-whl-stable-cpu -intel_extension_for_pytorch==2.3.100+cpu +torchtext==0.18.0+cpu +torchvision==0.19.0 +-f https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ +intel_extension_for_pytorch==2.4.0+cpu diff --git a/pytorch/serving/torchserve-xpu-requirements.txt b/pytorch/serving/torchserve-xpu-requirements.txt new file mode 100644 index 00000000..6cd3ff99 --- /dev/null +++ b/pytorch/serving/torchserve-xpu-requirements.txt @@ -0,0 +1,14 @@ +torch==2.3.1+cxx11.abi +torchvision==0.18.1+cxx11.abi +torchaudio==2.3.1+cxx11.abi +intel_extension_for_pytorch==2.3.110+xpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us +numpy==2.1.1 +captum>=0.7.0 +cython>=3.0.10 +pynvml>=11.5.0 +pyyaml>=6.0.1 +-f https://download.pytorch.org/whl/torch_stable.html +torch-model-archiver==0.11.1 
+torch-workflow-archiver==0.2.14 +torchserve==0.11.1 diff --git a/pytorch/serving/wf-store/rest-test.sh b/pytorch/serving/wf-store/rest-test.sh index 2e5850aa..2c37871f 100644 --- a/pytorch/serving/wf-store/rest-test.sh +++ b/pytorch/serving/wf-store/rest-test.sh @@ -33,13 +33,13 @@ apt-get -y install curl curl --fail -X GET http://localhost:8080/ping -cd ../model-store || exit +cd /home/model-server/model-store || exit curl --fail -O https://torchserve.pytorch.org/mar_files/cat_dog_classification.mar curl --fail -O https://torchserve.pytorch.org/mar_files/dog_breed_classification.mar curl --fail -X POST "http://127.0.0.1:8081/models?url=cat_dog_classification.mar" curl --fail -X POST "http://127.0.0.1:8081/models?url=dog_breed_classification.mar" -cd ../wf-store || exit +cd /home/model-server/wf-store || exit curl --fail -X POST "http://127.0.0.1:8081/workflows?url=dog_breed_wf.war" curl --fail -O https://raw.githubusercontent.com/pytorch/serve/master/examples/Workflows/dog_breed_classification/model_input/Cat.jpg diff --git a/pytorch/tests/tests.yaml b/pytorch/tests/tests.yaml index 1011c7a0..ceb3df74 100644 --- a/pytorch/tests/tests.yaml +++ b/pytorch/tests/tests.yaml @@ -13,66 +13,66 @@ # limitations under the License. 
import-ipex-cpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-base cmd: python -c "import torch;import intel_extension_for_pytorch as ipex;print(f'torch {torch.__version__} ipex {ipex.__version__}')" import-ipex-xpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.1.30xpu}-xpu-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.110xpu}-xpu-base cmd: python -c "import torch; import intel_extension_for_pytorch as ipex;[print(f'[{i}] {torch.xpu.get_device_properties(i)}') for i in range(torch.xpu.device_count())];" device: ["/dev/dri"] import-cpu-jupyter-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-jupyter + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-jupyter cmd: python -m jupyter --version import-xpu-jupyter-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.1.30xpu}-xpu-jupyter + img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.110xpu}-xpu-jupyter cmd: python -m jupyter --version device: ["/dev/dri"] import-cpu-oneccl-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-oneccl-inc-${INC_VERSION:-3.0} cmd: python -c "'import oneccl_bindings_for_pytorch as oneccl;print(oneccl.__version__)'" import-cpu-transformers-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-hf-${HF_VERSION:-4.41.2} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-hf-${HF_VERSION:-4.44.0} cmd: python -c "import transformers;print(f'transformers {transformers.__version__}');assert transformers.utils.import_utils.is_ipex_available()" import-cpu-inc-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-oneccl-inc-${INC_VERSION:-3.0} cmd: python -c "'import neural_compressor as inc;print(inc.__version__)'" import-cpu-deepspeed-${PACKAGE_OPTION:-pip}: - 
img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-oneccl-inc-${INC_VERSION:-3.0} cmd: ds_report shm_size: 2gb ipex-cpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-base cmd: python /tests/ipex-resnet50.py --ipex --device cpu --backend gloo volumes: - dst: /tests src: $PWD/pytorch/tests ipex-xpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.1.30xpu}-xpu-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.110xpu}-xpu-base cmd: python /tests/ipex-resnet50.py --ipex --device xpu device: ["/dev/dri"] volumes: - dst: /tests src: $PWD/pytorch/tests ipex-xpu-jupyter-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.1.30xpu}-xpu-jupyter + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.110xpu}-xpu-jupyter cmd: papermill --log-output 
/jupyter/xpu.ipynb -k python3 device: ["/dev/dri"] notebook: True oneccl-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-oneccl-inc-${INC_VERSION:-3.0} cmd: ipexrun cpu /tests/ipex-resnet50.py --ipex --device cpu --backend ccl privileged: true volumes: - dst: /tests src: $PWD/pytorch/tests oneccl-ds-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-oneccl-inc-${INC_VERSION:-3.0} cmd: ipexrun cpu /tests/ipex-resnet50.py --ipex --device cpu --backend ccl --deepspeed privileged: true volumes: - dst: /tests src: $PWD/pytorch/tests inc-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.4.0}-oneccl-inc-${INC_VERSION:-3.0} cmd: python /tests/inc-quant.py volumes: - dst: /tests diff --git a/pytorch/venv-requirements.txt b/pytorch/venv-requirements.txt new file mode 100644 index 00000000..4d686efe --- /dev/null +++ b/pytorch/venv-requirements.txt @@ -0,0 +1,5 @@ +setuptools>=70.0.0 
+psutil==6.0.0 +mkl==2024.2.1 +mkl-include==2024.2.1 +intel-openmp==2024.2.1 diff --git a/pytorch/xpu-requirements.txt b/pytorch/xpu-requirements.txt index b64b92a4..217ecdf7 100644 --- a/pytorch/xpu-requirements.txt +++ b/pytorch/xpu-requirements.txt @@ -1,7 +1,11 @@ -torch==2.1.0.post2+cxx11.abi -torchvision==0.16.0.post2+cxx11.abi -torchaudio==2.1.0.post2+cxx11.abi -intel_extension_for_pytorch==2.1.30+xpu -oneccl_bind_pt==2.1.300+xpu +torch==2.3.1+cxx11.abi +torchvision==0.18.1+cxx11.abi +torchaudio==2.3.1+cxx11.abi +intel_extension_for_pytorch==2.3.110+xpu +oneccl_bind_pt==2.3.100+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us -setuptools==71.1.0 +numpy>=1.26.4 +idna>=3.7 +requests>=2.32.0 +tqdm>=4.66.3 +urllib3>=2.2.2 diff --git a/tensorflow/Dockerfile b/tensorflow/Dockerfile index 47dc728f..48fb7332 100644 --- a/tensorflow/Dockerfile +++ b/tensorflow/Dockerfile @@ -33,12 +33,11 @@ ENV KMP_AFFINITY='granularity=fine,verbose,compact,1,0' \ KMP_BLOCKTIME=1 \ KMP_SETTINGS=1 -ARG TF_VERSION - WORKDIR / COPY requirements.txt . -RUN python -m pip install --no-cache-dir -r requirements.txt +RUN python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/ ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/ @@ -53,12 +52,13 @@ ENV KMP_AFFINITY='granularity=fine,verbose,compact,1,0' \ ENV PATH /usr/bin:/root/conda/envs/idp/bin:/root/conda/condabin:~/conda/bin/:${PATH} ENV TF_ENABLE_ONEDNN_OPTS=1 -ARG TF_VERSION WORKDIR / COPY requirements.txt . 
-RUN python -m pip install --no-cache-dir -r requirements.txt +RUN conda run -n idp python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt && \ + conda clean -y --all ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/ ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/ @@ -77,37 +77,43 @@ EXPOSE 8888 CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/jupyter --port 8888 --ip 0.0.0.0 --no-browser --allow-root --ServerApp.token= --ServerApp.password= --ServerApp.allow_origin=* --ServerApp.base_url=$NB_PREFIX"] -FROM tf-base-${PACKAGE_OPTION} AS openmpi +FROM tf-base-${PACKAGE_OPTION} AS multinode RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + cmake \ + g++ \ + gcc \ + git \ + libgl1-mesa-glx \ + libglib2.0-0 \ libopenmpi-dev \ + numactl \ openmpi-bin \ - openmpi-common + openmpi-common \ + python3-dev \ + unzip \ + virtualenv -WORKDIR / -COPY ompi-requirements.txt . +ENV SIGOPT_PROJECT=. 
-RUN python -m pip install --no-cache-dir -r ompi-requirements.txt +WORKDIR / +COPY multinode/requirements.txt requirements.txt -FROM openmpi AS horovod +RUN python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt -ENV LD_LIBRARY_PATH /lib64/:/usr/lib64/:/usr/local/lib64 +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib" RUN apt-get install -y --no-install-recommends --fix-missing \ - unzip \ openssh-client \ openssh-server && \ - rm /etc/ssh/ssh_host_*_key \ - /etc/ssh/ssh_host_*_key.pub - -ENV OMPI_ALLOW_RUN_AS_ROOT=1 -ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 - -ENV OMPI_MCA_tl_tcp_if_exclude="lo,docker0" + rm /etc/ssh/ssh_host_*_key \ + /etc/ssh/ssh_host_*_key.pub && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* -# Install OpenSSH for MPI to communicate between containers -RUN mkdir -p /var/run/sshd && \ - echo 'LoginGraceTime 0' >> /etc/ssh/sshd_config +RUN mkdir -p /var/run/sshd # Install Horovod ARG HOROVOD_WITH_TENSORFLOW=1 @@ -116,43 +122,32 @@ ARG HOROVOD_WITHOUT_PYTORCH=1 ARG HOROVOD_WITHOUT_GLOO=1 ARG HOROVOD_WITH_MPI=1 -RUN apt-get install -y --no-install-recommends --fix-missing \ - build-essential \ - cmake \ - g++ \ - gcc \ - git \ - libgl1-mesa-glx \ - libglib2.0-0 \ - python3-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / -COPY hvd-requirements.txt . - -RUN python -m pip install --no-cache-dir -r hvd-requirements.txt - -ENV SIGOPT_PROJECT=. 
- -RUN wget --progress=dot:giga --no-check-certificate https://github.com/intel/neural-compressor/raw/master/docker/third-party-programs-tensorflow.txt -O /licenses/inc-third-party-programs-tensorflow.txt && \ - wget --progress=dot:giga --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licenses/INC_LICENSE +ENV LD_LIBRARY_PATH /lib64/:/usr/lib64/:/usr/local/lib64 -FROM horovod AS multinode-pip +RUN python -m pip install --no-cache-dir horovod==0.28.1 -WORKDIR / -COPY multinode-requirements.txt . +ARG PYTHON_VERSION -RUN python -m pip install --no-cache-dir -r multinode-requirements.txt +COPY multinode/generate_ssh_keys.sh /generate_ssh_keys.sh -FROM horovod AS multinode-idp +# modify generate_ssh_keys to be a helper script +# print how to use helper script on bash startup +# Avoids loop for further execution of the startup file +ARG PACKAGE_OPTION=pip +ARG PYPATH="/usr/local/lib/python${PYTHON_VERSION}/dist-packages" +RUN if [ "${PACKAGE_OPTION}" = "idp" ]; then PYPATH="/opt/conda/envs/idp/lib/python${PYTHON_VERSION}/site-packages"; fi && \ + echo "source ${PYPATH}/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \ + cat '/generate_ssh_keys.sh' >> ~/.startup && \ + rm -rf /generate_ssh_keys.sh -WORKDIR / -COPY multinode-requirements.txt . 
+COPY multinode/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh +COPY multinode/sshd_config /etc/ssh/sshd_config +COPY multinode/ssh_config /etc/ssh/ssh_config -RUN python -m pip install --no-cache-dir -r multinode-requirements.txt +RUN wget --progress=dot:giga --no-check-certificate https://github.com/intel/neural-compressor/raw/master/docker/third-party-programs-tensorflow.txt -O /licenses/inc-third-party-programs-tensorflow.txt && \ + wget --progress=dot:giga --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licenses/INC_LICENSE -FROM ${PYTHON_BASE} AS itex-xpu-base-pip +FROM ${PYTHON_BASE} AS itex-xpu-base RUN apt-get update && \ apt-get install -y --no-install-recommends --fix-missing \ @@ -219,55 +214,7 @@ ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/maste ENV LD_LIBRARY_PATH=/opt/intel/oneapi/redist/lib:$LD_LIBRARY_PATH -FROM ${PYTHON_BASE} AS itex-xpu-base-idp - -RUN apt-get update && \ - apt-get install -y --no-install-recommends --fix-missing \ - apt-utils \ - build-essential \ - clinfo \ - git \ - gnupg2 \ - gpg-agent \ - rsync \ - unzip \ - wget && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -ARG ICD_VER -ARG LEVEL_ZERO_GPU_VER -ARG LEVEL_ZERO_VER -ARG LEVEL_ZERO_DEV_VER - -RUN no_proxy="" NO_PROXY="" wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ - gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg -RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" | \ - tee /etc/apt/sources.list.d/intel-gpu-jammy.list - -RUN no_proxy="" NO_PROXY="" apt-get update && \ - apt-get install -y --no-install-recommends --fix-missing \ - intel-opencl-icd=${ICD_VER} \ - intel-level-zero-gpu=${LEVEL_ZERO_GPU_VER} \ - level-zero=${LEVEL_ZERO_VER} \ - level-zero-dev=${LEVEL_ZERO_DEV_VER} && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -ARG 
TF_VER="2.15.0" - -RUN conda install intel-extension-for-tensorflow=${TF_VER}=*xpu* \ - -c https://software.repos.intel.com/python/conda \ - -c conda-forge - -ENV LD_LIBRARY_PATH=/opt/conda/envs/idp/lib:$LD_LIBRARY_PATH - -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/ -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/ -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-programs-of-intel-tensorflow.txt /licenses/ -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-programs-of-intel-optimization-for-horovod.txt /licenses/ - -FROM itex-xpu-base-${PACKAGE_OPTION} AS itex-xpu-jupyter +FROM itex-xpu-base AS itex-xpu-jupyter WORKDIR /jupyter COPY jupyter-requirements.txt . 
diff --git a/tensorflow/README.md b/tensorflow/README.md index c92533ef..990ecf71 100644 --- a/tensorflow/README.md +++ b/tensorflow/README.md @@ -16,7 +16,7 @@ The images below include support for both CPU and GPU optimizations: | Tag(s) | TensorFlow | ITEX | Driver | Dockerfile | | ---------------------- | ----------- | -------------- | ------- | --------------- | -| `2.15.0.1-xpu`, `xpu` | [v2.15.1] | [v2.15.0.1] | [803.63]| [v0.4.0-Beta] | +| `2.15.0.1-xpu-pip-base`, `xpu` | [v2.15.1] | [v2.15.0.1] | [803.63]| [v0.4.0-Beta] | | `2.15.0.0-xpu` | [v2.15.0] | [v2.15.0.0] | [803] | [v0.4.0-Beta] | | `2.14.0.1-xpu` | [v2.14.1] | [v2.14.0.1] | [736] | [v0.3.4] | | `2.13.0.0-xpu` | [v2.13.0] | [v2.13.0.0] | [647] | [v0.2.3] | @@ -37,6 +37,7 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | TensorFlow | IPEX | Driver | Dockerfile | | ------------- | ----------- | ------------- | ------ | --------------- | +| `2.15.0.1-xpu-pip-jupyter` | [v2.15.1] | [v2.15.0.1] | [803.63]| [v0.4.0-Beta] | | `xpu-jupyter` | [v2.14.1] | [v2.14.0.1] | [736] | [v0.3.4] | ### Run the XPU Jupyter Container @@ -48,7 +49,7 @@ docker run -it --rm \ --device /dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ --ipc=host \ - intel/intel-extension-for-tensorflow:xpu-jupyter + intel/intel-extension-for-tensorflow:2.15.0.1-xpu-pip-jupyter ``` After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. 
@@ -84,7 +85,8 @@ The images below are built only with CPU optimizations (GPU acceleration support | Tag(s) | TensorFlow | ITEX | Dockerfile | | --------------------------- | ----------- | ------------ | --------------- | -| `2.15.0-pip-base`, `latest` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | +| `2.15.1-pip-base`, `latest` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | +| `2.15.0-pip-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | | `2.14.0-pip-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | | `2.13-pip-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | @@ -92,6 +94,7 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | TensorFlow | ITEX | Dockerfile | | -------------------- | ----------- | ------------- | --------------- | +| `2.15.1-pip-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | | `2.15.0-pip-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | | `2.14.0-pip-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | | `2.13-pip-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | @@ -104,7 +107,7 @@ docker run -it --rm \ --net=host \ -v $PWD/workspace:/workspace \ -w /workspace \ - intel/intel-extension-for-tensorflow:xpu-jupyter + intel/intel-extension-for-tensorflow:2.15.1-pip-jupyter ``` After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. 
@@ -115,10 +118,102 @@ The images below additionally include [Horovod]: | Tag(s) | Tensorflow | ITEX | Horovod | Dockerfile | | ------------------------------ | --------- | ------------ | --------- | --------------- | +| `2.15.1-pip-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] | | `2.15.0-pip-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] | | `2.14.0-pip-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] | | `2.13-pip-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] | +> [!NOTE] +> Passwordless SSH connection is also enabled in the image, but the container does not contain any SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/etc/ssh/authorized_keys`. + +> [!TIP] +> Before mounting any keys, modify the permissions of those files with `chmod 600 authorized_keys; chmod 600 id_rsa` to grant read access for the default user account. + +#### Setup and Run ITEX Multi-Node Container + +Some additional assembly is required to utilize this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, containers are assigned the roles of launcher and worker respectively: + +SSH Server (Worker) + +1. *Authorized Keys* : `/etc/ssh/authorized_keys` + +SSH Client (Launcher) + +1. *Private User Key* : `/root/.ssh/id_rsa` + +To add these files correctly please follow the steps described below. + +1. Setup ID Keys + + You can use the commands provided below to [generate the identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH. + + ```bash + ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa + touch authorized_keys + cat id_rsa.pub >> authorized_keys + ``` + +2. Configure the permissions and ownership for all of the files you have created so far + + ```bash + chmod 600 id_rsa config authorized_keys + chown root:root id_rsa.pub id_rsa config authorized_keys + ``` + +3. Create a hostfile for horovod. 
(Optional)
+
+   ```txt
+   Host host1
+   HostName
+   IdentitiesOnly yes
+   IdentityFile ~/.ssh/id_rsa
+   Port
+   Host host2
+   HostName
+   IdentitiesOnly yes
+   IdentityFile ~/.ssh/id_rsa
+   Port
+   ...
+   ```
+
+4. Configure [Horovod] in your python script
+
+   ```python
+   import horovod.tensorflow as hvd
+
+   hvd.init()
+   ```
+
+5. Now start the workers and execute DDP on the launcher
+
+   1. Worker run command:
+
+      ```bash
+      docker run -it --rm \
+         --net=host \
+         -v $PWD/authorized_keys:/etc/ssh/authorized_keys \
+         -v $PWD/tests:/workspace/tests \
+         -w /workspace \
+         intel/intel-optimized-tensorflow:2.15.1-pip-multinode \
+         bash -c '/usr/sbin/sshd -D'
+      ```
+
+   2. Launcher run command:
+
+      ```bash
+      docker run -it --rm \
+         --net=host \
+         -v $PWD/id_rsa:/root/.ssh/id_rsa \
+         -v $PWD/tests:/workspace/tests \
+         -v $PWD/hostfile:/root/.ssh/config \
+         -w /workspace \
+         intel/intel-optimized-tensorflow:2.15.1-pip-multinode \
+         bash -c 'horovodrun --verbose -np 2 -H host1:1,host2:1 /workspace/tests/tf_base_test.py'
+      ```
+
+> [!NOTE]
+> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network.
+ --- The images below are [TensorFlow* Serving] with CPU Optimizations: @@ -150,7 +245,8 @@ The images below are built only with CPU optimizations (GPU acceleration support | Tag(s) | TensorFlow | ITEX | Dockerfile | | --------------------------- | ----------- | ------------ | --------------- | -| `2.15.0-idp-base`, `latest` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | +| `2.15.1-idp-base` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | +| `2.15.0-idp-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | | `2.14.0-idp-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | | `2.13-idp-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | @@ -158,6 +254,7 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | TensorFlow | ITEX | Dockerfile | | -------------------- | ----------- | ------------- | --------------- | +| `2.15.1-idp-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | | `2.15.0-idp-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | | `2.14.0-idp-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | | `2.13-idp-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | @@ -166,13 +263,30 @@ The images below additionally include [Horovod]: | Tag(s) | Tensorflow | ITEX | Horovod | Dockerfile | | ------------------------------ | --------- | ------------ | --------- | --------------- | +| `2.15.1-idp-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] | | `2.15.0-idp-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] | | `2.14.0-idp-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] | | `2.13-idp-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] | +## XPU images with Intel® Distribution for Python* + +The images below are built only with CPU and GPU optimizations and include [Intel® Distribution for Python*]: + +| Tag(s) | Pytorch | ITEX | Driver | Dockerfile | +| ---------------- | -------- | ------------ | -------- | ------ | +| `2.15.0.1-xpu-idp-base` | [v2.15.1] | [v2.15.0.1] | [803] | 
[v0.4.0-Beta] | +| `2.15.0-xpu-idp-base` | [v2.15.0] | [v2.15.0.0] | [803] | [v0.4.0-Beta] | + +The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: + +| Tag(s) | Pytorch | IPEX | Driver | Jupyter Port | Dockerfile | +| --------------------- | -------- | ------------- | ------ | ------------ | --------------- | +| `2.15.0.1-xpu-idp-jupyter` | [v2.15.1] | [v2.15.0.1] | [803] | `8888` | [v0.4.0-Beta] | +| `2.15.0-xpu-idp-jupyter` | [v2.1.0] | [v2.15.0.0] | [803] | `8888` | [v0.4.0-Beta] | + ## Build from Source -To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command: +To build the images from source, clone the [AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command: ```bash cd pytorch diff --git a/tensorflow/docker-compose.yaml b/tensorflow/docker-compose.yaml index 2d7e84a0..18aec65a 100644 --- a/tensorflow/docker-compose.yaml +++ b/tensorflow/docker-compose.yaml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -version: '3' include: - path: - ../python/docker-compose.yaml @@ -31,7 +30,7 @@ services: PYTHON_VERSION: ${PYTHON_VERSION:-3.10} REGISTRY: ${REGISTRY} REPO: ${REPO} - TF_VERSION: ${TF_VERSION:-2.15.0} + TF_VERSION: ${TF_VERSION:-2.15.1} target: tf-base-${PACKAGE_OPTION:-pip} context: . 
labels: @@ -41,20 +40,20 @@ services: org.opencontainers.base.name: "intel/python:3.10-core" org.opencontainers.image.name: "intel/intel-optimized-tensorflow" org.opencontainers.image.title: "Intel® Extension for TensorFlow Base Image" - org.opencontainers.image.version: ${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-base + org.opencontainers.image.version: ${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-base depends_on: - ${PACKAGE_OPTION:-pip} command: > python -c 'import tensorflow as tf; print("Tensorflow Version:", tf.__version__)' - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-base + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-base pull_policy: always jupyter: build: labels: dependency.python.pip: jupyter-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for TensorFlow Jupyter Image" - org.opencontainers.image.version: ${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-jupyter + org.opencontainers.image.version: ${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-jupyter target: jupyter command: > bash -c "python -m jupyter --version" @@ -62,32 +61,38 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} extends: tf-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-jupyter + image: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-jupyter network_mode: host volumes: - /$PWD:/jupyter multinode: build: labels: + dependency.apt.build-essential: true + dependency.apt.cmake: true dependency.apt.gcc: true + dependency.apt.g++: true + dependency.apt.git: true dependency.apt.libgl1-mesa-glx: true dependency.apt.libglib2: true - dependency.apt.python3-dev: true - dependency.pip.apt.virtualenv: true dependency.apt.libopenmpi-dev: true + dependency.apt.numactl: true dependency.apt.openmpi-bin: true - dependency.apt.unzip: true dependency.apt.openssh-client: true dependency.apt.openssh-server: true - dependency.python.pip: multinode-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-base" + dependency.apt.python3-dev: true + dependency.apt.unzip: true + dependency.pip.apt.virtualenv: true + dependency.pip.horovod: 0.28.1 + dependency.python.pip: multinode/requirements.txt + org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for TensorFlow MultiNode Image" - org.opencontainers.image.version: ${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-multinode - target: multinode-${PACKAGE_OPTION:-pip} + org.opencontainers.image.version: ${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-multinode + target: multinode command: > bash -c "horovodrun --check-build && mpirun --version && python -c 'import horovod.tensorflow as hvd;hvd.init();import horovod.tensorflow;import neural_compressor, tf2onnx; print(\"\\nNeural Compressor Version:\", neural_compressor.__version__, \"\\\nTensorFlow2ONNX Version:\", tf2onnx.__version__)'" extends: tf-base - image: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} xpu: build: args: @@ -95,10 +100,10 @@ services: LEVEL_ZERO_GPU_VER: ${LEVEL_ZERO_GPU_VER:-1.3.27642.40-803~22.04} LEVEL_ZERO_VER: ${LEVEL_ZERO_VER:-1.14.0-744~22.04} LEVEL_ZERO_DEV_VER: ${LEVEL_ZERO_DEV_VER:-1.14.0-744~22.04} - DPCPP_VER: ${DPCPP_VER:-2024.1.0-963} - MKL_VER: ${MKL_VER:-2024.1.0-691} - CCL_VER: ${CCL_VER:-2021.12.0-309} - TF_VER: ${TF_VER:-2.15.0} + DPCPP_VER: ${DPCPP_VER:-2024.2.1-1079} + MKL_VER: ${MKL_VER:-2024.2.1-103} + CCL_VER: ${CCL_VER:-2021.13.1-31} + TF_VER: ${TF_VER:-2.15.1} no_proxy: '' NO_PROXY: '' labels: @@ -108,9 +113,9 @@ services: dependency.apt.gnupg2: true dependency.apt.gpg-agent: true dependency.apt.intel-level-zero-gpu: ${LEVEL_ZERO_GPU_VER:-1.3.27642.40-803~22.04} - dependency.apt.intel-oneapi-runtime-ccl: ${CCL_VER:-2021.12.0-309} - dependency.apt.intel-oneapi-runtime-dpcpp-cpp: ${DPCPP_VER:-2024.1.0-963} - dependency.apt.intel-oneapi-runtime-mkl: ${MKL_VER:-2024.1.0-691} + dependency.apt.intel-oneapi-runtime-ccl: ${CCL_VER:-2021.13.1-31} + dependency.apt.intel-oneapi-runtime-dpcpp-cpp: ${DPCPP_VER:-2024.2.1-1079} + dependency.apt.intel-oneapi-runtime-mkl: ${MKL_VER:-2024.2.1-103} dependency.apt.intel-opencl-icd: ${ICD_VER:-23.43.27642.40-803~22.04} dependency.apt.level-zero: ${LEVEL_ZERO_VER:-1.14.0-744~22.04} dependency.apt.level-zero-dev: ${LEVEL_ZERO_DEV_VER:-1.14.0-744~22.04} @@ -120,11 +125,11 @@ services: org.opencontainers.base.name: "intel/python:3.10-core" org.opencontainers.image.title: "Intel® Extension for TensorFlow XPU Base Image" 
org.opencontainers.image.version: ${TF_VER:-2.15.0}-xpu-${PACKAGE_OPTION:-pip}-base - target: itex-xpu-base-${PACKAGE_OPTION:-pip} + target: itex-xpu-base command: > sh -c "python -c 'import tensorflow as tf;print(tf.__version__);from tensorflow.python.client import device_lib;print(device_lib.list_local_devices())'" extends: tf-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.0}-itex-xpu-base + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.1}-itex-xpu-base xpu-jupyter: build: args: @@ -132,19 +137,19 @@ services: LEVEL_ZERO_GPU_VER: ${LEVEL_ZERO_GPU_VER:-1.3.27642.40-803~22.04} LEVEL_ZERO_VER: ${LEVEL_ZERO_VER:-1.14.0-744~22.04} LEVEL_ZERO_DEV_VER: ${LEVEL_ZERO_DEV_VER:-1.14.0-744~22.04} - DPCPP_VER: ${DPCPP_VER:-2024.1.0-963} - MKL_VER: ${MKL_VER:-2024.1.0-691} - CCL_VER: ${CCL_VER:-2021.12.0-309} - TF_VER: ${TF_VER:-2.15.0} + DPCPP_VER: ${DPCPP_VER:-2024.2.1-1079} + MKL_VER: ${MKL_VER:-2024.2.1-103} + CCL_VER: ${CCL_VER:-2021.13.1-31} + ITEX_VER: ${ITEX_VER:-2.15.0.1} no_proxy: '' NO_PROXY: '' labels: dependency.python.pip: jupyter-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.0}-xpu-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.1}-xpu-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for TensorFlow XPU Jupyter Image" org.opencontainers.image.version: ${TF_VER:-2.15.0}-xpu-${PACKAGE_OPTION:-pip}-jupyter target: itex-xpu-jupyter extends: tf-base command: > bash -c "python -m jupyter --version" - image: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.0}-itex-xpu-jupyter + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.1}-itex-xpu-jupyter diff --git a/tensorflow/hvd-requirements.txt b/tensorflow/hvd-requirements.txt deleted file mode 100644 index f2eadcce..00000000 --- a/tensorflow/hvd-requirements.txt +++ /dev/null @@ -1 +0,0 @@ -horovod==0.28.1 diff --git a/tensorflow/jupyter-requirements.txt b/tensorflow/jupyter-requirements.txt index 23a73885..9bdbed92 100644 --- a/tensorflow/jupyter-requirements.txt +++ b/tensorflow/jupyter-requirements.txt @@ -1,4 +1,4 @@ -jupyterlab==4.3.0a0 +jupyterlab>=4.2.4 jupyterhub==5.1.0 -notebook==7.3.0a0 +notebook>=7.1.3 jupyter-server-proxy>=4.1.2 diff --git a/tensorflow/multinode-requirements.txt b/tensorflow/multinode-requirements.txt deleted file mode 100644 index d9cff369..00000000 --- a/tensorflow/multinode-requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -cython -tf2onnx -neural-compressor==2.6 diff --git a/tensorflow/multinode/dockerd-entrypoint.sh b/tensorflow/multinode/dockerd-entrypoint.sh new file mode 100755 index 00000000..ba13c0f9 --- /dev/null +++ b/tensorflow/multinode/dockerd-entrypoint.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +set -a +# shellcheck disable=SC1091 +source "$HOME/.startup" +set +a +"$@" diff --git a/tensorflow/multinode/generate_ssh_keys.sh b/tensorflow/multinode/generate_ssh_keys.sh new file mode 100755 index 00000000..0ee61398 --- /dev/null +++ b/tensorflow/multinode/generate_ssh_keys.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +function gen_single_key() { + ALG_NAME=$1 + if [[ ! 
-f /etc/ssh/ssh_host_${ALG_NAME}_key ]]; then + ssh-keygen -q -N "" -t "${ALG_NAME}" -f "/etc/ssh/ssh_host_${ALG_NAME}_key" + fi +} + +gen_single_key dsa +gen_single_key rsa +gen_single_key ecdsa +gen_single_key ed25519 diff --git a/tensorflow/multinode/requirements.txt b/tensorflow/multinode/requirements.txt new file mode 100644 index 00000000..80747740 --- /dev/null +++ b/tensorflow/multinode/requirements.txt @@ -0,0 +1,5 @@ +cython>=3.0.11 +impi-rt>=2021.12.0 +mpi4py>=3.1.0 +neural-compressor==3.0 +tf2onnx>=1.16.1 diff --git a/tensorflow/multinode/ssh_config b/tensorflow/multinode/ssh_config new file mode 100644 index 00000000..9ac73017 --- /dev/null +++ b/tensorflow/multinode/ssh_config @@ -0,0 +1,4 @@ +Host * + Port 3022 + IdentityFile ~/.ssh/id_rsa + StrictHostKeyChecking no diff --git a/tensorflow/multinode/sshd_config b/tensorflow/multinode/sshd_config new file mode 100644 index 00000000..4796a48a --- /dev/null +++ b/tensorflow/multinode/sshd_config @@ -0,0 +1,12 @@ +HostKey /etc/ssh/ssh_host_dsa_key +HostKey /etc/ssh/ssh_host_rsa_key +HostKey /etc/ssh/ssh_host_ecdsa_key +HostKey /etc/ssh/ssh_host_ed25519_key +AuthorizedKeysFile /etc/ssh/authorized_keys +## Enable DEBUG log. 
You can ignore this but this may help you debug any issue while enabling SSHD for the first time +LogLevel DEBUG3 +Port 3022 +UsePAM yes +Subsystem sftp /usr/lib/openssh/sftp-server +# https://ubuntu.com/security/CVE-2024-6387 +LoginGraceTime 0 diff --git a/tensorflow/ompi-requirements.txt b/tensorflow/ompi-requirements.txt deleted file mode 100644 index 7b64c166..00000000 --- a/tensorflow/ompi-requirements.txt +++ /dev/null @@ -1 +0,0 @@ -impi-rt>=2021.12.0 diff --git a/tensorflow/requirements.txt b/tensorflow/requirements.txt index 9b50ec78..92fd1059 100644 --- a/tensorflow/requirements.txt +++ b/tensorflow/requirements.txt @@ -1,4 +1,4 @@ -tensorflow==2.15.0 -intel-extension-for-tensorflow[cpu]==2.15.0.0 +tensorflow==2.15.1 +intel-extension-for-tensorflow[cpu]>=2.15,<2.16 tensorflow-hub==0.16.1 -pillow==10.3.0 +pillow==10.4.0 diff --git a/tensorflow/serving/requirements.txt b/tensorflow/serving/requirements.txt index cf28053c..1f2e56e4 100644 --- a/tensorflow/serving/requirements.txt +++ b/tensorflow/serving/requirements.txt @@ -1,5 +1,5 @@ -numpy==2.0.0 -pillow==10.3.0 +numpy==2.1.0 +pillow==10.4.0 requests==2.32.3 -tensorflow==2.16.1 -tensorflow-serving-api==2.16.1 +tensorflow==2.17.0 +tensorflow-serving-api==2.17.0 diff --git a/tensorflow/tests/tests.yaml b/tensorflow/tests/tests.yaml index 0d45d9e8..43af2239 100644 --- a/tensorflow/tests/tests.yaml +++ b/tensorflow/tests/tests.yaml @@ -14,50 +14,54 @@ --- import-itex-cpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-base cmd: python -c "from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())" import-itex-xpu-${PACKAGE_OPTION:-pip}: - img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.0}-itex-xpu-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.1}-itex-xpu-base cmd: python /tests/xpu_import_test.py + device: ["/dev/dri"] volumes: - src: ${PWD}/tensorflow/tests dst: /tests import-cpu-jupyter-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-jupyter + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-jupyter cmd: python -m jupyter --version import-xpu-jupyter-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.0}-itex-xpu-jupyter + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.1}-itex-xpu-jupyter cmd: python -m jupyter --version + device: ["/dev/dri"] import-multinode-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: horovodrun --check-build && mpirun --version && 
python -c 'import horovod.tensorflow as hvd;hvd.init();import horovod.tensorflow' import-inc-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: python -c "import neural_compressor as inc;print(inc.__version__)" itex-cpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-base cmd: python /tests/tf_base_test.py volumes: - src: ${PWD}/tensorflow/tests dst: /tests itex-xpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.0}-itex-xpu-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.1}-itex-xpu-base cmd: python /tests/tf_base_test.py + device: ["/dev/dri"] volumes: - dst: /tests src: $PWD/tensorflow/tests itex-xpu-jupyter-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.0}-itex-xpu-jupyter + img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.1}-itex-xpu-jupyter cmd: papermill --log-output /jupyter/xpu.ipynb -k python3 - notebook: True + device: ["/dev/dri"] multinode-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: horovodrun -np 2 -H localhost:2 --binding-args="-bind-to socket -map-by socket" python /tests/tf_base_test.py volumes: - dst: /tests src: $PWD/tensorflow/tests inc-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: bash /tests/inc_test.sh volumes: - dst: /tests diff --git a/tensorflow/xpu-requirements.txt b/tensorflow/xpu-requirements.txt index c7099048..a338e80f 100644 --- a/tensorflow/xpu-requirements.txt +++ b/tensorflow/xpu-requirements.txt @@ -1,2 +1,6 @@ tensorflow==2.15.0 -intel-extension-for-tensorflow[xpu]==2.15.0.0 +intel-extension-for-tensorflow[xpu]==2.15.0.1 +idna>=3.7 +requests>=2.32.0 +tqdm>=4.66.3 +urllib3>=2.2.2 diff --git a/test-runner/dev-requirements.txt b/test-runner/dev-requirements.txt index 2f4f8fbf..e409fb0f 100644 --- 
a/test-runner/dev-requirements.txt +++ b/test-runner/dev-requirements.txt @@ -3,7 +3,7 @@ coverage>=7.5.0 coveralls>=4.0.1 expandvars>=0.12.0 hypothesis>=6.100.1 -pydantic==2.8.2 +pydantic==2.9.1 pylint>=3.1.0 pytest>=8.1.1 python_on_whales>=0.70.1 diff --git a/test-runner/requirements.txt b/test-runner/requirements.txt index 79752623..ee95cc0a 100644 --- a/test-runner/requirements.txt +++ b/test-runner/requirements.txt @@ -1,5 +1,5 @@ expandvars>=0.12.0 -pydantic==2.8.2 +pydantic==2.9.1 python_on_whales>=0.70.1 pyyaml>=6.0.1 tabulate>=0.9.0 diff --git a/workflows/README.md b/workflows/README.md index 21269eb7..1f6c9ea6 100644 --- a/workflows/README.md +++ b/workflows/README.md @@ -1,6 +1,6 @@ # Intel® AI Workflows -Demonstrating showing how the [Intel® AI Containers] can be used for different use cases: +Demonstrating showing how the [AI Containers] can be used for different use cases: ## PyTorch Workflows @@ -11,7 +11,7 @@ Demonstrating showing how the [Intel® AI Containers] can be used for different ## Build from Source -To build the images from source, clone the [Intel® AI Containers] repository, follow the main `README.md` file to setup your environment, and run the following command: +To build the images from source, clone the [AI Containers] repository, follow the main `README.md` file to setup your environment, and run the following command: ```bash cd workflows/charts/huggingface-llm @@ -21,7 +21,7 @@ docker compose run huggingface-llm sh -c "python /workspace/scripts/finetune.py ## License -View the [License](https://github.com/intel/ai-containers/blob/main/LICENSE) for the [Intel® AI Containers]. +View the [License](https://github.com/intel/ai-containers/blob/main/LICENSE) for the [AI Containers]. The images below also contain other software which may be under other licenses (such as Pytorch*, Jupyter*, Bash, etc. from the base). 
@@ -31,6 +31,6 @@ It is the image user's responsibility to ensure that any use of The images below -[Intel® AI Containers]: https://github.com/intel/ai-containers +[AI Containers]: https://github.com/intel/ai-containers [Distributed LLM Fine Tuning with Kubernetes]: https://github.com/intel/ai-containers/tree/main/workflows/charts/huggingface-llm [TorchServe* with Kubernetes]: https://github.com/intel/ai-containers/tree/main/workflows/charts/torchserve diff --git a/workflows/charts/huggingface-llm/README.md b/workflows/charts/huggingface-llm/README.md index e2439830..47755eef 100644 --- a/workflows/charts/huggingface-llm/README.md +++ b/workflows/charts/huggingface-llm/README.md @@ -347,4 +347,4 @@ fine tune the model. ``` ---------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1) +Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) diff --git a/workflows/charts/tensorflow-serving/.helmignore b/workflows/charts/tensorflow-serving/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/workflows/charts/tensorflow-serving/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/workflows/charts/tensorflow-serving/Chart.yaml b/workflows/charts/tensorflow-serving/Chart.yaml new file mode 100644 index 00000000..e6a61952 --- /dev/null +++ b/workflows/charts/tensorflow-serving/Chart.yaml @@ -0,0 +1,42 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: tensorflow-serving-on-intel +description: TensorFlow Serving is a flexible, high-performance serving system for machine learning models, designed for production environments. TensorFlow Serving makes it easy to deploy new algorithms and experiments, while keeping the same server architecture and APIs. TensorFlow Serving provides out-of-the-box integration with TensorFlow models, but can be easily extended to serve other types of models and data. + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. 
+maintainers: + - name: tylertitsworth + email: tyler.titsworth@intel.com + url: https://github.com/tylertitsworth +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/workflows/charts/tensorflow-serving/README.md b/workflows/charts/tensorflow-serving/README.md new file mode 100644 index 00000000..bfbb2900 --- /dev/null +++ b/workflows/charts/tensorflow-serving/README.md @@ -0,0 +1,31 @@ +# tensorflow-serving-on-intel + +![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.16.0](https://img.shields.io/badge/AppVersion-1.16.0-informational?style=flat-square) + +TensorFlow Serving is a flexible, high-performance serving system for machine learning models, designed for production environments. TensorFlow Serving makes it easy to deploy new algorithms and experiments, while keeping the same server architecture and APIs. TensorFlow Serving provides out-of-the-box integration with TensorFlow models, but can be easily extended to serve other types of models and data. 
+ +## Maintainers + +| Name | Email | Url | +| ---- | ------ | --- | +| tylertitsworth | | | + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| deploy.env | object | `{"configMapName":"intel-proxy-config","enabled":true}` | Add Environment mapping | +| deploy.image | string | `"intel/intel-extension-for-tensorflow:serving-gpu"` | Intel Extension for Tensorflow Serving image | +| deploy.modelName | string | `""` | Model Name | +| deploy.replicas | int | `1` | Number of pods | +| deploy.resources.limits | object | `{"cpu":"4000m","gpu.intel.com/i915":1,"memory":"1Gi"}` | Maximum resources per pod | +| deploy.resources.limits."gpu.intel.com/i915" | int | `1` | Intel GPU Device Configuration | +| deploy.resources.requests | object | `{"cpu":"1000m","memory":"512Mi"}` | Minimum resources per pod | +| deploy.storage.nfs | object | `{"enabled":false,"path":"nil","readOnly":true,"server":"nil"}` | Network File System (NFS) storage for models | +| fullnameOverride | string | `""` | Full qualified Domain Name | +| nameOverride | string | `""` | Name of the serving service | +| pvc.size | string | `"5Gi"` | Size of the storage | +| service.type | string | `"NodePort"` | Type of service | + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) diff --git a/workflows/charts/tensorflow-serving/templates/NOTES.txt b/workflows/charts/tensorflow-serving/templates/NOTES.txt new file mode 100644 index 00000000..fb69969c --- /dev/null +++ b/workflows/charts/tensorflow-serving/templates/NOTES.txt @@ -0,0 +1,19 @@ +1. Get the application URL by running these commands: +{{- if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "tensorflow-serving.fullname" . 
}}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "tensorflow-serving.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "tensorflow-serving.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "tensorflow-serving.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} +2. Make a prediction + curl http://$NODE_IP:$NODE_PORT/v1/models/{{ .Values.deploy.modelName }} + curl -X POST http://$NODE_IP:$NODE_PORT/v1/models/{{ .Values.deploy.modelName }}:predict -d '{"data": []}' diff --git a/workflows/charts/tensorflow-serving/templates/_helpers.tpl b/workflows/charts/tensorflow-serving/templates/_helpers.tpl new file mode 100644 index 00000000..2afbfd70 --- /dev/null +++ b/workflows/charts/tensorflow-serving/templates/_helpers.tpl @@ -0,0 +1,51 @@ +{{/* +Expand the name of the chart. 
+*/}} +{{- define "tensorflow-serving.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "tensorflow-serving.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "tensorflow-serving.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "tensorflow-serving.labels" -}} +helm.sh/chart: {{ include "tensorflow-serving.chart" . }} +{{ include "tensorflow-serving.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "tensorflow-serving.selectorLabels" -}} +app.kubernetes.io/name: {{ include "tensorflow-serving.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} diff --git a/workflows/charts/tensorflow-serving/templates/deployment.yaml b/workflows/charts/tensorflow-serving/templates/deployment.yaml new file mode 100644 index 00000000..e6a1fcf6 --- /dev/null +++ b/workflows/charts/tensorflow-serving/templates/deployment.yaml @@ -0,0 +1,84 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +{{- $name := .Values.deploy.modelName | required ".Values.deploy.modelName is required." -}} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "tensorflow-serving.fullname" . }} + labels: + {{- include "tensorflow-serving.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.deploy.replicas }} + selector: + matchLabels: + {{- include "tensorflow-serving.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "tensorflow-serving.labels" . 
| nindent 8 }} + spec: + securityContext: + fsGroup: 1000 + runAsUser: 1000 + containers: + - name: tensorflow-serving + image: {{ .Values.deploy.image }} + {{- if eq .Values.deploy.env.enabled true }} + envFrom: + - configMapRef: + name: {{ .Values.deploy.env.configMapName }} + {{- end }} + env: + - name: MODEL_NAME + value: {{ .Values.deploy.modelName }} + ports: + - name: rest + containerPort: 8500 + protocol: TCP + - name: grpc + containerPort: 8501 + protocol: TCP + readinessProbe: + tcpSocket: + port: rest + initialDelaySeconds: 15 + timeoutSeconds: 1 + volumeMounts: + - mountPath: /dev/shm + name: dshm + {{- if .Values.deploy.storage.nfs.enabled }} + - name: model + mountPath: /models/{{ .Values.deploy.modelName }} + {{- else }} + - name: model + mountPath: /models/{{ .Values.deploy.modelName }} + {{- end }} + resources: + {{- toYaml .Values.deploy.resources | nindent 12 }} + volumes: + - name: dshm + emptyDir: + medium: Memory + {{- if .Values.deploy.storage.nfs.enabled }} + - name: model + nfs: + server: {{ .Values.deploy.storage.nfs.server }} + path: {{ .Values.deploy.storage.nfs.path }} + readOnly: {{ .Values.deploy.storage.nfs.readOnly }} + {{- else }} + - name: model + persistentVolumeClaim: + claimName: {{ include "tensorflow-serving.fullname" . }}-model-dir + {{- end }} diff --git a/workflows/charts/tensorflow-serving/templates/pvc.yaml b/workflows/charts/tensorflow-serving/templates/pvc.yaml new file mode 100644 index 00000000..2cf9040d --- /dev/null +++ b/workflows/charts/tensorflow-serving/templates/pvc.yaml @@ -0,0 +1,29 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +{{- if not .Values.deploy.storage.nfs.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "tensorflow-serving.fullname" . }}-model-dir + labels: + {{- include "tensorflow-serving.labels" . | nindent 4 }} +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: {{ .Values.pvc.size }} +{{- end }} diff --git a/workflows/charts/tensorflow-serving/templates/service.yaml b/workflows/charts/tensorflow-serving/templates/service.yaml new file mode 100644 index 00000000..2eab7890 --- /dev/null +++ b/workflows/charts/tensorflow-serving/templates/service.yaml @@ -0,0 +1,31 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "tensorflow-serving.fullname" . }} + labels: + {{- include "tensorflow-serving.labels" . 
| nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - name: rest + port: 8500 + targetPort: rest + - name: grpc + port: 8501 + targetPort: grpc + selector: + {{- include "tensorflow-serving.selectorLabels" . | nindent 4 }} diff --git a/workflows/charts/tensorflow-serving/templates/tests/test-connection.yaml b/workflows/charts/tensorflow-serving/templates/tests/test-connection.yaml new file mode 100644 index 00000000..0fe61c9a --- /dev/null +++ b/workflows/charts/tensorflow-serving/templates/tests/test-connection.yaml @@ -0,0 +1,29 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "tensorflow-serving.fullname" . }}-test-connection" + labels: + {{- include "tensorflow-serving.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: info + image: curlimages/curl + command: ['sh', '-c'] + args: ['curl -f {{ include "tensorflow-serving.fullname" . 
}}:8501/v1/models/{{ .Values.deploy.modelName}}'] + restartPolicy: OnFailure diff --git a/workflows/charts/tensorflow-serving/values.yaml b/workflows/charts/tensorflow-serving/values.yaml new file mode 100644 index 00000000..39ed23af --- /dev/null +++ b/workflows/charts/tensorflow-serving/values.yaml @@ -0,0 +1,53 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -- Name of the serving service +nameOverride: "" +# -- Full qualified Domain Name +fullnameOverride: "" +deploy: + # -- Intel Extension for Tensorflow Serving image + image: intel/intel-extension-for-tensorflow:serving-gpu + # -- Add Environment mapping + env: + configMapName: intel-proxy-config + enabled: true + # -- Model Name + modelName: "" + # -- Number of pods + replicas: 1 + resources: + # -- Maximum resources per pod + limits: + cpu: 4000m + memory: 1Gi + # -- Intel GPU Device Configuration + gpu.intel.com/i915: 1 + # -- Minimum resources per pod + requests: + cpu: 1000m + memory: 512Mi + storage: + # -- Network File System (NFS) storage for models + nfs: + enabled: false + server: nil + path: nil + readOnly: true +service: + # -- Type of service + type: NodePort +pvc: + # -- Size of the storage + size: 5Gi diff --git a/workflows/charts/tgi/.helmignore b/workflows/charts/tgi/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/workflows/charts/tgi/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. 
+# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/workflows/charts/tgi/Chart.yaml b/workflows/charts/tgi/Chart.yaml new file mode 100644 index 00000000..761d8b0c --- /dev/null +++ b/workflows/charts/tgi/Chart.yaml @@ -0,0 +1,42 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: tgi-on-intel +description: A Rust, Python and gRPC server for text generation inference by huggingface on Intel GPUs. + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +maintainers: + - name: tylertitsworth + email: tyler.titsworth@intel.com + url: https://github.com/tylertitsworth +type: application + +# This is the chart version. 
This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/workflows/charts/tgi/README.md b/workflows/charts/tgi/README.md new file mode 100644 index 00000000..7c020fe1 --- /dev/null +++ b/workflows/charts/tgi/README.md @@ -0,0 +1,30 @@ +# Text Generation Inference on Intel GPU + +A Rust, Python and gRPC server for text generation inference by huggingface on Intel GPUs. + +For more information about how to use Huggingface text-generation-inference with Intel optimizations, check out [huggingface's documentation](https://huggingface.co/docs/text-generation-inference/installation_intel). + +> [!TIP] +> For Gaudi-related documentation, check out [tgi-gaudi](https://github.com/huggingface/tgi-gaudi). 
+ +![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.16.0](https://img.shields.io/badge/AppVersion-1.16.0-informational?style=flat-square) + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| deploy.configMapName | string | `"intel-proxy-config"` | ConfigMap of Environment Variables | +| deploy.image | string | `"ghcr.io/huggingface/text-generation-inference:latest-intel"` | Intel TGI Image | +| deploy.model | string | `"HuggingFaceTB/SmolLM-135M"` | Model to be loaded | +| deploy.quantize | string | `""` | Enable Quantization (ex: bitsandbytes-nf4) | +| deploy.replicaCount | int | `1` | Number of pods | +| deploy.resources | object | `{"limits":{"cpu":"4000m","gpu.intel.com/i915":1},"requests":{"cpu":"1000m","memory":"1Gi"}}` | Resource configuration | +| deploy.resources.limits."gpu.intel.com/i915" | int | `1` | Intel GPU Device Configuration | +| fullnameOverride | string | `""` | Full qualified Domain Name | +| ingress | object | `{"annotations":{},"className":"","enabled":false,"hosts":[{"host":"chart-example.local","paths":[{"path":"/","pathType":"ImplementationSpecific"}]}],"tls":[]}` | Ingress configuration | +| nameOverride | string | `""` | Name of the serving service | +| secret.encodedToken | string | `""` | Base64 Encoded Huggingface Hub API Token | +| service | object | `{"port":80,"type":"NodePort"}` | Service configuration | + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) diff --git a/workflows/charts/tgi/README.md.gotmpl b/workflows/charts/tgi/README.md.gotmpl new file mode 100644 index 00000000..0d773d1a --- /dev/null +++ b/workflows/charts/tgi/README.md.gotmpl @@ -0,0 +1,16 @@ +# Text Generation Inference on Intel GPU + +{{ 
template "chart.description" . }} + +For more information about how to use Huggingface text-generation-inference with Intel optimizations, check out [huggingface's documentation](https://huggingface.co/docs/text-generation-inference/installation_intel). + +> [!TIP] +> For Gaudi-related documentation, check out [tgi-gaudi](https://github.com/huggingface/tgi-gaudi). + +{{ template "chart.versionBadge" . }}{{ template "chart.typeBadge" . }}{{ template "chart.appVersionBadge" . }} + +{{ template "chart.requirementsSection" . }} + +{{ template "chart.valuesSection" . }} + +{{ template "helm-docs.versionFooter" . }} diff --git a/workflows/charts/tgi/templates/NOTES.txt b/workflows/charts/tgi/templates/NOTES.txt new file mode 100644 index 00000000..fc906eb6 --- /dev/null +++ b/workflows/charts/tgi/templates/NOTES.txt @@ -0,0 +1,22 @@ +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "tgi.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "tgi.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "tgi.fullname" . 
}} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "tgi.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/workflows/charts/tgi/templates/_helpers.tpl b/workflows/charts/tgi/templates/_helpers.tpl new file mode 100644 index 00000000..b98dd8cb --- /dev/null +++ b/workflows/charts/tgi/templates/_helpers.tpl @@ -0,0 +1,76 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{/* +Expand the name of the chart. +*/}} +{{- define "tgi.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "tgi.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "tgi.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "tgi.labels" -}} +helm.sh/chart: {{ include "tgi.chart" . }} +{{ include "tgi.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "tgi.selectorLabels" -}} +app.kubernetes.io/name: {{ include "tgi.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "tgi.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "tgi.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/workflows/charts/tgi/templates/deploy.yaml b/workflows/charts/tgi/templates/deploy.yaml new file mode 100644 index 00000000..6c5a5bd5 --- /dev/null +++ b/workflows/charts/tgi/templates/deploy.yaml @@ -0,0 +1,81 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "tgi.fullname" . }} + labels: + {{- include "tgi.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.deploy.replicaCount }} + selector: + matchLabels: + {{- include "tgi.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "tgi.selectorLabels" . | nindent 8 }} + spec: + securityContext: + fsGroup: 1000 + runAsUser: 1000 + containers: + - name: {{ .Chart.Name }} + args: + - '--model-id' + - {{ .Values.deploy.model | quote }} + {{- if index .Values.deploy.resources.limits "gpu.intel.com/i915" }} + - '--num-shard' + - {{ index .Values.deploy.resources.limits "gpu.intel.com/i915" | quote }} + {{- end }} + - '-p' + - {{ .Values.service.port | quote }} + {{- if .Values.deploy.quantize }} + - '--quantize' + - {{ .Values.deploy.quantize | quote }} + {{- end }} + - '--cuda-graphs=0' + envFrom: + - configMapRef: + name: {{ .Values.deploy.configMapName }} + - secretRef: + name: {{ .Release.Name }}-hf-token + env: + - name: NUMBA_CACHE_DIR # https://github.com/huggingface/text-generation-inference/pull/2443 + value: /data/numba_cache + image: {{ .Values.deploy.image }} + livenessProbe: + httpGet: + path: /health + port: {{ .Values.service.port }} + initialDelaySeconds: 5 + periodSeconds: 5 + ports: + - name: http + containerPort: {{ .Values.service.port }} + protocol: TCP + resources: + {{- toYaml .Values.deploy.resources | nindent 12 }} + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /data + name: hf-data + volumes: + - name: dshm + emptyDir: + medium: 
Memory + - name: hf-data + emptyDir: {} diff --git a/workflows/charts/tgi/templates/ingress.yaml b/workflows/charts/tgi/templates/ingress.yaml new file mode 100644 index 00000000..f87f6cb0 --- /dev/null +++ b/workflows/charts/tgi/templates/ingress.yaml @@ -0,0 +1,76 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "tgi.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "tgi.labels" . | nindent 4 }} + annotations: + kubernetes.io/ingress.allow-http: "false" + {{- with .Values.ingress.annotations }} + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/workflows/charts/tgi/templates/secret.yaml b/workflows/charts/tgi/templates/secret.yaml new file mode 100644 index 00000000..0507543e --- /dev/null +++ b/workflows/charts/tgi/templates/secret.yaml @@ -0,0 +1,22 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- $name := .Values.secret.encodedToken | required ".Values.secret.encodedToken is required in Base64 Format." 
-}} +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Release.Name }}-hf-token +type: Opaque +data: + HF_TOKEN: {{ $name }} diff --git a/workflows/charts/tgi/templates/service.yaml b/workflows/charts/tgi/templates/service.yaml new file mode 100644 index 00000000..7aff68e5 --- /dev/null +++ b/workflows/charts/tgi/templates/service.yaml @@ -0,0 +1,29 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "tgi.fullname" . }}-test-connection" + labels: + {{- include "tgi.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: info + image: curlimages/curl + command: ['sh', '-c'] + args: ['curl -f {{ include "tgi.fullname" . }}:{{ .Values.service.port }}/info'] + restartPolicy: OnFailure diff --git a/workflows/charts/tgi/values.yaml b/workflows/charts/tgi/values.yaml new file mode 100644 index 00000000..7d2434cc --- /dev/null +++ b/workflows/charts/tgi/values.yaml @@ -0,0 +1,64 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# -- Name of the serving service +nameOverride: "" +# -- Full qualified Domain Name +fullnameOverride: "" +deploy: + # -- ConfigMap of Environment Variables + configMapName: intel-proxy-config + # -- Intel TGI Image + image: ghcr.io/huggingface/text-generation-inference:latest-intel + # -- Model to be loaded + model: HuggingFaceTB/SmolLM-135M + # -- Enable Quantization (ex: bitsandbytes-nf4) + quantize: "" + # -- Number of pods + replicaCount: 1 + # -- Resource configuration + resources: + limits: + cpu: 4000m + # -- Intel GPU Device Configuration + gpu.intel.com/i915: 1 + # habana.ai/gaudi: 1 + # memory: 409Gi + # hugepages-2Mi: 95000Mi + requests: + cpu: 1000m + memory: "1Gi" +secret: + # -- Base64 Encoded Huggingface Hub API Token + encodedToken: "" +# -- Service configuration +service: + port: 80 + type: NodePort +# -- Ingress configuration +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local diff --git a/workflows/charts/torchserve/Chart.yaml b/workflows/charts/torchserve/Chart.yaml index f3472530..99db0496 100644 --- a/workflows/charts/torchserve/Chart.yaml +++ b/workflows/charts/torchserve/Chart.yaml @@ -13,8 +13,8 @@ # limitations under the License. apiVersion: v2 -name: intel-torchserve -description: Intel TorchServe is a performant, flexible and easy to use tool for serving PyTorch models in production. +name: torchserve-on-intel +description: TorchServe on Intel is a performant, flexible and easy to use tool for serving PyTorch models in production. # A chart can be either an 'application' or a 'library' chart. 
# diff --git a/workflows/charts/torchserve/README.md b/workflows/charts/torchserve/README.md index b84a964c..c1a717f5 100644 --- a/workflows/charts/torchserve/README.md +++ b/workflows/charts/torchserve/README.md @@ -1,8 +1,8 @@ -# Intel TorchServe +# TorchServe with Intel Optimizations -Intel TorchServe is a performant, flexible and easy to use tool for serving PyTorch models in production. +TorchServe on Intel is a performant, flexible and easy to use tool for serving PyTorch models in production. -For more information about how to use Intel Optimized TorchServe, check out the [container documentation](../../../pytorch/serving/README.md). +For more information about how to use TorchServe with Intel Optimizations, check out the [container documentation](../../../pytorch/serving/README.md). ![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.16.0](https://img.shields.io/badge/AppVersion-1.16.0-informational?style=flat-square) @@ -18,6 +18,7 @@ For more information about how to use Intel Optimized TorchServe, check out the | deploy.resources.limits | object | `{"cpu":"4000m","memory":"1Gi"}` | Maximum resources per pod | | deploy.resources.requests | object | `{"cpu":"1000m","memory":"512Mi"}` | Minimum resources per pod | | deploy.storage.nfs | object | `{"enabled":false,"path":"nil","readOnly":true,"server":"nil","subPath":"nil"}` | Network File System (NFS) storage for models | +| deploy.tokens_disabled | bool | `true` | Set token authentication on or off. Checkout the latest [torchserve docs](https://github.com/pytorch/serve/blob/master/docs/token_authorization_api.md) for more details. 
| | fullnameOverride | string | `""` | Full qualified Domain Name | | nameOverride | string | `""` | Name of the serving service | | pvc.size | string | `"1Gi"` | Size of the storage | @@ -37,4 +38,4 @@ There are some additional steps that can be taken to prepare your service for yo - Integrate an [SSL Certificate](https://pytorch.org/serve/configuration.html#enable-ssl) in your model config file to serve models securely. ---------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1) +Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) diff --git a/workflows/charts/torchserve/README.md.gotmpl b/workflows/charts/torchserve/README.md.gotmpl index 1ddf329d..465c03ae 100644 --- a/workflows/charts/torchserve/README.md.gotmpl +++ b/workflows/charts/torchserve/README.md.gotmpl @@ -1,8 +1,8 @@ -# Intel TorchServe +# TorchServe with Intel Optimizations {{ template "chart.description" . }} -For more information about how to use Intel Optimized TorchServe, check out the [container documentation](../../../pytorch/serving/README.md). +For more information about how to use TorchServe with Intel Optimizations, check out the [container documentation](../../../pytorch/serving/README.md). {{ template "chart.versionBadge" . }}{{ template "chart.typeBadge" . }}{{ template "chart.appVersionBadge" . }} diff --git a/workflows/charts/torchserve/templates/NOTES.txt b/workflows/charts/torchserve/templates/NOTES.txt index 8796b205..7cf61fc4 100644 --- a/workflows/charts/torchserve/templates/NOTES.txt +++ b/workflows/charts/torchserve/templates/NOTES.txt @@ -14,3 +14,8 @@ echo "Visit http://127.0.0.1:8080 to use your application" kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT {{- end }} +{{- if eq false .Values.deploy.tokens_disabled }} +2. Display the tokens for accessing the APIs. 
For more details about token authentication checkout: https://github.com/pytorch/serve/blob/master/docs/token_authorization_api.md + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "torchserve.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + kubectl exec --namespace {{ .Release.Namespace }} $POD_NAME -- cat /home/model-server/key_file.json +{{- end }} diff --git a/workflows/charts/torchserve/templates/deploy.yaml b/workflows/charts/torchserve/templates/deploy.yaml index 544a2fb1..85f03142 100644 --- a/workflows/charts/torchserve/templates/deploy.yaml +++ b/workflows/charts/torchserve/templates/deploy.yaml @@ -47,6 +47,9 @@ spec: - configMapRef: name: {{ .Values.deploy.env.configMapName }} {{- end }} + env: + - name: TS_DISABLE_TOKEN_AUTHORIZATION + value: "{{ .Values.deploy.tokens_disabled }}" ports: - name: rest-1 containerPort: 8080 diff --git a/workflows/charts/torchserve/values.yaml b/workflows/charts/torchserve/values.yaml index e95efb15..f59e1c40 100644 --- a/workflows/charts/torchserve/values.yaml +++ b/workflows/charts/torchserve/values.yaml @@ -23,6 +23,8 @@ deploy: env: configMapName: intel-proxy-config enabled: true + # -- Set token authentication on or off. Checkout the latest [torchserve docs](https://github.com/pytorch/serve/blob/master/docs/token_authorization_api.md) for more details. + tokens_disabled: true # -- Models to be loaded models: all # -- Model Server Configuration file location