-
Notifications
You must be signed in to change notification settings - Fork 4
130 lines (115 loc) · 5.95 KB
/
docker-build.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
name: "Docker Build & Deploy"
on:
pull_request:
push:
branches:
- "inference"
workflow_dispatch:
# Cancel outdated workflows if they are still running
concurrency:
group: docker-build-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
docker-build-rocm:
name: Build and Install FlexFlow in a Docker Container (ROCm backend)
runs-on: ubuntu-22.04
if: ${{ ( github.event_name != 'push' && github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' ) || github.ref_name != 'inference' }}
env:
FF_GPU_BACKEND: "hip_rocm"
hip_version: 5.6
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive
- name: Free additional space on runner
run: .github/workflows/helpers/free_space_on_runner.sh
- name: Build Docker container
run: FF_HIP_ARCH="gfx1100,gfx1036" ./docker/build.sh flexflow
- name: Check availability of flexflow modules in Python
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
docker-build-and-publish-rocm:
name: Build and Deploy FlexFlow Docker Containers (ROCm backend)
runs-on: "runs-on=${{ github.run_id }}/runner=rocm-builder"
if: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
strategy:
matrix:
hip_version: ["5.6"]
fail-fast: false
env:
FF_GPU_BACKEND: "hip_rocm"
hip_version: ${{ matrix.hip_version }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive
- name: Build Docker container
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
# to save time.
run: FF_HIP_ARCH=all ./docker/build.sh flexflow
- name: Check availability of flexflow modules in Python
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
- name: Publish Docker environment image (on push to inference)
env:
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
run: |
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
docker-build-cuda:
name: Build and Install FlexFlow in a Docker Container (CUDA backend)
runs-on: ubuntu-22.04
strategy:
matrix:
cuda_version: ["11.8", "12.0", "12.1", "12.2"]
fail-fast: false
env:
FF_GPU_BACKEND: "cuda"
cuda_version: ${{ matrix.cuda_version }}
steps:
- name: Checkout Git Repository
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
uses: actions/checkout@v3
with:
submodules: recursive
- name: Free additional space on runner
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
run: .github/workflows/helpers/free_space_on_runner.sh
- name: Build Docker container
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
build_needed: ${{ matrix.cuda_version == '12.0' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
# to save time.
if [[ $deploy_needed == "true" ]] ; then
export FF_CUDA_ARCH=all
./docker/build.sh flexflow
elif [[ $build_needed == "true" ]]; then
export FF_CUDA_ARCH=86
./docker/build.sh flexflow
fi
- name: Check availability of flexflow modules in Python
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
- name: Publish Docker environment image (on push to inference)
if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
run: |
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
notify-slack:
name: Notify Slack in case of failure
runs-on: ubuntu-22.04
needs: [docker-build-cuda, docker-build-and-publish-rocm]
if: ${{ failure() && github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
steps:
- name: Send Slack message
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
run: |
curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"flexflow-serve Docker images build failed! <https://github.com/flexflow/flexflow-serve/actions/runs/$GITHUB_RUN_ID|(See here).> :x: \"}" $SLACK_WEBHOOK