-
Notifications
You must be signed in to change notification settings - Fork 1
125 lines (105 loc) · 3.65 KB
/
ec2-pipeline.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
name: CML-EC2-Runner
on:
workflow_dispatch:
jobs:
launch-runner:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: iterative/setup-cml@v2
- name: Deploy runner on AWS EC2
env:
REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |
cml runner launch \
--cloud=aws \
--name=session-08 \
--cloud-region=ap-south-1 \
--cloud-type=g4dn.xlarge \
--cloud-hdd-size=64 \
--cloud-spot \
--single \
--labels=cml-gpu \
--idle-timeout=100
train-and-report:
runs-on: [self-hosted, cml-gpu]
needs: launch-runner
timeout-minutes: 20
# runs-on: ubuntu-latest
steps:
# - name: Set node environment
# run: |
# apt-get remove nodejs
# apt-get remove npm
# curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.1/install.sh | bash
# chmod +x ~/.nvm/nvm.sh
# ls -a ~
# nvm -v
# nvm install 20
# node -v
# npm -v
# - uses: actions/setup-node@v4
# with:
# node-version: 20
# - run: npm ci
# - run: npm test
# - uses: actions/checkout@v2
- name: Verify EC2 Instance
run: |
echo "Checking instance information..."
# Check if we're on EC2
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type
echo "Checking system resources..."
lscpu
free -h
df -h
nvidia-smi # This will show GPU if available
echo "Checking environment..."
env | grep AWS || true
hostname
whoami
pwd
- name: Authenticate with AWS ECR
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
- name: Nvidia check
run: |
nvidia-smi
- name: Install Docker
run: |
curl -fsSL https://get.docker.com -o get-docker.sh
sh get-docker.sh
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- name: Pull Docker image from ECR
run: |
docker pull ${{secrets.AWS_ECR_LOGIN_URI}}/${{ secrets.ECR_REPOSITORY_NAME }}:latest
ls -a
- name: Run DVC commands in container
run: |
docker run --rm --gpus=all \
-e AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }} \
-e AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }} \
-e AWS_DEFAULT_REGION=${{ secrets.AWS_REGION }} \
${{ secrets.AWS_ECR_LOGIN_URI }}/${{ secrets.ECR_REPOSITORY_NAME }}:latest \
/bin/bash -c "
dvc pull -r myremote && \
mkdir model_storage && \
dvc repro -f
"
# # Wait a moment to ensure the container has started
# sleep 5
# # Print logs from the container
# docker logs $CONTAINER_ID
# # Stop the container after retrieving logs
# docker stop $CONTAINER_ID
- name: Clean previous images and containers
run: |
docker system prune -f