# Skip to content

# CML-EC2-Runner

# CML-EC2-Runner #69

# Workflow file for this run

# Workflow identity and trigger. The only trigger is workflow_dispatch, so the
# pipeline runs when it is started manually.
name: CML-EC2-Runner
on: workflow_dispatch
jobs:
# Runs on a GitHub-hosted machine: sets up CML via iterative/setup-cml and
# calls `cml runner launch` to start an AWS EC2 runner (region ap-south-1,
# type g4dn.xlarge) that carries the label `cml-gpu`.
launch-runner:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v3
    - uses: iterative/setup-cml@v2
    - name: Deploy runner on AWS EC2
      # Credentials are handed to the CML CLI through the environment.
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
      # Folded scalar: the lines below are joined with single spaces, so the
      # shell receives one `cml runner launch ...` command line.
      run: >-
        cml runner launch
        --cloud=aws
        --name=session-08
        --cloud-region=ap-south-1
        --cloud-type=g4dn.xlarge
        --cloud-hdd-size=64
        --cloud-spot
        --single
        --labels=cml-gpu
        --idle-timeout=100
# Second job: starts only after launch-runner has finished and is routed to a
# self-hosted runner labelled `cml-gpu`. The job is cancelled after 20 minutes.
train-and-report:
  needs: launch-runner
  runs-on:
    - self-hosted
    - cml-gpu
  timeout-minutes: 20
  # Disabled alternative: run the job inside a PyTorch container instead.
  # container:
  #   image: docker://pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime
  #   options: --gpus all
  # runs-on: ubuntu-latest
  steps:
# - name: Set node environment
# run: |
# apt-get remove nodejs
# apt-get remove npm
# curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.1/install.sh | bash
# chmod +x ~/.nvm/nvm.sh
# ls -a ~
# nvm -v
# nvm install 20
# node -v
# npm -v
# - uses: actions/setup-node@v4
# with:
# node-version: 20
# - run: npm ci
# - run: npm test
# - uses: actions/checkout@v2
# Diagnostic step: prints the CUDA compiler version when nvcc is available.
- name: Display CUDA Version
  run: |
    printf '%s\n' "CUDA Version:"
    # Informational only: a missing or failing nvcc must not fail the job.
    nvcc --version || :
# Diagnostic step: prints the CUDNN_MAJOR line and the two lines after it from
# the cuDNN version header.
- name: Display cuDNN Version
  run: |
    printf '%s\n' "cuDNN Version:"
    # Informational only: a missing header or no match must not fail the job.
    cat /usr/local/cuda/include/cudnn_version.h | grep -A 2 CUDNN_MAJOR || :
# Prints instance, hardware and environment details for debugging, makes sure
# the AWS CLI is installed, and logs Docker in to the ECR registry.
- name: Verify EC2 Instance
  run: |
    echo "Checking instance information..."
    # Check if we're on EC2: request an IMDSv2 session token, then use it to
    # read the instance type from the metadata service.
    TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
    curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type
    echo "Checking system resources..."
    lscpu
    free -h
    df -h
    nvidia-smi # This will show GPU if available
    echo "Checking environment..."
    # grep exits non-zero when no AWS variable is set; that is not an error.
    env | grep AWS || true
    hostname
    whoami
    pwd
    # Install the AWS CLI if not already available
    if ! command -v aws &> /dev/null; then
      # sudo so the install also works when the runner user is not root.
      sudo apt-get update
      sudo apt-get install -y awscli
    fi
    # Log Docker in to ECR. Only `get-login-password` is used: the older
    # `aws ecr get-login` command does not exist in AWS CLI v2 and would
    # abort this step there.
    aws ecr get-login-password --region ap-south-1 | docker login --username AWS --password-stdin 306093656765.dkr.ecr.ap-south-1.amazonaws.com
# Writes the default AWS CLI profile (region + access keys) to ~/.aws so that
# later `aws` commands on this runner can authenticate.
- name: Set up AWS CLI credentials
  env:
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    AWS_DEFAULT_REGION: ap-south-1  # Change to your desired region
  run: |
    # The credentials file holds the secret key: create everything
    # owner-only instead of relying on the runner's default umask.
    umask 077
    # Create the AWS config and credentials files
    mkdir -p ~/.aws
    printf '[default]\nregion=%s\n' "${AWS_DEFAULT_REGION}" > ~/.aws/config
    printf '[default]\naws_access_key_id=%s\naws_secret_access_key=%s\n' \
      "${AWS_ACCESS_KEY_ID}" "${AWS_SECRET_ACCESS_KEY}" > ~/.aws/credentials
    # umask only applies to newly created paths; also tighten any that
    # already existed before this step ran.
    chmod 700 ~/.aws
    chmod 600 ~/.aws/config ~/.aws/credentials
# Smoke test: any AWS CLI command can run at this point; listing the S3
# buckets is used as the example.
- name: Test AWS CLI
  run: aws s3 ls
# - name: Authenticate with AWS ECR
# uses: aws-actions/configure-aws-credentials@v4
# with:
# aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
# aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
# aws-region: ${{ secrets.AWS_REGION }}
# - name: AWS ECR
# run: |
# aws configure aws_access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }} aws_secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }}
# - name: Install Docker
# run: |
# curl -fsSL https://get.docker.com -o get-docker.sh
# sh get-docker.sh
# - name: Login to Amazon ECR
# id: login-ecr
# uses: aws-actions/amazon-ecr-login@v2
# - name: CUDA Check
# run: |
# docker run --gpus all -it pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime python3 -c "
# import torch;
# print(f'CUDA Available: {torch.cuda.is_available()}');
# if torch.cuda.is_available():
# print(f'Device: {torch.cuda.get_device_name(0)}')"
# Pulls the `latest` tag of the training image from ECR, then lists the
# working directory (including hidden files).
- name: Pull Docker image from ECR
  env:
    # Registry and repository come from repository secrets.
    IMAGE: ${{ secrets.AWS_ECR_LOGIN_URI }}/${{ secrets.ECR_REPOSITORY_NAME }}:latest
  run: |
    docker pull "$IMAGE"
    ls -a
# Runs the DVC pipeline inside the training image with all GPUs attached.
# ./model_storage on the runner is bind-mounted to /workspace/model_storage in
# the container, so whatever the pipeline writes there is visible to later
# steps.
- name: Run DVC commands in container
  env:
    # Secrets are provided through the step environment and forwarded with
    # `-e NAME` (value taken from the environment), so they are never
    # spliced unquoted into the docker command line.
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }}
    IMAGE: ${{ secrets.AWS_ECR_LOGIN_URI }}/${{ secrets.ECR_REPOSITORY_NAME }}:latest
  run: |
    mkdir -p model_storage
    docker run --gpus=all \
      -v "$(pwd)/model_storage:/workspace/model_storage" \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -e AWS_DEFAULT_REGION \
      "$IMAGE" \
      /bin/bash -c "
        dvc pull -r myremote && \
        mkdir -p model_storage && \
        dvc repro -f
      "
    # # Wait a moment to ensure the container has started
    # sleep 5
    ls model_storage/
    # # Print logs from the container
    # docker logs $CONTAINER_ID
    # # Stop the container after retrieving logs
    # docker stop $CONTAINER_ID
# Debug aid: long listing of the current working directory.
- name: List files in folder
  run: ls -l ./
# Installs jq via apt after refreshing the package index.
- name: Install jq
  run: sudo apt-get update && sudo apt-get install -y jq
# Asks the GitHub API for the first entry of the repository's commit list
# (per_page=1) and exports its SHA as COMMIT_ID for the following steps.
- name: Get latest commit ID from the repository
  id: get_commit_id
  env:
    REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
  run: |
    repo="ajithvcoder/emlo4-session-08-ajithvcoder"
    # -f turns HTTP errors (bad token, rate limit, ...) into a failing exit
    # status; -sS hides the progress meter but still prints errors.
    response=$(curl -fsS -H "Authorization: token $REPO_TOKEN" \
      "https://api.github.com/repos/$repo/commits?per_page=1")
    # `// empty` yields an empty string instead of the literal "null" when
    # the response contains no SHA.
    latest_commit=$(jq -r '.[0].sha // empty' <<< "$response")
    if [ -z "$latest_commit" ]; then
      echo "Could not determine the latest commit of $repo" >&2
      exit 1
    fi
    echo "COMMIT_ID=$latest_commit" >> "$GITHUB_ENV"
# Debug aid: long listing of ./model_storage.
- name: List files in folder
  run: ls -l ./model_storage
# Exports the first line of ./model_storage/best_model_checkpoint.txt as
# CHECKPOINT_FILE for the following steps.
- name: Read best checkpoint file name
  id: read_checkpoint
  run: |
    best_ckpt="$(head -n 1 ./model_storage/best_model_checkpoint.txt)"
    echo "CHECKPOINT_FILE=${best_ckpt}" >> "$GITHUB_ENV"
# Copies the checkpoint named by CHECKPOINT_FILE to
# s3://mybucket-emlo-mumbai/session-08-checkpoint/<COMMIT_ID>/<basename>.
# Both variables are read from the job environment (exported through
# GITHUB_ENV by earlier steps).
- name: Upload checkpoint to S3
  run: |
    checkpoint_path="$CHECKPOINT_FILE"  # Use the checkpoint path from the file
    commit_id="$COMMIT_ID"
    # Fail early rather than upload to a malformed key.
    if [ -z "$checkpoint_path" ] || [ -z "$commit_id" ]; then
      echo "CHECKPOINT_FILE and COMMIT_ID must both be set" >&2
      exit 1
    fi
    # Bucket name only. The key prefix belongs in s3_key; keeping it here as
    # well produced "session-08-checkpoint//session-08-checkpoint/..." URLs.
    bucket_name="mybucket-emlo-mumbai"  # Change to your S3 bucket name
    s3_key="session-08-checkpoint/${commit_id}/$(basename "$checkpoint_path")"  # Define S3 key
    echo "Uploading $checkpoint_path to s3://$bucket_name/$s3_key"
    # --recursive makes the CLI treat the source as a directory, so it is
    # passed only when the checkpoint path really is one.
    if [ -d "$checkpoint_path" ]; then
      aws s3 cp "$checkpoint_path" "s3://$bucket_name/$s3_key" --recursive
    else
      aws s3 cp "$checkpoint_path" "s3://$bucket_name/$s3_key"
    fi
# Housekeeping: runs `docker system prune` without the confirmation prompt.
- name: Clean previous images and containers
  run: docker system prune --force