CML-EC2-Runner #69

Workflow file for this run

.github/workflows/ec2-pipeline.yml at a45de89

	# Manually triggered workflow with two jobs:
	#   1. launch-runner    - starts a CML self-hosted runner on AWS EC2.
	#   2. train-and-report - runs on that runner: pulls the training image from
	#      ECR, runs the DVC pipeline inside it and uploads the best checkpoint
	#      to S3.
	name: CML-EC2-Runner
	on:
	  workflow_dispatch:
	jobs:
	  # Launches the EC2 instance that registers itself as a self-hosted runner
	  # carrying the `cml-gpu` label, which train-and-report selects on.
	  launch-runner:
	    runs-on: ubuntu-latest
	    steps:
	      - uses: actions/checkout@v4
	      - uses: iterative/setup-cml@v2
	      - name: Deploy runner on AWS EC2
	        env:
	          REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
	          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
	          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	        run: |
	          cml runner launch \
	            --cloud=aws \
	            --name=session-08 \
	            --cloud-region=ap-south-1 \
	            --cloud-type=g4dn.xlarge \
	            --cloud-hdd-size=64 \
	            --cloud-spot \
	            --single \
	            --labels=cml-gpu \
	            --idle-timeout=100

	  # Runs on the runner started by launch-runner (matched by the cml-gpu label).
	  train-and-report:
	    runs-on: [self-hosted, cml-gpu]
	    needs: launch-runner
	    timeout-minutes: 20
	    # container:
	    #   image: docker://pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime
	    #   options: --gpus all
	    # runs-on: ubuntu-latest
	    steps:
	      # - name: Set node environment
	      #   run: |
	      #     apt-get remove nodejs
	      #     apt-get remove npm
	      #     curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.1/install.sh | bash
	      #     chmod +x ~/.nvm/nvm.sh
	      #     ls -a ~
	      #     nvm -v
	      #     nvm install 20
	      #     node -v
	      #     npm -v

	      # - uses: actions/setup-node@v4
	      #   with:
	      #     node-version: 20
	      # - run: npm ci
	      # - run: npm test

	      # - uses: actions/checkout@v2

	      # Diagnostic only: `|| true` keeps the step green when nvcc is absent.
	      - name: Display CUDA Version
	        run: |
	          echo "CUDA Version:"
	          nvcc --version || true

	      # Diagnostic only: `|| true` keeps the step green when the header is absent.
	      - name: Display cuDNN Version
	        run: |
	          echo "cuDNN Version:"
	          cat /usr/local/cuda/include/cudnn_version.h | grep CUDNN_MAJOR -A 2 || true

	      # Prints instance/host diagnostics and makes sure the AWS CLI is present
	      # for the later steps.
	      - name: Verify EC2 Instance
	        run: |
	          echo "Checking instance information..."
	          # Check if we're on EC2: request a metadata session token, then use
	          # it to query the instance type.
	          TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
	          curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type

	          echo "Checking system resources..."
	          lscpu
	          free -h
	          df -h
	          nvidia-smi  # This will show GPU if available

	          echo "Checking environment..."
	          env | grep AWS || true
	          hostname
	          whoami
	          pwd
	          # Install the AWS CLI if not already available.
	          # sudo is used here for consistency with the "Install jq" step.
	          if ! command -v aws &> /dev/null; then
	            sudo apt-get update
	            sudo apt-get install -y awscli
	          fi

	      # Writes the default AWS profile used by the `aws` commands in the
	      # following steps (ECR login, S3 listing, checkpoint upload).
	      - name: Set up AWS CLI credentials
	        env:
	          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
	          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	          AWS_DEFAULT_REGION: ap-south-1  # Change to your desired region
	        run: |
	          # Create the AWS config and credentials files
	          mkdir -p ~/.aws
	          echo "[default]" > ~/.aws/config
	          echo "region=${AWS_DEFAULT_REGION}" >> ~/.aws/config
	          echo "[default]" > ~/.aws/credentials
	          echo "aws_access_key_id=${AWS_ACCESS_KEY_ID}" >> ~/.aws/credentials
	          echo "aws_secret_access_key=${AWS_SECRET_ACCESS_KEY}" >> ~/.aws/credentials
	          # Keep the secret key readable by the runner user only.
	          chmod 600 ~/.aws/credentials

	      - name: Test AWS CLI
	        run: |
	          # Now you can run any AWS CLI command
	          aws s3 ls  # Example command to list S3 buckets

	      # The ECR login used to live in "Verify EC2 Instance", i.e. before the
	      # credentials file above was written; it now runs after it.
	      # The old `$(aws ecr get-login --no-include-email ...)` call was dropped:
	      # it duplicated this login, put the password on the command line, and
	      # the subcommand no longer exists in AWS CLI v2.
	      # The registry host comes from the same secret the pull step uses.
	      - name: Log in to Amazon ECR
	        env:
	          ECR_REGISTRY: ${{ secrets.AWS_ECR_LOGIN_URI }}
	        run: |
	          aws ecr get-login-password --region ap-south-1 \
	            | docker login --username AWS --password-stdin "$ECR_REGISTRY"

	      # - name: Authenticate with AWS ECR
	      #   uses: aws-actions/configure-aws-credentials@v4
	      #   with:
	      #     aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	      #     aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	      #     aws-region: ${{ secrets.AWS_REGION }}

	      # - name: AWS ECR
	      #   run: |
	      #     aws configure aws_access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }} aws_secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }}


	      # - name: Install Docker
	      #   run: |
	      #     curl -fsSL https://get.docker.com -o get-docker.sh
	      #     sh get-docker.sh


	      # - name: Login to Amazon ECR
	      #   id: login-ecr
	      #   uses: aws-actions/amazon-ecr-login@v2
	      # - name: CUDA Check
	      #   run: |
	      #     docker run --gpus all -it pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime python3 -c "
	      #     import torch;
	      #     print(f'CUDA Available: {torch.cuda.is_available()}');
	      #     if torch.cuda.is_available():
	      #     print(f'Device: {torch.cuda.get_device_name(0)}')"

	      # The image reference is passed through `env` rather than expanded into
	      # the script text by `${{ }}`.
	      - name: Pull Docker image from ECR
	        env:
	          IMAGE: ${{ secrets.AWS_ECR_LOGIN_URI }}/${{ secrets.ECR_REPOSITORY_NAME }}:latest
	        run: |
	          docker pull "$IMAGE"
	          ls -a

	      # Runs `dvc pull` and `dvc repro -f` inside the image. The host's
	      # ./model_storage is mounted at /workspace/model_storage so files written
	      # there are visible to the following steps.
	      - name: Run DVC commands in container
	        env:
	          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
	          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	          AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }}
	          IMAGE: ${{ secrets.AWS_ECR_LOGIN_URI }}/${{ secrets.ECR_REPOSITORY_NAME }}:latest
	        run: |
	          mkdir -p model_storage
	          # `-e NAME` without a value forwards the variable from this step's
	          # environment, so the secrets never appear in the script text.
	          docker run --gpus=all \
	            -v "$(pwd)/model_storage:/workspace/model_storage" \
	            -e AWS_ACCESS_KEY_ID \
	            -e AWS_SECRET_ACCESS_KEY \
	            -e AWS_DEFAULT_REGION \
	            "$IMAGE" \
	            /bin/bash -c "
	              dvc pull -r myremote && \
	              mkdir -p model_storage && \
	              dvc repro -f
	            "

	          # # Wait a moment to ensure the container has started
	          # sleep 5

	          ls model_storage/

	          # # Print logs from the container
	          # docker logs $CONTAINER_ID

	          # # Stop the container after retrieving logs
	          # docker stop $CONTAINER_ID

	      - name: List files in folder
	        run: |
	          ls -l ./

	      - name: Install jq
	        run: |
	          sudo apt-get update
	          sudo apt-get install -y jq

	      # Exports COMMIT_ID (newest commit returned by the GitHub API for the
	      # repository below) to later steps through $GITHUB_ENV.
	      - name: Get latest commit ID from the repository
	        id: get_commit_id
	        env:
	          REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
	        run: |
	          repo="ajithvcoder/emlo4-session-08-ajithvcoder"
	          # -f makes curl fail on HTTP errors instead of piping an error body
	          # into jq.
	          latest_commit=$(curl -fsS -H "Authorization: token $REPO_TOKEN" \
	            "https://api.github.com/repos/$repo/commits?per_page=1" | \
	            jq -r '.[0].sha')
	          # jq prints "null" when the field is missing; never export that.
	          if [ -z "$latest_commit" ] || [ "$latest_commit" = "null" ]; then
	            echo "Could not resolve the latest commit for $repo" >&2
	            exit 1
	          fi
	          echo "COMMIT_ID=$latest_commit" >> "$GITHUB_ENV"

	      - name: List files in model_storage
	        run: |
	          ls -l ./model_storage

	      # Exports CHECKPOINT_FILE (first line of best_model_checkpoint.txt) to
	      # later steps through $GITHUB_ENV.
	      - name: Read best checkpoint file name
	        id: read_checkpoint
	        run: |
	          checkpoint_file=$(head -n 1 ./model_storage/best_model_checkpoint.txt)
	          if [ -z "$checkpoint_file" ]; then
	            echo "best_model_checkpoint.txt is empty" >&2
	            exit 1
	          fi
	          echo "CHECKPOINT_FILE=$checkpoint_file" >> "$GITHUB_ENV"

	      # Uploads to s3://<bucket>/session-08-checkpoint/<commit>/<file name>.
	      # NOTE(review): bucket_name used to include "session-08-checkpoint/" as
	      # well, so objects landed under a doubled
	      # "session-08-checkpoint//session-08-checkpoint/" prefix. Check anything
	      # that reads the old location.
	      - name: Upload checkpoint to S3
	        run: |
	          checkpoint_path="$CHECKPOINT_FILE"  # Use the checkpoint path from the file
	          bucket_name="mybucket-emlo-mumbai"  # Change to your S3 bucket name
	          s3_key="session-08-checkpoint/${COMMIT_ID}/$(basename "$checkpoint_path")"  # Define S3 key
	          echo "Uploading $checkpoint_path to s3://$bucket_name/$s3_key"
	          # --recursive only applies to directories; a single checkpoint file
	          # is copied directly.
	          if [ -d "$checkpoint_path" ]; then
	            aws s3 cp "$checkpoint_path" "s3://$bucket_name/$s3_key" --recursive
	          else
	            aws s3 cp "$checkpoint_path" "s3://$bucket_name/$s3_key"
	          fi

	      - name: Clean previous images and containers
	        run: |
	          docker system prune -f

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

CML-EC2-Runner #69

Workflow file

CML-EC2-Runner #69

Jobs

Run details

Workflow file for this run