# .github/workflows/ec2-pipeline.yml

# Pipeline that builds the training image, launches an EC2 runner through
# CML, and then trains on that runner.
name: CML-EC2-Runner

# Started by hand only (workflow_dispatch); no push or pull-request trigger.
on:
  workflow_dispatch:

jobs:

  # Builds the Docker image from the repository root and pushes it to
  # Amazon ECR under the `latest` tag.
  build-and-push-ecr-image:
    name: Continuous Delivery
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3

      - name: Install Utilities
        run: |
          sudo apt-get update
          sudo apt-get install -y jq unzip

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1

      - name: Build, tag, and push image to Amazon ECR
        id: build-image
        env:
          # Registry host comes from the ECR login step above.
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY_NAME }}
          IMAGE_TAG: latest
        run: |
          # Build the image and push it to ECR so that the training job
          # can pull it later in this workflow.
          image="${ECR_REGISTRY}/${ECR_REPOSITORY}:${IMAGE_TAG}"
          docker build -t "$image" .
          docker push "$image"
          # Expose the image reference as the step output `image`.
          # The `::set-output` workflow command is deprecated; outputs are
          # written to the file named by $GITHUB_OUTPUT instead.
          echo "image=$image" >> "$GITHUB_OUTPUT"
          

  # Launches a self-hosted runner on AWS EC2 with `cml runner launch`.
  # The runner is labelled `cml-gpu`, which is the label the
  # train-and-report job selects in its `runs-on`.
  launch-runner:
    # Wait for the image push before launching. train-and-report depends on
    # this job and pulls the `latest` image from ECR; without this ordering
    # the build runs concurrently and training can pull a stale or
    # not-yet-existing image. Ordering it here (rather than on the training
    # job) also avoids the runner sitting idle while the image builds.
    needs: build-and-push-ecr-image
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: iterative/setup-cml@v2
      - name: Deploy runner on AWS EC2
        env:
          # Token CML uses to register the runner with the repository.
          REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          cml runner launch \
          --cloud=aws \
          --name=session-08 \
          --cloud-region=ap-south-1 \
          --cloud-type=g4dn.xlarge \
          --cloud-hdd-size=64 \
          --cloud-spot \
          --single \
          --labels=cml-gpu \
          --idle-timeout=100
  # Training job. It is scheduled only after launch-runner has finished and
  # targets a self-hosted runner carrying the `cml-gpu` label.
  train-and-report:
    needs: launch-runner
    runs-on:
      - self-hosted
      - cml-gpu
    # Abort the job if it runs longer than 20 minutes.
    timeout-minutes: 20
    # Disabled alternative: run the steps inside a PyTorch container on a
    # GitHub-hosted runner instead of the self-hosted one.
    # container:
    #   image: docker://pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime
    #   options: --gpus all
    # runs-on: ubuntu-latest
    steps:
      # - name: Set node environment
      #   run: |
      #     apt-get remove nodejs
      #     apt-get remove npm
      #     curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.1/install.sh | bash
      #     chmod +x ~/.nvm/nvm.sh
      #     ls -a ~
      #     nvm -v
      #     nvm install 20
      #     node -v
      #     npm -v

      # - uses: actions/setup-node@v4
      #   with:
      #     node-version: 20
      # - run: npm ci
      # - run: npm test

      # - uses: actions/checkout@v2

      # Diagnostic step: `|| true` keeps the step from failing when nvcc
      # is not installed on the runner.
      - name: Display CUDA Version
        run: |
          echo "CUDA Version:"
          nvcc --version || true

      # Diagnostic step: prints the CUDNN_MAJOR line of the cuDNN version
      # header plus the two lines after it; `|| true` tolerates a missing
      # header file.
      - name: Display cuDNN Version
        run: |
          echo "cuDNN Version:"
          cat /usr/local/cuda/include/cudnn_version.h | grep -A 2 CUDNN_MAJOR || true

      # Diagnostics only: prints instance type, hardware resources and the
      # runner's environment. No authentication happens in this step.
      - name: Verify EC2 Instance
        run: |
          echo "Checking instance information..."
          # Query the EC2 instance metadata service: first request a
          # session token, then use it to read the instance type.
          TOKEN=$(curl -sS -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
          curl -sS -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type
          # The metadata response has no trailing newline; end the line.
          echo

          echo "Checking system resources..."
          lscpu
          free -h
          df -h
          nvidia-smi  # This will show GPU if available

          echo "Checking environment..."
          env | grep AWS || true
          hostname
          whoami
          pwd

      # Installs the AWS CLI when missing and writes the default profile
      # (region + static credentials) that later `aws` calls rely on.
      - name: Set up AWS CLI credentials
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: ap-south-1  # Change to your desired region
        run: |
          # Install the AWS CLI if not already available
          if ! command -v aws &> /dev/null; then
            sudo apt-get update
            sudo apt-get install -y awscli
          fi

          # Create the AWS config and credentials files
          mkdir -p ~/.aws
          echo "[default]" > ~/.aws/config
          echo "region=${AWS_DEFAULT_REGION}" >> ~/.aws/config
          echo "[default]" > ~/.aws/credentials
          echo "aws_access_key_id=${AWS_ACCESS_KEY_ID}" >> ~/.aws/credentials
          echo "aws_secret_access_key=${AWS_SECRET_ACCESS_KEY}" >> ~/.aws/credentials
          # The file holds a secret key; restrict it to the current user.
          chmod 600 ~/.aws/credentials

      - name: Test AWS CLI
        run: |
          # Now you can run any AWS CLI command
          aws s3 ls  # Example command to list S3 buckets

      # Runs after the credentials file exists so the login does not depend
      # on whatever credentials the runner process happens to inherit.
      # `aws ecr get-login` is intentionally not used: it is deprecated and
      # absent from AWS CLI v2; `get-login-password` is its replacement.
      - name: Login to Amazon ECR
        env:
          # Same registry host the image is pulled from in the next steps.
          ECR_REGISTRY: ${{ secrets.AWS_ECR_LOGIN_URI }}
        run: |
          aws ecr get-login-password --region ap-south-1 | docker login --username AWS --password-stdin "$ECR_REGISTRY"

      # - name: Authenticate with AWS ECR
      #   uses: aws-actions/configure-aws-credentials@v4
      #   with:
      #     aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
      #     aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      #     aws-region: ${{ secrets.AWS_REGION }}

      # - name: AWS ECR
      #   run: |
      #     aws configure aws_access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }} aws_secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }}
        

      # - name: Install Docker
      #   run: |
      #     curl -fsSL https://get.docker.com -o get-docker.sh
      #     sh get-docker.sh


      # - name: Login to Amazon ECR
      #   id: login-ecr
      #   uses: aws-actions/amazon-ecr-login@v2
      # - name: CUDA Check
      #   run: |
      #     docker run --gpus all -it pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime python3 -c "
      #     import torch; 
      #     print(f'CUDA Available: {torch.cuda.is_available()}'); 
      #     if torch.cuda.is_available(): 
      #         print(f'Device: {torch.cuda.get_device_name(0)}')"

      # Pulls the `latest` training image from ECR onto the runner.
      - name: Pull Docker image from ECR
        env:
          # Secrets are handed to the script through the environment rather
          # than being expanded into the script text itself.
          IMAGE_URI: ${{ secrets.AWS_ECR_LOGIN_URI }}/${{ secrets.ECR_REPOSITORY_NAME }}:latest
        run: |
          docker pull "$IMAGE_URI"
          ls -a

      # Runs the DVC pipeline inside the training image with all GPUs
      # attached. ./model_storage on the runner is bind-mounted to
      # /workspace/model_storage in the container so its contents remain
      # on the runner after the container exits.
      - name: Run DVC commands in container
        env:
          IMAGE_URI: ${{ secrets.AWS_ECR_LOGIN_URI }}/${{ secrets.ECR_REPOSITORY_NAME }}:latest
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }}
        run: |
          mkdir -p model_storage
          # `-e NAME` without a value forwards the variable from this
          # step's environment, so the secret values never appear on the
          # docker command line.
          docker run --gpus=all \
            -v "$(pwd)/model_storage:/workspace/model_storage" \
            -e AWS_ACCESS_KEY_ID \
            -e AWS_SECRET_ACCESS_KEY \
            -e AWS_DEFAULT_REGION \
            "$IMAGE_URI" \
            /bin/bash -c "dvc pull -r myremote && mkdir -p model_storage && dvc repro -f"

          # docker run is not detached here, so the pipeline has finished
          # by the time this line runs.
          ls model_storage/

      # Diagnostic listing of the job's working directory.
      - name: List files in folder
        run: ls -l ./

      # jq is used by the next step to parse the GitHub API response.
      - name: Install jq
        run: |
          sudo apt-get update
          sudo apt-get install -y jq

      # Looks up the newest commit SHA of the repository through the GitHub
      # REST API and exports it as COMMIT_ID for the following steps.
      - name: Get latest commit ID from the repository
        id: get_commit_id
        env:
          REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
        run: |
          repo="ajithvcoder/emlo4-session-08-ajithvcoder"
          # -f makes curl exit non-zero on HTTP errors (bad token, unknown
          # repository) instead of passing the error body on to jq.
          response=$(curl -fsS -H "Authorization: token $REPO_TOKEN" \
            "https://api.github.com/repos/$repo/commits?per_page=1")
          # `// empty` turns a missing/null sha into an empty string rather
          # than the literal text "null".
          latest_commit=$(printf '%s' "$response" | jq -r '.[0].sha // empty')
          if [ -z "$latest_commit" ]; then
            # COMMIT_ID is used to build the S3 key later; refuse to
            # continue with an empty value.
            echo "Could not determine the latest commit of $repo" >&2
            exit 1
          fi
          echo "COMMIT_ID=$latest_commit" >> "$GITHUB_ENV"

      # Diagnostic listing of the directory the container wrote into.
      - name: List files in folder
        run: ls -l ./model_storage

      # Takes the first line of best_model_checkpoint.txt and exports it as
      # CHECKPOINT_FILE for the upload step.
      - name: Read best checkpoint file name
        id: read_checkpoint
        run: |
          best_ckpt=$(head -n 1 ./model_storage/best_model_checkpoint.txt)
          echo "CHECKPOINT_FILE=$best_ckpt" >> $GITHUB_ENV

      # Copies the checkpoint named by CHECKPOINT_FILE to S3 under a key
      # that contains COMMIT_ID. Both variables are written to GITHUB_ENV
      # by earlier steps of this job.
      - name: Upload checkpoint to S3
        run: |
          # Read the values as shell variables instead of expanding
          # `${{ env.* }}` into the script text, so unusual characters in
          # the checkpoint path cannot alter the script.
          checkpoint_path="$CHECKPOINT_FILE"  # Use the checkpoint path from the file
          if [ -z "$checkpoint_path" ] || [ -z "$COMMIT_ID" ]; then
            echo "CHECKPOINT_FILE and COMMIT_ID must be set by the earlier steps" >&2
            exit 1
          fi
          # NOTE(review): this value ends in "/" and already contains the
          # "session-08-checkpoint" prefix, so the final URI has an empty
          # "//" segment and repeats the prefix. Left unchanged to keep the
          # upload location stable; confirm the intended layout.
          bucket_name="mybucket-emlo-mumbai/session-08-checkpoint/"  # Change to your S3 bucket name
          s3_key="session-08-checkpoint/${COMMIT_ID}/$(basename "$checkpoint_path")"  # Define S3 key
          echo "Uploading $checkpoint_path to s3://$bucket_name/$s3_key"
          aws s3 cp "$checkpoint_path" "s3://$bucket_name/$s3_key"

      # Final housekeeping: prune unused Docker data; -f skips the
      # confirmation prompt.
      - name: Clean previous images and containers
        run: docker system prune -f