Skip to content

Commit

Permalink
.ci/aws: Attempt to add stable p3dn testing into CI
Browse files Browse the repository at this point in the history
Signed-off-by: Seth Zegelstein <szegel@amazon.com>
  • Loading branch information
a-szegel committed May 28, 2024
1 parent 309d834 commit 09a0e60
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .ci/aws/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,23 @@ pipeline {
def addl_args_pr = "--test-aws-ofi-nccl-pr $env.CHANGE_ID --test-nccl-version ${nccl_version}"
def config = ".ci/aws/aws_ofi_nccl_pr_ci.yaml"
def num_instances = 4
// Settings shared by the p3dn stages: lock label, region, and ODCR id.
def p3dn_lock_label = "p3dn-1-4node"
def p3dn_region = "us-east-1"
def p3dn_odcr = "cr-0ca89ca67f047efa8"
// p3dn runs append an ODCR placement group name to the common PR arguments.
def p3dn_addl_args = "${addl_args_pr} --odcr-placement-group-name efa-placement-group2"
// The alinux2 p3dn stage additionally passes an explicit --ami-id.
def p3dn_al2_addl_args = "${p3dn_addl_args} --ami-id ami-0b92996e003535762"
def p4d_lock_label = "p4d-1-4node"
def p4d_region = "us-east-2"
def p4d_odcr = "cr-0e5eebb3c896f6af0"
def p5_lock_label = "p5-1-4node"
def p5_region = "af-south-1"
def p5_odcr = "cr-02eb632dcd8175139"

// p3dn tests
stages["4_p3dn_al2"] = common.get_test_stage_with_lock("4_p3dn_al2", env.BUILD_TAG, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_al2_addl_args)
stages["4_p3dn_ubuntu2004"] = common.get_test_stage_with_lock("4_p3dn_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_ubuntu2204"] = common.get_test_stage_with_lock("4_p3dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_addl_args)

// p4d tests
stages["4_p4d_alinux2"] = common.get_test_stage_with_lock("4_p4d_alinux2", env.BUILD_TAG, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, addl_args_pr)
stages["4_p4d_ubuntu2004"] = common.get_test_stage_with_lock("4_p4d_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, addl_args_pr)
Expand Down
13 changes: 13 additions & 0 deletions .ci/aws/common.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,12 @@ def wait_for_odcr_capacity(region, instance_count, odcr) {
sh ". venv/bin/activate; ./PortaFiducia/scripts/wait_for_odcr_capacity.py --region ${region} --odcr-id ${odcr} --required-capacity ${instance_count}"
}

def p3dn_sleep_5min(instance_type) {
    /*
     * Pause for 300 seconds (5 minutes) when instance_type is exactly
     * "p3dn.24xlarge"; do nothing for any other instance type.
     * The pause is performed by running `sleep` through the `sh` step.
     */
    if (instance_type != "p3dn.24xlarge") {
        return
    }
    sh "sleep 300"
}


def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, config, odcr, addl_args) {
/*
Expand All @@ -69,6 +75,13 @@ def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_
kill_all_clusters(instance_type, region)
wait_for_odcr_capacity(region, instance_count, odcr)

/*
* p3dn clusters are getting ICE'ed within an ODCR when we try to launch them back to back.
* This is a non-deterministic workaround to help increase our chances of not getting ICE'ed.
* Worst case, this increases our time to publish results on PRs by 15 minutes.
*/
p3dn_sleep_5min(instance_type)

def cluster_name = get_cluster_name(build_tag, os, instance_type)
def args = "--config ${config} --os ${os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml"
def ret = sh (
Expand Down

0 comments on commit 09a0e60

Please sign in to comment.