diff --git a/.ci/aws/Jenkinsfile b/.ci/aws/Jenkinsfile index 3db392ae6..542b653a5 100644 --- a/.ci/aws/Jenkinsfile +++ b/.ci/aws/Jenkinsfile @@ -58,6 +58,11 @@ pipeline { def addl_args_pr = "--test-aws-ofi-nccl-pr $env.CHANGE_ID --test-nccl-version ${nccl_version}" def config = ".ci/aws/aws_ofi_nccl_pr_ci.yaml" def num_instances = 4 + def p3dn_lock_label = "p3dn-1-4node" + def p3dn_region = "us-east-1" + def p3dn_odcr = "cr-0ca89ca67f047efa8" + def p3dn_addl_args = "${addl_args_pr} --odcr-placement-group-name efa-placement-group2" + def p3dn_al2_addl_args = "${p3dn_addl_args} --ami-id ami-0b92996e003535762" def p4d_lock_label = "p4d-1-4node" def p4d_region = "us-east-2" def p4d_odcr = "cr-0e5eebb3c896f6af0" @@ -65,6 +70,11 @@ pipeline { def p5_region = "af-south-1" def p5_odcr = "cr-02eb632dcd8175139" + // p3dn tests + stages["4_p3dn_al2"] = common.get_test_stage_with_lock("4_p3dn_al2", env.BUILD_TAG, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_al2_addl_args) + stages["4_p3dn_ubuntu2004"] = common.get_test_stage_with_lock("4_p3dn_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_addl_args) + stages["4_p3dn_ubuntu2204"] = common.get_test_stage_with_lock("4_p3dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_addl_args) + // p4d tests stages["4_p4d_alinux2"] = common.get_test_stage_with_lock("4_p4d_alinux2", env.BUILD_TAG, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, addl_args_pr) stages["4_p4d_ubuntu2004"] = common.get_test_stage_with_lock("4_p4d_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, addl_args_pr) diff --git a/.ci/aws/common.groovy b/.ci/aws/common.groovy index afef9bf85..e5dcbd528 100644 --- a/.ci/aws/common.groovy +++ 
b/.ci/aws/common.groovy @@ -54,6 +54,12 @@ def wait_for_odcr_capacity(region, instance_count, odcr) { sh ". venv/bin/activate; ./PortaFiducia/scripts/wait_for_odcr_capacity.py --region ${region} --odcr-id ${odcr} --required-capacity ${instance_count}" } +def p3dn_sleep_5min(instance_type) { /* Runs `sleep 300` through the sh step only when instance_type equals "p3dn.24xlarge"; does nothing for any other instance type. */ + if (instance_type == "p3dn.24xlarge") { + sh "sleep 300" + } +} + def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, config, odcr, addl_args) { /* @@ -69,6 +75,13 @@ def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_ kill_all_clusters(instance_type, region) wait_for_odcr_capacity(region, instance_count, odcr) + /* + * p3dn clusters are getting ICE'ed within an ODCR when we try to launch them back to back. + * This is a non-deterministic workaround to help us increase our chances of not getting ICE'ed. + * Worst case, this increases our time to publish results on PRs by 15 minutes. + */ + p3dn_sleep_5min(instance_type) + def cluster_name = get_cluster_name(build_tag, os, instance_type) def args = "--config ${config} --os ${os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml" def ret = sh (