Skip to content

Commit

Permalink
.ci/aws: Wait until ODCR has enough capacity to run
Browse files Browse the repository at this point in the history
The p3dn ODCR does not immediately refill its available instance count
after all of its instances have been terminated.  This introduces a race
condition in our code which sometimes causes us to get ICE'ed (hit an
insufficient capacity error).  Attempt to fix the ICE by waiting until
the ODCR has the required capacity before attempting to launch instances
with it.

Signed-off-by: Seth Zegelstein <szegel@amazon.com>
  • Loading branch information
a-szegel committed May 17, 2024
1 parent b78f1ac commit b738a01
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 15 deletions.
20 changes: 9 additions & 11 deletions .ci/aws/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -61,29 +61,27 @@ pipeline {
def p3dn_lock_label = "p3dn-1-4node"
def p3dn_region = "us-east-1"
def p3dn_odcr = "cr-0ca89ca67f047efa8"
def p3dn_addl_args = "${addl_args_pr} --odcr ${p3dn_odcr} --odcr-placement-group-name efa-placement-group2"
def p3dn_addl_args = "${addl_args_pr} --odcr-placement-group-name efa-placement-group2"
def p4d_lock_label = "p4d-1-4node"
def p4d_region = "us-east-2"
def p4d_odcr = "cr-0e5eebb3c896f6af0"
def p4d_addl_args = "${addl_args_pr} --odcr ${p4d_odcr}"
def p5_lock_label = "p5-1-4node"
def p5_region = "af-south-1"
def p5_odcr = "cr-02eb632dcd8175139"
def p5_addl_args = "${addl_args_pr} --odcr ${p5_odcr}"

// p3dn tests
stages["4_p3dn_ubuntu2004"] = common.get_test_stage_with_lock("4_p3dn_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_addl_args)
stages["4_p3dn_ubuntu2204"] = common.get_test_stage_with_lock("4_p3dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_addl_args)
stages["4_p3dn_ubuntu2004"] = common.get_test_stage_with_lock("4_p3dn_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_ubuntu2204"] = common.get_test_stage_with_lock("4_p3dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_addl_args)

// p4d tests
stages["4_p4d_alinux2"] = common.get_test_stage_with_lock("4_p4d_alinux2", env.BUILD_TAG, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_addl_args)
stages["4_p4d_ubuntu2004"] = common.get_test_stage_with_lock("4_p4d_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_addl_args)
stages["4_p4d_ubuntu2204"] = common.get_test_stage_with_lock("4_p4d_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_addl_args)
stages["4_p4d_alinux2"] = common.get_test_stage_with_lock("4_p4d_alinux2", env.BUILD_TAG, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, addl_args_pr)
stages["4_p4d_ubuntu2004"] = common.get_test_stage_with_lock("4_p4d_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, addl_args_pr)
stages["4_p4d_ubuntu2204"] = common.get_test_stage_with_lock("4_p4d_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, addl_args_pr)

// p5 tests
stages["4_p5_alinux2"] = common.get_test_stage_with_lock("4_p5_alinux2", env.BUILD_TAG, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_addl_args)
stages["4_p5_ubuntu2004"] = common.get_test_stage_with_lock("4_p5_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_addl_args)
stages["4_p5_ubuntu2204"] = common.get_test_stage_with_lock("4_p5_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_addl_args)
stages["4_p5_alinux2"] = common.get_test_stage_with_lock("4_p5_alinux2", env.BUILD_TAG, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_odcr, addl_args_pr)
stages["4_p5_ubuntu2004"] = common.get_test_stage_with_lock("4_p5_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_odcr, addl_args_pr)
stages["4_p5_ubuntu2204"] = common.get_test_stage_with_lock("4_p5_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_odcr, addl_args_pr)

parallel stages
}
Expand Down
14 changes: 10 additions & 4 deletions .ci/aws/common.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,12 @@ def kill_all_clusters(instance_type, region) {
sh ". venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name \'*${instance_type_without_period}*\' --region ${region} || true"
}

def wait_for_odcr_capacity(region, instance_count, odcr) {
    /*
     * Invoke PortaFiducia/scripts/wait_for_odcr_capacity.py from inside the
     * project virtualenv, forwarding the region, the capacity reservation ID
     * and the number of instances required.
     * NOTE(review): presumably the script blocks until the reservation reports
     * at least instance_count available instances -- confirm against the script.
     */
    def script_args = "--region ${region} --odcr-id ${odcr} --required-capacity ${instance_count}"
    sh ". venv/bin/activate; ./PortaFiducia/scripts/wait_for_odcr_capacity.py ${script_args}"
}


def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, config, addl_args) {
def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, config, odcr, addl_args) {
/*
* Run PortaFiducia/tests/test_orchestrator.py with given command line arguments
*/
Expand All @@ -63,9 +67,10 @@ def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_
* This stops us from being able to add additional capacity to the Jenkins service.
*/
kill_all_clusters(instance_type, region)
wait_for_odcr_capacity(region, instance_count, odcr)

def cluster_name = get_cluster_name(build_tag, os, instance_type)
def args = "--config ${config} --os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml"
def args = "--config ${config} --os ${os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml"
def ret = sh (
script: ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}",
returnStatus: true
Expand Down Expand Up @@ -108,7 +113,7 @@ def get_cluster_name(build_tag, os, instance_type) {
}


def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, region, lock_label, lock_count, config, addl_args) {
def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, region, lock_label, lock_count, config, odcr, addl_args) {
/*
* Generate a single test stage that runs test_orchestrator.py with the given parameters.
* The job will queue until it acquires the given number of locks. The locks will be released
Expand All @@ -121,13 +126,14 @@ def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, region, l
* param@ lock_label: str, the label of the lockable resources.
* param@ lock_count: int, the quantity of the lockable resources.
* param@ config: the name of the PortaFiducia config file
* param@ odcr: The on demand capacity reservation ID to create instances in
* param@ addl_args: additional arguments passed to test_orchestrator.py
* return@: the test stage.
*/
return {
stage("${stage_name}") {
lock(label: lock_label, quantity: lock_count) {
this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, lock_count, region, config, addl_args)
this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, lock_count, region, config, odcr, addl_args)
}
}
}
Expand Down

0 comments on commit b738a01

Please sign in to comment.